This commit is contained in:
Jay D Dee
2017-11-28 16:32:04 -05:00
parent 6d1361c87f
commit 4b57ac0eb9
70 changed files with 10549 additions and 2852 deletions

View File

@@ -22,24 +22,6 @@ cpuminer_SOURCES = \
api.c \
sysinfos.c \
algo-gate-api.c\
algo/groestl/sph_groestl.c \
algo/bmw/sph_bmw.c \
algo/shavite/sph_shavite.c \
algo/shavite/shavite.c \
algo/echo/sph_echo.c \
algo/heavy/sph_hefty1.c \
algo/luffa/sph_luffa.c \
algo/cubehash/sph_cubehash.c \
algo/simd/sph_simd.c \
algo/hamsi/sph_hamsi.c \
algo/fugue/sph_fugue.c \
algo/gost/sph_gost.c \
algo/jh/sph_jh.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/shabal/sph_shabal.c \
algo/sm3/sm3.c \
algo/whirlpool/sph_whirlpool.c\
crypto/blake2s.c \
crypto/oaes_lib.c \
crypto/c_keccak.c \
@@ -67,22 +49,34 @@ cpuminer_SOURCES = \
algo/blake/blake2s.c \
algo/blake/mod_blakecoin.c \
algo/blake/blakecoin.c \
algo/blake/decred-gate.c \
algo/blake/decred.c \
algo/blake/decred-4way.c \
algo/blake/pentablake-gate.c \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw256.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
algo/cryptonight/cryptonight-aesni.c\
algo/cryptonight/cryptonight.c\
algo/cubehash/sph_cubehash.c \
algo/cubehash/sse2/cubehash_sse2.c\
algo/drop.c \
algo/echo/sph_echo.c \
algo/echo/aes_ni/hash.c\
algo/fresh.c \
algo/gost/sph_gost.c \
algo/groestl/sph_groestl.c \
algo/groestl/groestl.c \
algo/groestl/myr-groestl.c \
algo/groestl/aes_ni/hash-groestl.c \
algo/groestl/aes_ni/hash-groestl256.c \
algo/fugue/sph_fugue.c \
algo/hamsi/sph_hamsi.c \
algo/haval/haval.c\
algo/heavy/sph_hefty1.c \
algo/heavy/heavy.c \
algo/heavy/bastion.c \
algo/hmq1725.c \
@@ -91,6 +85,10 @@ cpuminer_SOURCES = \
algo/hodl/hodl-wolf.c \
algo/hodl/sha512_avx.c \
algo/hodl/sha512_avx2.c \
algo/jh/sph_jh.c \
algo/jh/jh-hash-4way.c \
algo/jh/jha-gate.c \
algo/jh/jha-4way.c \
algo/jh/jha.c \
algo/keccak/sph_keccak.c \
algo/keccak/keccak.c\
@@ -99,6 +97,7 @@ cpuminer_SOURCES = \
algo/keccak/keccak-gate.c \
algo/keccak/sse2/keccak.c \
algo/lbry.c \
algo/luffa/sph_luffa.c \
algo/luffa/luffa.c \
algo/luffa/sse2/luffa_for_sse2.c \
algo/lyra2/lyra2.c \
@@ -109,7 +108,9 @@ cpuminer_SOURCES = \
algo/lyra2/lyra2z330.c \
algo/m7m.c \
algo/neoscrypt.c \
algo/nist5.c \
algo/nist5/nist5-gate.c \
algo/nist5/nist5-4way.c \
algo/nist5/nist5.c \
algo/pluck.c \
algo/polytimos/polytimos-gate.c \
algo/polytimos/polytimos.c \
@@ -119,8 +120,14 @@ cpuminer_SOURCES = \
algo/ripemd/sph_ripemd.c \
algo/scrypt.c \
algo/scryptjane/scrypt-jane.c \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha2.c \
algo/sha/sha256t.c \
algo/shabal/sph_shabal.c \
algo/shavite/sph_shavite.c \
algo/shavite/shavite.c \
algo/simd/sph_simd.c \
algo/simd/sse2/nist.c \
algo/simd/sse2/vector.c \
algo/skein/sph_skein.c \
@@ -132,11 +139,18 @@ cpuminer_SOURCES = \
algo/skein/skein2-4way.c \
algo/skein/skein2-gate.c \
algo/skunk.c \
algo/sm3/sm3.c \
algo/tiger/sph_tiger.c \
algo/timetravel.c \
algo/timetravel10.c \
algo/tribus.c \
algo/tribus/tribus-gate.c \
algo/tribus/tribus.c \
algo/tribus/tribus-4way.c \
algo/veltor.c \
algo/whirlpool/sph_whirlpool.c \
algo/whirlpool/whirlpool-hash-4way.c \
algo/whirlpool/whirlpool-gate.c \
algo/whirlpool/whirlpool-4way.c \
algo/whirlpool/whirlpool.c \
algo/whirlpool/whirlpoolx.c \
algo/x11/phi1612.c \

View File

@@ -17,13 +17,17 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Exe name Compile opts Arch name
cpuminer-sse2.exe -march=core2, Core2
cpuminer-sse42.exe -march=corei7, Nehalem
cpuminer-aes-sse42.exe -maes -msse4.2 Westmere
cpuminer-aes-avx.exe -march=corei7-avx, Sandybridge, Ivybridge
cpuminer-aes-avx2.exe -march=core-avx2, Haswell, Broadwell, Skylake, Kabylake
Exe name Compile opts Arch name
cpuminer-sse2.exe -march=core2 Core2
cpuminer-sse42.exe -march=corei7 Nehalem
cpuminer-aes-sse42.exe -maes -msse4.2" Westmere
cpuminer-aes-avx.exe -march=corei7-avx" Sandybridge, Ivybridge
cpuminer-aes-avx2.exe "-march=core-avx2" Haswell, Broadwell, Skylake, Kabylake
cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY"
4way requires a CPU with AES and AVX2. It is still under development and
only a few algos are supported. See change log in RELEASE_NOTES in source
package for supported algos.
There is no binary support available for SHA on AMD Ryzen CPUs.

View File

@@ -67,6 +67,11 @@ have been due to AVX and AVX2 optimizations added to that version.
Additional improvements are expected on Ryzen with openssl 1.1.
"-march-znver1" or "-msha".
Additional instructions for static compilalation can be found here:
https://lxadm.com/Static_compilation_of_cpuminer
Static builds should only considered in a homogeneous HW and SW environment.
Local builds will always have the best performance and compatibility.
Extract cpuminer source.
tar xvzf cpuminer-opt-x.y.z.tar.gz
@@ -96,6 +101,11 @@ Start mining.
./cpuminer -a algo -o url -u username -p password
Windows
The following in how the Windows binary releases are built. It's old and
not very good but it works, for me anyway.
Building on Windows prerequisites:
msys
@@ -154,6 +164,12 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
Change Log
----------
v3.7.4
Removed unnecessary build options.
Added 4way support for tribus and nist5.
v3.7.3
Added polytimos algo.

View File

@@ -1,11 +1,11 @@
#include "algo-gate-api.h"
#include "blake-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#if defined (__AVX__)
#if defined (BLAKE_4WAY)
void blakehash_4way(void *state, const void *input)
{
@@ -41,7 +41,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
int num_found = 0;
// if (opt_benchmark)
// HTarget = 0x7f;
@@ -55,7 +55,6 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep, n );
be32enc( noncep +2, n+1 );
be32enc( noncep +4, n+2 );

View File

@@ -8,11 +8,11 @@ int64_t blake_get_max64 ()
bool register_blake_algo( algo_gate_t* gate )
{
gate->get_max64 = (void*)&blake_get_max64;
#if defined (__AVX2__) && defined (FOUR_WAY)
//#if defined (__AVX2__) && defined (FOUR_WAY)
// gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#elif defined(__AVX__) && defined (FOUR_WAY)
#if defined(BLAKE_4WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;

View File

@@ -4,13 +4,11 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined (__AVX2__)
//void blakehash_84way(void *state, const void *input);
//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
#if defined(FOUR_WAY) && defined(__AVX__)
#define BLAKE_4WAY
#endif
#if defined (__AVX__)
#if defined (BLAKE_4WAY)
void blakehash_4way(void *state, const void *input);
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );

View File

@@ -346,7 +346,7 @@ static const sph_u32 CS[16] = {
#define CBF SPH_C64(0x636920D871574E69)
#if SPH_COMPACT_BLAKE_64
// not used
static const sph_u64 CB[16] = {
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
@@ -428,7 +428,7 @@ do { \
} while (0)
#if SPH_COMPACT_BLAKE_64
// not used
#define ROUND_B_4WAY(r) do { \
GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
@@ -449,7 +449,7 @@ do { \
} while (0)
#else
//current_impl
#define ROUND_B_4WAY(r) do { \
GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
@@ -696,6 +696,7 @@ do { \
#if SPH_COMPACT_BLAKE_64
// not used
#define COMPRESS64_4WAY do { \
__m256i M[16]; \
__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -759,6 +760,8 @@ do { \
#else
//current impl
#define COMPRESS64_4WAY do { \
__m256i M0, M1, M2, M3, M4, M5, M6, M7; \
__m256i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -986,7 +989,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
size_t ptr;
DECL_STATE64_4WAY
const int buf_size = 64; // sizeof/8
const int buf_size = 128; // sizeof/8
buf = sc->buf;
ptr = sc->ptr;
@@ -1037,7 +1040,7 @@ blake64_4way_close( blake_4way_big_context *sc,
__m256i *out;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3) + n;
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
u.buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
@@ -1057,9 +1060,9 @@ blake64_4way_close( blake_4way_big_context *sc,
{
sc->T0 -= 1024 - bit_len;
}
if ( ptr <= (96 >> 3) )
if ( ptr <= 104 )
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (96-ptr) >> 3 );
memset_zero_m256i( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
@@ -1070,11 +1073,13 @@ blake64_4way_close( blake_4way_big_context *sc,
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (127 - ptr) >> 3 );
memset_zero_m256i( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
@@ -1089,6 +1094,7 @@ blake64_4way_close( blake_4way_big_context *sc,
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
}
out = (__m256i*)dst;

View File

@@ -62,9 +62,9 @@ extern "C"{
#ifdef __AVX__
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
size_t ptr;
__m128i H[8];
__m128i S[4];
size_t ptr;
sph_u32 T0, T1;
} blake_4way_small_context;
@@ -82,13 +82,13 @@ void blake256_4way_addbits_and_close(
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
size_t ptr;
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
} blake_4way_big_context;
typedef blake_4way_big_context blake512_avx2_context;
typedef blake_4way_big_context blake512_4way_context;
void blake512_4way_init(void *cc);
void blake512_4way(void *cc, const void *data, size_t len);

153
algo/blake/decred-4way.c Normal file
View File

@@ -0,0 +1,153 @@
#include "decred-gate.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <unistd.h>
#if defined (DECRED_4WAY)
static __thread blake256_4way_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash_4way( void *state, const void *input )
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx __attribute__ ((aligned (64)));
sph_blake256_context ctx2 __attribute__ ((aligned (64)));
uint32_t hash[16] __attribute__ ((aligned (64)));
uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
m128_deinterleave_4x32( sin0, sin1, sin2, sin3, (uint32_t*)input, 180*8 );
void *tail = input + DECRED_MIDSTATE_LEN;
int tail_len = 180 - DECRED_MIDSTATE_LEN;
// #define MIDSTATE_LEN 128
/*
uint8_t *ending = (uint8_t*) input;
ending += MIDSTATE_LEN;
if ( !ctx_midstate_done )
{
blake256_4way_init( &blake_mid );
blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
ctx_midstate_done = true;
}
memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
blake256_4way( &ctx, tail, tail_len );
blake256_4way_close( &ctx, vhash );
*/
sph_blake256_init( &ctx2 );
sph_blake256( &ctx2, sin0, 180 );
sph_blake256_close( &ctx2, hash );
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 180 );
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
/*
for ( int i = 0; i < 8; i++ )
if ( hash[i] != hash0[i] )
printf(" hash mismatch, i = %u\n",i);
printf("hash: %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
*(hash+2), *(hash+3) );
printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
*(hash0+2), *(hash0+3) );
printf("\n");
*/
// memcpy( state, hash0, 32 );
// memcpy( state+32, hash1, 32 );
// memcpy( state+64, hash1, 32 );
// memcpy( state+96, hash1, 32 );
memcpy( state, hash, 32 );
}
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t vdata[45*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[48];
// uint32_t _ALIGN(64) hash32[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// #define DCR_NONCE_OFT32 35
ctx_midstate_done = false;
// memcpy(endiandata, pdata, 180);
m128_interleave_4x32( vdata, pdata, pdata, pdata, pdata, 180*8 );
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
found[0] = found[1] = found[2] = found[3] = false;
* noncep = n;
*(noncep+2) = n+1;
*(noncep+4) = n+2;
*(noncep+6) = n+3;
decred_hash_4way( hash, vdata );
// endiandata[DCR_NONCE_OFT32] = n;
// decred_hash(hash32, endiandata);
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
{
work_set_target_ratio( work, hash );
found[0] = true;
num_found++;
nonces[0] = n;
pdata[DECRED_NONCE_INDEX] = n;
}
/* if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
work_set_target_ratio( work, hash+8 );
found[1] = true;
num_found++;
nonces[1] = n;
}
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
work_set_target_ratio( work, hash+16 );
found[2] = true;
num_found++;
nonces[2] = n;
}
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
work_set_target_ratio( work, hash+24 );
found[3] = true;
num_found++;
nonces[3] = n;
}
*/
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

176
algo/blake/decred-gate.c Normal file
View File

@@ -0,0 +1,176 @@
#include "decred-gate.h"
#include <unistd.h>
#include <memory.h>
#include <string.h>
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
}
double decred_calc_network_diff( struct work* work )
{
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
uint32_t bits = ( nbits & 0xffffff );
int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
int m;
double d = (double)0x0000ffff / (double)bits;
for ( m = shift; m < 29; m++ )
d *= 256.0;
for ( m = 29; m < shift; m++ )
d /= 256.0;
if ( shift == 28 )
d *= 256.0; // testnet
if ( opt_debug_diff )
applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
shift, bits );
return net_diff;
}
void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
{
// some random extradata to make the work unique
work->data[ DECRED_XNONCE_INDEX ] = (rand()*4);
work->height = work->data[32];
if (!have_longpoll && work->height > *net_blocks + 1)
{
char netinfo[64] = { 0 };
if (opt_showdiff && net_diff > 0.)
{
if (net_diff != work->targetdiff)
sprintf(netinfo, ", diff %.3f, target %.1f", net_diff,
work->targetdiff);
else
sprintf(netinfo, ", diff %.3f", net_diff);
}
applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
netinfo);
*net_blocks = work->height - 1;
}
}
void decred_be_build_stratum_request( char *req, struct work *work,
struct stratum_ctx *sctx )
{
unsigned char *xnonce2str;
uint32_t ntime, nonce;
char ntimestr[9], noncestr[9];
be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
sctx->xnonce1_size );
snprintf( req, JSON_BUF_LEN,
"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
#define min(a,b) (a>b ? (b) :(a))
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
{
uchar merkle_root[64] = { 0 };
uint32_t extraheader[32] = { 0 };
int headersize = 0;
uint32_t* extradata = (uint32_t*) sctx->xnonce1;
size_t t;
int i;
// getwork over stratum, getwork merkle + header passed in coinb1
memcpy(merkle_root, sctx->job.coinbase, 32);
headersize = min((int)sctx->job.coinbase_size - 32,
sizeof(extraheader) );
memcpy( extraheader, &sctx->job.coinbase[32], headersize );
// Increment extranonce2
for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
// Assemble block header
memset( g_work->data, 0, sizeof(g_work->data) );
g_work->data[0] = le32dec( sctx->job.version );
for ( i = 0; i < 8; i++ )
g_work->data[1 + i] = swab32(
le32dec( (uint32_t *) sctx->job.prevhash + i ) );
for ( i = 0; i < 8; i++ )
g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
// for ( i = 0; i < 8; i++ ) // prevhash
// g_work->data[1 + i] = swab32( g_work->data[1 + i] );
// for ( i = 0; i < 8; i++ ) // merkle
// g_work->data[9 + i] = swab32( g_work->data[9 + i] );
for ( i = 0; i < headersize/4; i++ ) // header
g_work->data[17 + i] = extraheader[i];
// extradata
for ( i = 0; i < sctx->xnonce1_size/4; i++ )
g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
g_work->data[i] = 0;
g_work->data[37] = (rand()*4) << 8;
// block header suffix from coinb2 (stake version)
memcpy( &g_work->data[44],
&sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
sctx->bloc_height = g_work->data[32];
//applog_hex(work->data, 180);
//applog_hex(&work->data[36], 36);
}
#undef min
bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
int thr_id )
{
if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) )
// need to regen g_work..
return false;
if ( have_stratum && !work->data[0] && !opt_benchmark )
{
sleep(1);
return false;
}
// extradata: prevent duplicates
work->data[ DECRED_XNONCE_INDEX ] += 1;
work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
return true;
}
bool register_decred_algo( algo_gate_t* gate )
{
#if defined(DECRED_4WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->scanhash = (void*)&scanhash_decred_4way;
gate->hash = (void*)&decred_hash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_decred;
gate->hash = (void*)&decred_hash;
#endif
// gate->optimizations = SSE2_OPT;
// gate->scanhash = (void*)&scanhash_decred;
// gate->hash = (void*)&decred_hash;
gate->get_nonceptr = (void*)&decred_get_nonceptr;
gate->get_max64 = (void*)&get_max64_0x3fffffLL;
gate->display_extra_data = (void*)&decred_decode_extradata;
gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
gate->work_decode = (void*)&std_be_work_decode;
gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
gate->build_extraheader = (void*)&decred_build_extraheader;
gate->ready_to_mine = (void*)&decred_ready_to_mine;
gate->nbits_index = DECRED_NBITS_INDEX;
gate->ntime_index = DECRED_NTIME_INDEX;
gate->nonce_index = DECRED_NONCE_INDEX;
gate->work_data_size = DECRED_DATA_SIZE;
gate->work_cmp_size = DECRED_WORK_COMPARE_SIZE;
allow_mininginfo = false;
have_gbt = false;
return true;
}

36
algo/blake/decred-gate.h Normal file
View File

@@ -0,0 +1,36 @@
#ifndef __DECRED_GATE_H__
#define __DECRED_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
#define DECRED_MIDSTATE_LEN 128
#if defined (__AVX2__)
//void blakehash_84way(void *state, const void *input);
//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
#endif
#if defined(FOUR_WAY) && defined(__AVX__)
#define DECRED_4WAY
#endif
#if defined (DECRED_4WAY)
void decred_hash_4way(void *state, const void *input);
int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void decred_hash( void *state, const void *input );
int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "decred-gate.h"
#include "sph_blake.h"
#include <string.h>
@@ -14,33 +14,33 @@
#define max(a,b) (a<b ? b : a)
#endif
*/
/*
#define DECRED_NBITS_INDEX 29
#define DECRED_NTIME_INDEX 34
#define DECRED_NONCE_INDEX 35
#define DECRED_XNONCE_INDEX 36
#define DECRED_DATA_SIZE 192
#define DECRED_WORK_COMPARE_SIZE 140
*/
static __thread sph_blake256_context blake_mid;
static __thread bool ctx_midstate_done = false;
void decred_hash(void *state, const void *input)
{
#define MIDSTATE_LEN 128
// #define MIDSTATE_LEN 128
sph_blake256_context ctx __attribute__ ((aligned (64)));
uint8_t *ending = (uint8_t*) input;
ending += MIDSTATE_LEN;
ending += DECRED_MIDSTATE_LEN;
if (!ctx_midstate_done) {
sph_blake256_init(&blake_mid);
sph_blake256(&blake_mid, input, MIDSTATE_LEN);
sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
ctx_midstate_done = true;
}
memcpy(&ctx, &blake_mid, sizeof(blake_mid));
sph_blake256(&ctx, ending, (180 - MIDSTATE_LEN));
sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
sph_blake256_close(&ctx, state);
}
@@ -59,9 +59,9 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
#define DCR_NONCE_OFT32 35
// #define DCR_NONCE_OFT32 35
const uint32_t first_nonce = pdata[DCR_NONCE_OFT32];
const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t n = first_nonce;
@@ -81,7 +81,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
do {
//be32enc(&endiandata[DCR_NONCE_OFT32], n);
endiandata[DCR_NONCE_OFT32] = n;
endiandata[DECRED_NONCE_INDEX] = n;
decred_hash(hash32, endiandata);
if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
@@ -92,7 +92,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
applog_hash(ptarget);
applog_compare_hash(hash32, ptarget);
#endif
pdata[DCR_NONCE_OFT32] = n;
pdata[DECRED_NONCE_INDEX] = n;
return 1;
}
@@ -101,10 +101,11 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[DCR_NONCE_OFT32] = n;
pdata[DECRED_NONCE_INDEX] = n;
return 0;
}
/*
uint32_t *decred_get_nonceptr( uint32_t *work_data )
{
return &work_data[ DECRED_NONCE_INDEX ];
@@ -172,7 +173,7 @@ void decred_be_build_stratum_request( char *req, struct work *work,
rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
free(xnonce2str);
}
*/
/*
// data shared between gen_merkle_root and build_extraheader.
__thread uint32_t decred_extraheader[32] = { 0 };
@@ -188,7 +189,7 @@ void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
}
*/
/*
#define min(a,b) (a>b ? (b) :(a))
void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -282,4 +283,4 @@ bool register_decred_algo( algo_gate_t* gate )
have_gbt = false;
return true;
}
*/

View File

@@ -0,0 +1,206 @@
#include "pentablake-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "blake-hash-4way.h"
#include "sph_blake.h"
//#define DEBUG_ALGO
#ifdef PENTABLAKE_4WAY
extern void pentablakehash_4way( void *output, const void *input )
{
unsigned char _ALIGN(32) hash[128];
// // same as uint32_t hashA[16], hashB[16];
// #define hashB hash+64
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx;
blake512_4way_init( &ctx );
blake512_4way( &ctx, input, 80 );
blake512_4way_close( &ctx, vhash );
uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
m256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
sph_blake512_context ctx2_blake;
sph_blake512_init(&ctx2_blake);
sph_blake512(&ctx2_blake, sin0, 80);
sph_blake512_close(&ctx2_blake, (void*) hash);
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
uint64_t* hash64 = (uint64_t*)hash;
for( int i = 0; i < 8; i++ )
{
if ( hash0[i] != hash64[i] )
printf("hash mismatch %u\n",i);
}
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
blake512_4way_init( &ctx );
blake512_4way( &ctx, vhash, 64 );
blake512_4way_close( &ctx, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
/*
uint64_t sin0[10] __attribute__ ((aligned (64)));
uint64_t sin1[10] __attribute__ ((aligned (64)));
uint64_t sin2[10] __attribute__ ((aligned (64)));
uint64_t sin3[10] __attribute__ ((aligned (64)));
sph_blake512_context ctx_blake;
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, hash);
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash);
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash);
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash);
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, hash);
memcpy(output, hash, 32);
*/
}
int scanhash_pentablake_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[32] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
// uint32_t _ALIGN(32) hash64[8];
// uint32_t _ALIGN(32) endiandata[32];
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
pentablakehash_4way( hash, vdata );
// return immediately on nonce found, only one submit
if ( ( !(hash[7] & mask) ) && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( (! ((hash+8)[7] & mask) ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( ( !((hash+16)[7] & mask) ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( ( !((hash+24)[7] & mask) ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
n += 4;
} while (n < max_nonce && !work_restart[thr_id].restart);
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif

View File

@@ -0,0 +1,16 @@
#include "pentablake-gate.h"
bool register_pentablake_algo( algo_gate_t* gate )
{
#if defined (PENTABLAKE_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_pentablake_4way;
gate->hash = (void*)&pentablakehash_4way;
#else
gate->scanhash = (void*)&scanhash_pentablake;
gate->hash = (void*)&pentablakehash;
#endif
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -0,0 +1,21 @@
#ifndef __PENTABLAKE_GATE_H__
#define __PENTABLAKE_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX__)
#define PENTABLAKE_4WAY
#endif
#if defined(PENTABLAKE_4WAY)
void pentablakehash_4way( void *state, const void *input );
int scanhash_pentablake_4way( int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done );
#endif
void pentablakehash( void *state, const void *input );
int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "pentablake-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -110,11 +110,3 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
return 0;
}
bool register_pentablake_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_pentablake;
gate->hash = (void*)&pentablakehash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};

View File

@@ -983,9 +983,11 @@ blake64_close(sph_blake_big_context *sc,
u.buf[111] |= 1;
sph_enc64be_aligned(u.buf + 112, th);
sph_enc64be_aligned(u.buf + 120, tl);
blake64(sc, u.buf + ptr, 128 - ptr);
} else {
memset(u.buf + ptr + 1, 0, 127 - ptr);
blake64(sc, u.buf + ptr, 128 - ptr);
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
@@ -994,6 +996,7 @@ blake64_close(sph_blake_big_context *sc,
u.buf[111] = 1;
sph_enc64be_aligned(u.buf + 112, th);
sph_enc64be_aligned(u.buf + 120, tl);
blake64(sc, u.buf, 128);
}
out = dst;

639
algo/jh/jh-hash-4way.c Normal file
View File

@@ -0,0 +1,639 @@
/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
/*
* JH implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef __AVX2__
#include <stddef.h>
#include <string.h>
#include "jh-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
#define SPH_SMALL_FOOTPRINT_JH 1
#endif
#if !defined SPH_JH_64 && SPH_64_TRUE
#define SPH_JH_64 1
#endif
#if !SPH_64
#undef SPH_JH_64
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
* The internal bitslice representation may use either big-endian or
* little-endian (true bitslice operations do not care about the bit
* ordering, and the bit-swapping linear operations in JH happen to
* be invariant through endianness-swapping). The constants must be
* defined according to the chosen endianness; we use some
* byte-swapping macros for that.
*/
#if SPH_LITTLE_ENDIAN
#if SPH_64
#define C64e(x) ((SPH_C64(x) >> 56) \
| ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
| ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
| ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \
| ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \
| ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
| ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
| ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
#define dec64e_aligned sph_dec64le_aligned
#define enc64e sph_enc64le
#endif
#else
#if SPH_64
#define C64e(x) SPH_C64(x)
#define dec64e_aligned sph_dec64be_aligned
#define enc64e sph_enc64be
#endif
#endif
#define Sb(x0, x1, x2, x3, c) \
do { \
__m256i cc = _mm256_set_epi64x( c, c, c, c ); \
x3 = mm256_bitnot( x3 ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_bitnot( x2 ) ) ); \
tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_bitnot( x1 ), x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_bitnot( x3 ) ) ); \
x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
x2 = _mm256_xor_si256( x2, tmp ); \
} while (0)
/*
#define Sb(x0, x1, x2, x3, c) do { \
x3 = ~x3; \
x0 ^= (c) & ~x2; \
tmp = (c) ^ (x0 & x1); \
x0 ^= x2 & x3; \
x3 ^= ~x1 & x2; \
x1 ^= x0 & x2; \
x2 ^= x0 & ~x3; \
x0 ^= x1 | x3; \
x3 ^= x1 & x2; \
x1 ^= tmp & x0; \
x2 ^= tmp; \
} while (0)
*/
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
do { \
x4 = _mm256_xor_si256( x4, x1 ); \
x5 = _mm256_xor_si256( x5, x2 ); \
x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
x7 = _mm256_xor_si256( x7, x0 ); \
x0 = _mm256_xor_si256( x0, x5 ); \
x1 = _mm256_xor_si256( x1, x6 ); \
x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
x3 = _mm256_xor_si256( x3, x4 ); \
} while (0)
/*
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \
x4 ^= x1; \
x5 ^= x2; \
x6 ^= x3 ^ x0; \
x7 ^= x0; \
x0 ^= x5; \
x1 ^= x6; \
x2 ^= x7 ^ x4; \
x3 ^= x4; \
} while (0)
*/
#if SPH_JH_64
static const sph_u64 C[] = {
C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a),
C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231),
C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410),
C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc),
C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0),
C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3),
C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce),
C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23),
C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8),
C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197),
C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95),
C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214),
C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80),
C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4),
C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989),
C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36),
C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7),
C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f),
C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727),
C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b),
C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e),
C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062),
C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984),
C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5),
C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2),
C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f),
C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465),
C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a),
C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1),
C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf),
C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48),
C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0),
C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134),
C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a),
C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff),
C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6),
C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae),
C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567),
C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a),
C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518),
C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446),
C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e),
C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee),
C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001),
C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779),
C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83),
C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a),
C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef),
C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d),
C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65),
C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a),
C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c),
C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d),
C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71),
C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc),
C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0),
C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c),
C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f),
C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751),
C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad),
C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56),
C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6),
C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a),
C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163),
C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826),
C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f),
C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30),
C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a),
C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3),
C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505),
C64e(0xb17681d913326cce), C64e(0x3c175284f805a262),
C64e(0xf42bcbb378471547), C64e(0xff46548223936a48),
C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e),
C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e),
C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd),
C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7),
C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be),
C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de),
C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9),
C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a),
C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
};
#define Ceven_hi(r) (C[((r) << 2) + 0])
#define Ceven_lo(r) (C[((r) << 2) + 1])
#define Codd_hi(r) (C[((r) << 2) + 2])
#define Codd_lo(r) (C[((r) << 2) + 3])
#define S(x0, x1, x2, x3, cb, r) do { \
Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
} while (0)
#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \
Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
} while (0)
#define Wz(x, c, n) \
do { \
__m256i t = _mm256_slli_epi64( _mm256_and_si256(x ## h, (c)), (n) ); \
x ## h = _mm256_or_si256( _mm256_and_si256( \
_mm256_srli_epi64(x ## h, (n)), (c)), t ); \
t = _mm256_slli_epi64( _mm256_and_si256(x ## l, (c)), (n) ); \
x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
} while (0)
/*
#define Wz(x, c, n) do { \
sph_u64 t = (x ## h & (c)) << (n); \
x ## h = ((x ## h >> (n)) & (c)) | t; \
t = (x ## l & (c)) << (n); \
x ## l = ((x ## l >> (n)) & (c)) | t; \
} while (0)
*/
#define W0(x) Wz(x, _mm256_set_epi64x( 0x5555555555555555, \
0x5555555555555555, 0x5555555555555555, 0x5555555555555555 ), 1 )
#define W1(x) Wz(x, _mm256_set_epi64x( 0x3333333333333333, \
0x3333333333333333, 0x3333333333333333, 0x3333333333333333 ), 2 )
#define W2(x) Wz(x, _mm256_set_epi64x( 0x0F0F0F0F0F0F0F0F, \
0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ), 4 )
#define W3(x) Wz(x, _mm256_set_epi64x( 0x00FF00FF00FF00FF, \
0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ), 8 )
#define W4(x) Wz(x, _mm256_set_epi64x( 0x0000FFFF0000FFFF, \
0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ), 16 )
#define W5(x) Wz(x, _mm256_set_epi64x( 0x00000000FFFFFFFF, \
0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF ), 32 )
#define W6(x) \
do { \
__m256i t = x ## h; \
x ## h = x ## l; \
x ## l = t; \
} while (0)
/*
#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1)
#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2)
#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4)
#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8)
#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
#define W6(x) do { \
sph_u64 t = x ## h; \
x ## h = x ## l; \
x ## l = t; \
} while (0)
*/
#define DECL_STATE \
__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
__m256i tmp;
#define READ_STATE(state) do { \
h0h = (state)->H[ 0]; \
h0l = (state)->H[ 1]; \
h1h = (state)->H[ 2]; \
h1l = (state)->H[ 3]; \
h2h = (state)->H[ 4]; \
h2l = (state)->H[ 5]; \
h3h = (state)->H[ 6]; \
h3l = (state)->H[ 7]; \
h4h = (state)->H[ 8]; \
h4l = (state)->H[ 9]; \
h5h = (state)->H[10]; \
h5l = (state)->H[11]; \
h6h = (state)->H[12]; \
h6l = (state)->H[13]; \
h7h = (state)->H[14]; \
h7l = (state)->H[15]; \
} while (0)
#define WRITE_STATE(state) do { \
(state)->H[ 0] = h0h; \
(state)->H[ 1] = h0l; \
(state)->H[ 2] = h1h; \
(state)->H[ 3] = h1l; \
(state)->H[ 4] = h2h; \
(state)->H[ 5] = h2l; \
(state)->H[ 6] = h3h; \
(state)->H[ 7] = h3l; \
(state)->H[ 8] = h4h; \
(state)->H[ 9] = h4l; \
(state)->H[10] = h5h; \
(state)->H[11] = h5l; \
(state)->H[12] = h6h; \
(state)->H[13] = h6l; \
(state)->H[14] = h7h; \
(state)->H[15] = h7l; \
} while (0)
#define INPUT_BUF1 \
__m256i m0h = buf[0]; \
__m256i m0l = buf[1]; \
__m256i m1h = buf[2]; \
__m256i m1l = buf[3]; \
__m256i m2h = buf[4]; \
__m256i m2l = buf[5]; \
__m256i m3h = buf[6]; \
__m256i m3l = buf[7]; \
h0h = _mm256_xor_si256( h0h, m0h ); \
h0l = _mm256_xor_si256( h0l, m0l ); \
h1h = _mm256_xor_si256( h1h, m1h ); \
h1l = _mm256_xor_si256( h1l, m1l ); \
h2h = _mm256_xor_si256( h2h, m2h ); \
h2l = _mm256_xor_si256( h2l, m2l ); \
h3h = _mm256_xor_si256( h3h, m3h ); \
h3l = _mm256_xor_si256( h3l, m3l ); \
#define INPUT_BUF2 \
h4h = _mm256_xor_si256( h4h, m0h ); \
h4l = _mm256_xor_si256( h4l, m0l ); \
h5h = _mm256_xor_si256( h5h, m1h ); \
h5l = _mm256_xor_si256( h5l, m1l ); \
h6h = _mm256_xor_si256( h6h, m2h ); \
h6l = _mm256_xor_si256( h6l, m2l ); \
h7h = _mm256_xor_si256( h7h, m3h ); \
h7l = _mm256_xor_si256( h7l, m3l ); \
static const sph_u64 IV256[] = {
C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477),
C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8),
C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262),
C64e(0x277695f776248f94), C64e(0x87d5b6574780296c),
C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f),
C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769)
};
static const sph_u64 IV512[] = {
C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543),
C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361),
C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80),
C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7),
C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a),
C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199),
C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
};
#else
#endif
#define SL(ro) SLu(r + ro, ro)
#define SLu(r, ro) do { \
S(h0, h2, h4, h6, Ceven_, r); \
S(h1, h3, h5, h7, Codd_, r); \
L(h0, h2, h4, h6, h1, h3, h5, h7); \
W ## ro(h1); \
W ## ro(h3); \
W ## ro(h5); \
W ## ro(h7); \
} while (0)
#if SPH_SMALL_FOOTPRINT_JH
#if SPH_JH_64
/*
* The "small footprint" 64-bit version just uses a partially unrolled
* loop.
*/
#define E8 do { \
unsigned r; \
for (r = 0; r < 42; r += 7) { \
SL(0); \
SL(1); \
SL(2); \
SL(3); \
SL(4); \
SL(5); \
SL(6); \
} \
} while (0)
#else
#endif
#else
#if SPH_JH_64
/*
* On a "true 64-bit" architecture, we can unroll at will.
*/
#define E8 do { \
SLu( 0, 0); \
SLu( 1, 1); \
SLu( 2, 2); \
SLu( 3, 3); \
SLu( 4, 4); \
SLu( 5, 5); \
SLu( 6, 6); \
SLu( 7, 0); \
SLu( 8, 1); \
SLu( 9, 2); \
SLu(10, 3); \
SLu(11, 4); \
SLu(12, 5); \
SLu(13, 6); \
SLu(14, 0); \
SLu(15, 1); \
SLu(16, 2); \
SLu(17, 3); \
SLu(18, 4); \
SLu(19, 5); \
SLu(20, 6); \
SLu(21, 0); \
SLu(22, 1); \
SLu(23, 2); \
SLu(24, 3); \
SLu(25, 4); \
SLu(26, 5); \
SLu(27, 6); \
SLu(28, 0); \
SLu(29, 1); \
SLu(30, 2); \
SLu(31, 3); \
SLu(32, 4); \
SLu(33, 5); \
SLu(34, 6); \
SLu(35, 0); \
SLu(36, 1); \
SLu(37, 2); \
SLu(38, 3); \
SLu(39, 4); \
SLu(40, 5); \
SLu(41, 6); \
} while (0)
#else
#endif
#endif
static void
jh_4way_init( jh_4way_context *sc, const void *iv )
{
uint64_t *v = (uint64_t*)iv;
for ( int i = 0; i < 16; i++ )
sc->H[i] = _mm256_set_epi64x( v[i], v[i], v[i], v[i] );
sc->ptr = 0;
sc->block_count = 0;
}
static void
jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
{
__m256i *buf;
__m256i *vdata = (__m256i*)data;
const int buf_size = 64; // 64 * _m256i
size_t ptr;
DECL_STATE
buf = sc->buf;
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
if ( ptr == buf_size )
{
INPUT_BUF1;
E8;
INPUT_BUF2;
sc->block_count ++;
ptr = 0;
}
}
WRITE_STATE(sc);
sc->ptr = ptr;
}
static void
jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
size_t out_size_w32, const void *iv )
{
__m256i buf[16*4];
__m256i *dst256 = (__m256i*)dst;
size_t numz, u;
sph_u64 l0, l1, l0e, l1e;
buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 );
if ( sc->ptr == 0 )
numz = 48;
else
numz = 112 - sc->ptr;
memset_zero_m256i( buf+1, (numz>>3) - 1 );
l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
l1 = SPH_T64(sc->block_count >> 55);
sph_enc64be( &l0e, l0 );
sph_enc64be( &l1e, l1 );
*(buf + (numz>>3) ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e );
*(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e );
jh_4way_core( sc, buf, numz + 16 );
for ( u=0; u < 8; u++ )
buf[u] = sc->H[u+8];
memcpy_m256i( dst256, buf, 8 );
}
void
jh256_4way_init(void *cc)
{
jh_4way_init(cc, IV256);
}
void
jh256_4way(void *cc, const void *data, size_t len)
{
jh_4way_core(cc, data, len);
}
void
jh256_4way_close(void *cc, void *dst)
{
jh_4way_close(cc, 0, 0, dst, 8, IV256);
}
void
jh512_4way_init(void *cc)
{
jh_4way_init(cc, IV512);
}
void
jh512_4way(void *cc, const void *data, size_t len)
{
jh_4way_core(cc, data, len);
}
void
jh512_4way_close(void *cc, void *dst)
{
jh_4way_close(cc, 0, 0, dst, 16, IV512);
}
#ifdef __cplusplus
}
#endif
#endif

100
algo/jh/jh-hash-4way.h Normal file
View File

@@ -0,0 +1,100 @@
/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
/**
* JH interface. JH is a family of functions which differ by
* their output size; this implementation defines JH for output
* sizes 224, 256, 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_jh.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef JH_HASH_4WAY_H__
#define JH_HASH_4WAY_H__
#ifdef __AVX2__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#define SPH_SIZE_jh256 256
#define SPH_SIZE_jh512 512
/**
* This structure is a context for JH computations: it contains the
* intermediate values and some data from the last entered block. Once
* a JH computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running JH computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
__m256i buf[8] __attribute__ ((aligned (64)));
__m256i H[16];
size_t ptr;
uint64_t block_count;
/*
unsigned char buf[64];
size_t ptr;
union {
sph_u64 wide[16];
} H;
sph_u64 block_count;
*/
} jh_4way_context;
typedef jh_4way_context jh256_4way_context;
typedef jh_4way_context jh512_4way_context;
void jh256_4way_init(void *cc);
void jh256_4way(void *cc, const void *data, size_t len);
void jh256_4way_close(void *cc, void *dst);
void jh512_4way_init(void *cc);
void jh512_4way(void *cc, const void *data, size_t len);
void jh512_4way_close(void *cc, void *dst);
#ifdef __cplusplus
}
#endif
#endif
#endif

228
algo/jh/jha-4way.c Normal file
View File

@@ -0,0 +1,228 @@
#if defined(JHA_4WAY)
#include "jha-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "avxdefs.h"
#include "algo/blake/blake-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
//static __thread keccak512_4way_context jha_kec_mid
// __attribute__ ((aligned (64)));
void jha_hash_4way( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhasha[8*4] __attribute__ ((aligned (64)));
uint64_t vhashb[8*4] __attribute__ ((aligned (64)));
__m256i mask;
__m256i* vh256 = (__m256i*)vhash;
__m256i* vha256 = (__m256i*)vhasha;
__m256i* vhb256 = (__m256i*)vhashb;
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
jh512_4way_context ctx_jh;
skein512_4way_context ctx_skein;
keccak512_4way_context ctx_keccak;
keccak512_4way_init( &ctx_keccak );
keccak512_4way( &ctx_keccak, input, 80 );
keccak512_4way_close( &ctx_keccak, vhash );
// memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
// keccak512_4way( &ctx_keccak, input+64, 16 );
// keccak512_4way_close( &ctx_keccak, vhash );
// Heavy & Light Pair Loop
for ( int round = 0; round < 3; round++ )
{
memset_zero_m256i( vha256, 20 );
memset_zero_m256i( vhb256, 20 );
mask = _mm256_sub_epi64( _mm256_and_si256( vh256[0],
mm256_vec_epi64( 0x1 ) ), mm256_vec_epi64( 0x1 ) );
// groestl (serial) v skein
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
(char*)hash0, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash1,
(char*)hash1, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash2,
(char*)hash2, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(char*)hash3, 512 );
m256_interleave_4x64( vhasha, hash0, hash1, hash2, hash3, 512 );
// skein
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhashb );
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_bitnot(mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
}
// blake v jh
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, vhash, 64 );
blake512_4way_close( &ctx_blake, vhasha );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhashb );
// merge vectored hash
for ( int i = 0; i < 8; i++ )
{
vha256[i] = _mm256_maskload_epi64(
vhasha + i*4, mm256_bitnot(mask ) );
vhb256[i] = _mm256_maskload_epi64(
vhashb + i*4, mask );
vh256[i] = _mm256_or_si256( vha256[i], vhb256[i] );
}
}
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate for keccak
// keccak512_4way_init( &jha_kec_mid );
// keccak512_4way( &jha_kec_mid, vdata, 64 );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
jha_hash_4way( hash, vdata );
pdata[19] = n;
if ( ( !(hash[7] & mask) )
&& fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( !((hash+8)[7] & mask) )
&& fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( !((hash+16)[7] & mask) )
&& fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( !((hash+24)[7] & mask) )
&& fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

18
algo/jh/jha-gate.c Normal file
View File

@@ -0,0 +1,18 @@
#include "jha-gate.h"
bool register_jha_algo( algo_gate_t* gate )
{
//#if defined (JHA_4WAY)
// gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_jha_4way;
// gate->hash = (void*)&jha_hash_4way;
//#else
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_jha;
gate->hash = (void*)&jha_hash;
//#endif
gate->set_target = (void*)&scrypt_set_target;
return true;
};

27
algo/jh/jha-gate.h Normal file
View File

@@ -0,0 +1,27 @@
#ifndef JHA_GATE_H__
#define JHA_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#define JHA_4WAY
#endif
//#if defined JHA_4WAY
//void jha_hash_4way( void *state, const void *input );
//int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
//#else
void jha_hash( void *state, const void *input );
int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
//#endif
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "jha-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -38,7 +38,6 @@ void jha_hash(void *output, const void *input)
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
sph_keccak512_init(&ctx_keccak);
memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
sph_keccak512(&ctx_keccak, input+64, 16 );
sph_keccak512_close(&ctx_keccak, hash );
@@ -154,12 +153,3 @@ int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *ha
return 0;
}
bool register_jha_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_jha;
gate->hash = (void*)&jha_hash;
gate->set_target = (void*)&scrypt_set_target;
return true;
};

View File

@@ -914,6 +914,7 @@ jh_core(sph_jh_context *sc, const void *data, size_t len)
buf = sc->buf;
ptr = sc->ptr;
if (len < (sizeof sc->buf) - ptr) {
memcpy(buf + ptr, data, len);
ptr += len;

View File

@@ -6,7 +6,7 @@
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#ifdef __AVX2__
#ifdef KECCAK_4WAY
void keccakhash_4way(void *state, const void *input)
{
@@ -21,7 +21,7 @@ void keccakhash_4way(void *state, const void *input)
keccak256_4way( &ctx, input, 80 );
keccak256_4way_close( &ctx, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
@@ -33,16 +33,16 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
// const uint32_t Htarg = ptarget[7];
uint32_t endiandata[20];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
@@ -52,11 +52,10 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
m256_interleave_4x64x( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );

View File

@@ -12,7 +12,7 @@ bool register_keccak_algo( algo_gate_t* gate )
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->set_target = (void*)&keccak_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
#if defined (FOUR_WAY) && defined (__AVX2__)
#if defined (KECCAK_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;

View File

@@ -4,7 +4,11 @@
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#if defined(FOUR_WAY) && defined(__AVX2__)
#define KECCAK_4WAY
#endif
#if defined(KECCAK_4WAY)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,

View File

@@ -372,7 +372,7 @@ static const sph_u64 RC[] = {
} while (0)
static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
{
int i;
for (i = 0; i < 25; i ++)
@@ -386,7 +386,7 @@ static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->ptr = 0;
kc->lim = 200 - (lim >> 2);
kc->lim = 200 - (out_size >> 2);
}
static void
@@ -396,6 +396,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
__m256i *buf;
__m256i *vdata = (__m256i*)data;
size_t ptr;
DECL_STATE
buf = kc->buf;
ptr = kc->ptr;
@@ -407,6 +408,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
return;
}
READ_STATE( kc );
while ( len > 0 )
{
size_t clen;
@@ -425,6 +427,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
ptr = 0;
}
}
WRITE_STATE( kc );
kc->ptr = ptr;
}
@@ -440,12 +443,11 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t m256_len = byte_len >> 3;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 1) )
if ( kc->ptr == (lim - 8) )
{
uint64_t t = eb | 0x80;
uint64_t t = eb | 0x8000000000000000;
u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
j = 1;
j = 8;
}
else
{

View File

@@ -1,581 +0,0 @@
#include <stddef.h>
#include "keccak-hash-4way.h"
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
// change u.wide and u.narrow to just w, ie kc->w
#define a00 (kc->w[ 0])
#define a10 (kc->w[ 1])
#define a20 (kc->w[ 2])
#define a30 (kc->w[ 3])
#define a40 (kc->w[ 4])
#define a01 (kc->w[ 5])
#define a11 (kc->w[ 6])
#define a21 (kc->w[ 7])
#define a31 (kc->w[ 8])
#define a41 (kc->w[ 9])
#define a02 (kc->w[10])
#define a12 (kc->w[11])
#define a22 (kc->w[12])
#define a32 (kc->w[13])
#define a42 (kc->w[14])
#define a03 (kc->w[15])
#define a13 (kc->w[16])
#define a23 (kc->w[17])
#define a33 (kc->w[18])
#define a43 (kc->w[19])
#define a04 (kc->w[20])
#define a14 (kc->w[21])
#define a24 (kc->w[22])
#define a34 (kc->w[23])
#define a44 (kc->w[24])
// null when no copy
#define DECL_STATE
#define READ_STATE(sc)
#define WRITE_STATE(sc)
#define INPUT_BUF(size) do { \
size_t j; \
for (j = 0; j < (size>>3); j++ ) \
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
// keccak256 4way uses 136 with 32 bit size
// keccak256 8way and keccak512 4 way use 72 with 64 bit size
//#define INPUT_BUF144 INPUT_BUF(144)
//#define INPUT_BUF136 INPUT_BUF(136)
//#define INPUT_BUF104 INPUT_BUF(104)
//#define INPUT_BUF72 INPUT_BUF(72)
//simply redefine these macros to do simd
#define mm256_neg1 \
(_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
0xffffffffffffffff, 0xffffffffffffffff ) )
#define DECL64(x) __m256i x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64
/*
#define DECL64(x) sph_u64 x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = a ^ b)
#define AND64(d, a, b) (d = a & b)
#define OR64(d, a, b) (d = a | b)
#define NOT64(d, s) (d = SPH_T64(~s))
#define ROL64(d, v, n) (d = SPH_ROTL64(v, n))
#define XOR64_IOTA XOR64
*/
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
DECL64(tt2); \
DECL64(tt3); \
uint64_t *ttx = (uint64_t*)&tt0; \
uint64_t *d0x = (uint64_t*)&d0; \
uint64_t *d1x = (uint64_t*)&d1; \
uint64_t *d2x = (uint64_t*)&d2; \
uint64_t *d3x = (uint64_t*)&d3; \
uint64_t *d4x = (uint64_t*)&d4; \
XOR64(tt0, d0, d1); \
if (vtp) {printf("Velt0\n"); \
printf("d0= %016llx\n",*d0x ); \
printf("d1= %016llx\n",*d1x ); \
printf("d2= %016llx\n",*d2x ); \
printf("d3= %016llx\n",*d3x ); \
printf("d4= %016llx\n",*d4x ); \
printf("tt0= %016llx\n",*ttx );} \
XOR64(tt1, d2, d3); \
XOR64(tt0, tt0, d4); \
XOR64(tt0, tt0, tt1); \
if(vtp){\
printf("tt0= %016llx\n",*ttx );} \
ROL64(tt0, tt0, 1); \
if(vtp){\
printf("tt0= %016llx\n",*ttx );} \
XOR64(tt2, c0, c1); \
XOR64(tt3, c2, c3); \
XOR64(tt0, tt0, c4); \
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
int vtp = 0;
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(t0); \
DECL64(t1); \
DECL64(t2); \
DECL64(t3); \
DECL64(t4); \
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
if(vtp){printf("Velt0\n");\
uint64_t *tx = (uint64_t*)&t0; \
printf("t0= %016llx\n",tx );} \
vtp=0; \
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
XOR64(b00, b00, t0); \
XOR64(b01, b01, t0); \
XOR64(b02, b02, t0); \
XOR64(b03, b03, t0); \
XOR64(b04, b04, t0); \
XOR64(b10, b10, t1); \
XOR64(b11, b11, t1); \
XOR64(b12, b12, t1); \
XOR64(b13, b13, t1); \
XOR64(b14, b14, t1); \
XOR64(b20, b20, t2); \
XOR64(b21, b21, t2); \
XOR64(b22, b22, t2); \
XOR64(b23, b23, t2); \
XOR64(b24, b24, t2); \
XOR64(b30, b30, t3); \
XOR64(b31, b31, t3); \
XOR64(b32, b32, t3); \
XOR64(b33, b33, t3); \
XOR64(b34, b34, t3); \
XOR64(b40, b40, t4); \
XOR64(b41, b41, t4); \
XOR64(b42, b42, t4); \
XOR64(b43, b43, t4); \
XOR64(b44, b44, t4); \
} while (0)
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
/* ROL64(b00, b00, 0); */ \
ROL64(b01, b01, 36); \
ROL64(b02, b02, 3); \
ROL64(b03, b03, 41); \
ROL64(b04, b04, 18); \
ROL64(b10, b10, 1); \
ROL64(b11, b11, 44); \
ROL64(b12, b12, 10); \
ROL64(b13, b13, 45); \
ROL64(b14, b14, 2); \
ROL64(b20, b20, 62); \
ROL64(b21, b21, 6); \
ROL64(b22, b22, 43); \
ROL64(b23, b23, 15); \
ROL64(b24, b24, 61); \
ROL64(b30, b30, 28); \
ROL64(b31, b31, 55); \
ROL64(b32, b32, 25); \
ROL64(b33, b33, 21); \
ROL64(b34, b34, 56); \
ROL64(b40, b40, 27); \
ROL64(b41, b41, 20); \
ROL64(b42, b42, 39); \
ROL64(b43, b43, 8); \
ROL64(b44, b44, 14); \
} while (0)
/*
* The KHI macro integrates the "lane complement" optimization. On input,
* some words are complemented:
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
* On output, the following words are complemented:
* a04 a10 a20 a22 a23 a31
*
* The (implicit) permutation and the theta expansion will bring back
* the input mask for the next round.
*/
#define KHI_XO(d, a, b, c) do { \
DECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI_XA(d, a, b, c) do { \
DECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(c0); \
DECL64(c1); \
DECL64(c2); \
DECL64(c3); \
DECL64(c4); \
DECL64(bnn); \
NOT64(bnn, b20); \
KHI_XO(c0, b00, b10, b20); \
KHI_XO(c1, b10, bnn, b30); \
KHI_XA(c2, b20, b30, b40); \
KHI_XO(c3, b30, b40, b00); \
KHI_XA(c4, b40, b00, b10); \
MOV64(b00, c0); \
MOV64(b10, c1); \
MOV64(b20, c2); \
MOV64(b30, c3); \
MOV64(b40, c4); \
NOT64(bnn, b41); \
KHI_XO(c0, b01, b11, b21); \
KHI_XA(c1, b11, b21, b31); \
KHI_XO(c2, b21, b31, bnn); \
KHI_XO(c3, b31, b41, b01); \
KHI_XA(c4, b41, b01, b11); \
MOV64(b01, c0); \
MOV64(b11, c1); \
MOV64(b21, c2); \
MOV64(b31, c3); \
MOV64(b41, c4); \
NOT64(bnn, b32); \
KHI_XO(c0, b02, b12, b22); \
KHI_XA(c1, b12, b22, b32); \
KHI_XA(c2, b22, bnn, b42); \
KHI_XO(c3, bnn, b42, b02); \
KHI_XA(c4, b42, b02, b12); \
MOV64(b02, c0); \
MOV64(b12, c1); \
MOV64(b22, c2); \
MOV64(b32, c3); \
MOV64(b42, c4); \
NOT64(bnn, b33); \
KHI_XA(c0, b03, b13, b23); \
KHI_XO(c1, b13, b23, b33); \
KHI_XO(c2, b23, bnn, b43); \
KHI_XA(c3, bnn, b43, b03); \
KHI_XO(c4, b43, b03, b13); \
MOV64(b03, c0); \
MOV64(b13, c1); \
MOV64(b23, c2); \
MOV64(b33, c3); \
MOV64(b43, c4); \
NOT64(bnn, b14); \
KHI_XA(c0, b04, bnn, b24); \
KHI_XO(c1, bnn, b24, b34); \
KHI_XA(c2, b24, b34, b44); \
KHI_XO(c3, b34, b44, b04); \
KHI_XA(c4, b44, b04, b14); \
MOV64(b04, c0); \
MOV64(b14, c1); \
MOV64(b24, c2); \
MOV64(b34, c3); \
MOV64(b44, c4); \
} while (0)
#define IOTA(r) XOR64_IOTA(a00, a00, r)
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
#define P8_TO_P0 do { \
DECL64(t); \
MOV64(t, a01); \
MOV64(a01, a11); \
MOV64(a11, a43); \
MOV64(a43, t); \
MOV64(t, a02); \
MOV64(a02, a22); \
MOV64(a22, a31); \
MOV64(a31, t); \
MOV64(t, a03); \
MOV64(a03, a33); \
MOV64(a33, a24); \
MOV64(a24, t); \
MOV64(t, a04); \
MOV64(a04, a44); \
MOV64(a44, a12); \
MOV64(a12, t); \
MOV64(t, a10); \
MOV64(a10, a32); \
MOV64(a32, a13); \
MOV64(a13, t); \
MOV64(t, a14); \
MOV64(a14, a21); \
MOV64(a21, a20); \
MOV64(a20, t); \
MOV64(t, a23); \
MOV64(a23, a42); \
MOV64(a42, a40); \
MOV64(a40, t); \
MOV64(t, a30); \
MOV64(a30, a41); \
MOV64(a41, a34); \
MOV64(a34, t); \
} while (0)
#define LPAR (
#define RPAR )
#define KF_ELT(r, s, k) do { \
if(r==0){ vtp=1; printf("Vtheo0\n");}\
THETA LPAR P ## r RPAR; \
if(vtp=1){ \
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
vtp=0; \
RHO LPAR P ## r RPAR; \
if(r==0){ printf("Vrho0\n");\
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
KHI LPAR P ## s RPAR; \
if(r==0){ printf("Vkhi0\n");\
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
IOTA(k); \
} while (0)
#define DO(x) x
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
#define KECCAK_F_1600_ do { \
int j; \
for (j = 0; j < 24; j += 8) \
{ \
KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
RC[j + 0], RC[j + 0])) ); \
KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
RC[j + 1], RC[j + 1])) ); \
KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
RC[j + 2], RC[j + 2])) ); \
KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
RC[j + 3], RC[j + 3])) ); \
KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
RC[j + 4], RC[j + 4])) ); \
KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
RC[j + 5], RC[j + 5])) ); \
KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
RC[j + 6], RC[j + 6])) ); \
KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
RC[j + 7], RC[j + 7])) ); \
P8_TO_P0; \
} \
} while (0)
static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
{
int i;
for (i = 0; i < 25; i ++)
kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1;
kc->w[ 2] = mm256_neg1;
kc->w[ 8] = mm256_neg1;
kc->w[12] = mm256_neg1;
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->ptr = 0;
kc->lim = 200 - (lim >> 2);
}
static void
keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
size_t lim )
{
__m256i *buf;
__m256i *vdata = (__m256i*)data;
size_t ptr;
buf = kc->buf;
ptr = kc->ptr;
uint64_t *W = (uint64_t*)(kc->w);
if ( len < (lim - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
kc->ptr = ptr + len;
return;
}
while ( len > 0 )
{
size_t clen;
clen = (lim - ptr);
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == lim )
{
INPUT_BUF( lim );
printf("Vtransform before ptr= %u, len= %u, lim= %u\n",ptr, len, lim);
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] );
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] );
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] );
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );
KECCAK_F_1600;
//printf("Vtransform after ptr= %u, len= %u, lim= %u\n",ptr, len, lim);
//printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] );
//printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] );
ptr = 0;
}
}
kc->ptr = ptr;
}
// keccak512 4way d=64 lim=72, keccak256 8way d=32 lim=136
// keccak256 4way d=32 lim=136
// keccak512 d=64, lim=72, keccak256 d=32, lim=136
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t d,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
sph_u64 dummy; /* for alignment */
} u;
size_t j;
// int d = 64;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 1) )
{
uint64_t t = eb | 0x80;
u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
j = 1;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
memset_zero_m256i( u.tmp + 1, (j>>3) - 1 );
u.tmp[j - 1] = _mm256_set_epi64x( 0x8000000000000000,
0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
}
keccak64_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
NOT64( kc->w[ 8], kc->w[ 8] );
NOT64( kc->w[12], kc->w[12] );
NOT64( kc->w[17], kc->w[17] );
NOT64( kc->w[20], kc->w[20] );
for ( j = 0; j < d; j += 8 )
u.tmp[j] = kc->w[j>>3];
memcpy_m256i( dst, u.tmp, d>>3 );
}
void keccak256_4way_init( void *kc )
{
keccak64_init( kc, 256 );
}
void
keccak256_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 136);
}
void
keccak256_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 32, 136);
}
void keccak512_4way_init( void *kc )
{
keccak64_init( kc, 512 );
}
void
keccak512_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 72);
}
void
keccak512_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 64, 72);
}

View File

@@ -40,6 +40,8 @@
extern "C"{
#endif
#ifdef __AVX2__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
@@ -62,8 +64,6 @@ extern "C"{
* <code>memcpy()</code>).
*/
#ifdef __AVX2__
typedef struct {
__m256i buf[144*8]; /* first field, for alignment */
__m256i w[25];

File diff suppressed because it is too large Load Diff

View File

@@ -197,6 +197,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
sph_tiger( &ctx1.tiger, data, M7_MIDSTATE_LEN );
sph_ripemd160( &ctx1.ripemd, data, M7_MIDSTATE_LEN );
// the following calculations can be performed once and the results shared
mpz_t magipi, magisw, product, bns0, bns1;
mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
@@ -221,6 +222,9 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );
// with 4 way can a single midstate be shared among lanes?
// do sinlge round of midstate and inyerleave for final
#ifndef USE_SPH_SHA
SHA256_Update( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 );
@@ -249,6 +253,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );
// 4 way serial
mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
mpz_set(bns1, bns0);
mpz_set(product, bns0);
@@ -274,6 +279,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
sph_sha256_close( &ctxf_sha256, (void*)(hash) );
#endif
// do once and share
digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
mpf_set_prec_raw(magifpi, prec);
@@ -296,7 +302,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
mpz_set_f(magipi, magifpi);
mpz_add(magipi,magipi,magisw);
mpz_add(product,product,magipi);
// share magipi, product and do serial
mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
mpz_add(bns1, bns1, bns0);
mpz_mul(product,product,bns1);
@@ -317,6 +323,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
#endif
}
// this is the scanhash part
const unsigned char *hash_ = (const unsigned char *)hash;
const unsigned char *target_ = (const unsigned char *)ptarget;
for ( i = 31; i >= 0; i-- )
@@ -346,6 +353,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
pdata[19] = n;
// do this in hashm7m
out:
mpf_set_prec_raw(magifpi, prec0);
mpf_set_prec_raw(magifpi0, prec0);

178
algo/nist5/nist5-4way.c Normal file
View File

@@ -0,0 +1,178 @@
#include "nist5-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#if defined(NIST5_4WAY)
#include "algo/blake/blake-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
// no improvement with midstate
//static __thread blake512_4way_context ctx_mid;
void nist5hash_4way( void *output, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
blake512_4way_context ctx_blake;
hashState_groestl ctx_groestl;
jh512_4way_context ctx_jh;
skein512_4way_context ctx_skein;
keccak512_4way_context ctx_keccak;
// memcpy( &ctx_blake, &ctx_mid, sizeof(ctx_mid) );
// blake512_4way( &ctx_blake, input + (64<<2), 16 );
blake512_4way_init( &ctx_blake );
blake512_4way( &ctx_blake, input, 80 );
blake512_4way_close( &ctx_blake, vhash );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash0,
(const char*)hash0, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash1,
(const char*)hash1, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash2,
(const char*)hash2, 512 );
init_groestl( &ctx_groestl, 64 );
update_and_final_groestl( &ctx_groestl, (char*)hash3,
(const char*)hash3, 512 );
m256_interleave_4x64x( vhash, hash0, hash1, hash2, hash3, 512 );
jh512_4way_init( &ctx_jh );
jh512_4way( &ctx_jh, vhash, 64 );
jh512_4way_close( &ctx_jh, vhash );
keccak512_4way_init( &ctx_keccak );
keccak512_4way( &ctx_keccak, vhash, 64 );
keccak512_4way_close( &ctx_keccak, vhash );
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, vhash, 64 );
skein512_4way_close( &ctx_skein, vhash );
m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( output, hash0, 32 );
memcpy( output+32, hash1, 32 );
memcpy( output+64, hash2, 32 );
memcpy( output+96, hash3, 32 );
}
int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
// we need bigendian data...
swab32_array( endiandata, pdata, 20 );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate
// blake512_4way_init( &ctx_mid );
// blake512_4way( &ctx_mid, vdata, 64 );
for ( int m=0; m < 6; m++ )
{
if (Htarg <= htmax[m])
{
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
nist5hash_4way( hash, vdata );
pdata[19] = n;
if ( ( !(hash[7] & mask) )
&& fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( ( !((hash+8)[7] & mask) )
&& fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( ( !((hash+16)[7] & mask) )
&& fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( ( !((hash+24)[7] & mask) )
&& fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
break;
}
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

17
algo/nist5/nist5-gate.c Normal file
View File

@@ -0,0 +1,17 @@
#include "nist5-gate.h"
bool register_nist5_algo( algo_gate_t* gate )
{
#if defined (NIST5_4WAY)
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_nist5_4way;
gate->hash = (void*)&nist5hash_4way;
#else
gate->optimizations = SSE2_OPT | AES_OPT;
init_nist5_ctx();
gate->scanhash = (void*)&scanhash_nist5;
gate->hash = (void*)&nist5hash;
#endif
return true;
};

26
algo/nist5/nist5-gate.h Normal file
View File

@@ -0,0 +1,26 @@
#ifndef __NIST5_GATE_H__
#define __NIST5_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#define NIST5_4WAY
#endif
#if defined(NIST5_4WAY)
void nist5hash_4way( void *state, const void *input );
int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#else
void nist5hash( void *state, const void *input );
int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "nist5-gate.h"
#include <stdlib.h>
#include <stdint.h>
@@ -147,7 +147,7 @@ int scanhash_nist5(int thr_id, struct work *work,
pdata[19] = n;
return 0;
}
/*
bool register_nist5_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT;
@@ -156,4 +156,4 @@ bool register_nist5_algo( algo_gate_t* gate )
gate->hash = (void*)&nist5hash;
return true;
};
*/

View File

@@ -1,125 +0,0 @@
#include "polytimos-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#endif
/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
typedef struct {
sph_skein512_context skein;
sph_luffa512_context luffa;
// hashState_luffa luffa;
//#ifdef NO_AES_NI
sph_echo512_context echo;
//#else
// hashState_echo echo;
//#endif
sph_shabal512_context shabal;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_context_holder;
static __thread poly_context_holder poly_ctx __attribute__ ((aligned (64)));
void init_polytimos_context()
{
sph_skein512_init(&poly_ctx.skein);
sph_shabal512_init(&poly_ctx.shabal);
//#ifdef NO_AES_NI
sph_echo512_init(&poly_ctx.echo);
//#else
// init_echo( &poly_ctx.echo, 512 );
//#endif
// init_luffa( &poly_ctx.luffa, 512 );
sph_luffa512_init(&poly_ctx.luffa);
sph_fugue512_init(&poly_ctx.fugue);
sph_gost512_init(&poly_ctx.gost);
}
void polytimos_hash(void *output, const void *input)
{
poly_context_holder ctx __attribute__ ((aligned (64)));
uint32_t hashA[16]__attribute__ ((aligned (64)));
memcpy( &ctx, &poly_ctx, sizeof(poly_ctx) );
sph_skein512(&ctx.skein, input, 80);
sph_skein512_close(&ctx.skein, hashA);
sph_shabal512(&ctx.shabal, hashA, 64);
sph_shabal512_close(&ctx.shabal, hashA);
//#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hashA, 64);
sph_echo512_close(&ctx.echo, hashA);
//#else
// update_final_echo ( &ctx.echo, (BitSequence *)hashA,
// (const BitSequence *)hashA, 512 );
//#endif
// update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
// (const BitSequence*)hashA, 64 );
sph_luffa512(&ctx.luffa, hashA, 64);
sph_luffa512_close(&ctx.luffa, hashA);
sph_fugue512(&ctx.fugue, hashA, 64);
sph_fugue512_close(&ctx.fugue, hashA);
sph_gost512(&ctx.gost, hashA, 64);
sph_gost512_close(&ctx.gost, hashA);
memcpy(output, hashA, 32);
}
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if (opt_benchmark)
ptarget[7] = 0x0cff;
// we need bigendian data...
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
do {
be32enc(&endiandata[19], nonce);
polytimos_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -1,4 +1,3 @@
#include "algo-gate-api.h"
#include "skein-gate.h"
#include <string.h>
#include <stdint.h>
@@ -60,7 +59,7 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
int num_found = 0;
// data is 80 bytes, 20 u32 or 4 u64.
@@ -76,7 +75,6 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
do
{
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );

View File

@@ -1,7 +1,4 @@
#include "skein-gate.h"
#include "algo-gate-api.h"
//#include <string.h>
//#include <stdint.h>
#include "sph_skein.h"
#include "skein-hash-4way.h"
@@ -9,7 +6,7 @@ int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
#if defined (FOUR_WAY) && defined (__AVX2__)
#if defined (SKEIN_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;

View File

@@ -1,15 +1,22 @@
#ifndef __SKEIN_GATE_H__
#define __SKEIN_GATE_H__
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX2__)
#if defined(FOUR_WAY) && defined(__AVX2__)
#define SKEIN_4WAY
#endif
#if defined(SKEIN_4WAY)
void skeinhash_4way( void *output, const void *input );
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void skeinhash( void *output, const void *input );
int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );

View File

@@ -296,7 +296,6 @@ do { \
w5 = _mm256_add_epi64( w5, _mm256_add_epi64( SKBI(k,s,5), \
_mm256_set_epi64x( SKBT(t,s,0), SKBT(t,s,0), \
SKBT(t,s,0), SKBT(t,s,0) ) ) ); \
__m256i skbi6 = SKBI(k,s,6); \
w6 = _mm256_add_epi64( w6, _mm256_add_epi64( SKBI(k,s,6), \
_mm256_set_epi64x( SKBT(t,s,1), SKBT(t,s,1), \
SKBT(t,s,1), SKBT(t,s,1) ) ) ); \
@@ -458,20 +457,10 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
unsigned first;
DECL_STATE_BIG_4WAY
// len is the array size, of data, ie 64 bytes
// data points to start of 4 element buf
// ptr is a len offset in bytes
// buff is an array of 4 elements
// buff_size is size of one array element
// One element is 8 bytes (64 bits) scalar but 32 bytes (256 bits) 4way
// To index buf using ptr it has to be scaled 8 to 1. the amounrt of
// data to copy is 32 bytes per element instead of 8, or one m256
buf = sc->buf;
ptr = sc->ptr;
const int buf_size = 64; // 64 * _m256i
// 64 byte len, no part block
if ( len <= buf_size - ptr )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
@@ -481,8 +470,6 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
READ_STATE_BIG( sc );
first = ( bcount == 0 ) << 7;
// 64 byte len, only one block, no transform here.
// 80 byte len, transform first 64 bytes.
do {
size_t clen;
@@ -512,19 +499,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
__m256i *buf;
size_t ptr;
unsigned et;
int i;
DECL_STATE_BIG_4WAY
/*
* Add bit padding if necessary.
*/
// if (n != 0) {
// unsigned z;
// unsigned char x;
//
// z = 0x80 >> n;
// x = ((ub & -z) | z) & 0xFF;
// skein_big_core(sc, &x, 1);
// }
buf = sc->buf;
ptr = sc->ptr;
@@ -543,8 +518,6 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
* the encoding of "0", over 8 bytes, then padded with zeros).
*/
// 64 byte len, process only block
// 80 byte len, process last part block (16 bytes) padded.
READ_STATE_BIG(sc);
memset_zero_m256i( buf + (ptr>>3), (buf_size - ptr) >> 3 );
@@ -555,28 +528,6 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
bcount = 0;
UBI_BIG_4WAY( 510, 8 );
// for ( i = 0; i < 2; i ++ )
// {
// UBI_BIG_AVX2( et, ptr );
// if (i == 0)
// {
// memset_zero_m256i( buf, buf_size >> 3 );
// bcount = 0;
// et = 510;
// ptr = 8;
// }
// }
// Can LE be assumed? Should be ok SPH_LITTLE_ENDIAN is defined
/* _mm256_enc64le( buf, h0 );
_mm256_enc64le( buf + 32, h1 );
_mm256_enc64le( buf + 64, h2 );
_mm256_enc64le( buf + 96, h3 );
_mm256_enc64le( buf + 128, h4 );
_mm256_enc64le( buf + 160, h5 );
_mm256_enc64le( buf + 192, h6 );
_mm256_enc64le( buf + 224, h7 );
*/
buf[0] = h0;
buf[1] = h1;
buf[2] = h2;
@@ -587,7 +538,6 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
buf[7] = h7;
memcpy_m256i( dst, buf, out_len >> 3 );
// memcpy( dst, buf, out_len * 4 );
}
static const sph_u64 IV256[] = {

View File

@@ -1,10 +1,9 @@
#include "skein-gate.h"
#include "algo-gate-api.h"
#include "skein2-gate.h"
#include <string.h>
#include <stdint.h>
#include "skein-hash-4way.h"
#if defined(__AVX2__)
#if defined(SKEIN2_4WAY)
void skein2hash_4way( void *output, const void *input )
{
@@ -38,7 +37,7 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
int num_found = 0;
swab32_array( endiandata, pdata, 20 );
@@ -52,7 +51,6 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
do
{
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );

View File

@@ -1,9 +1,6 @@
#include "skein2-gate.h"
#include "algo-gate-api.h"
//#include <string.h>
#include <stdint.h>
#include "sph_skein.h"
//#include "skein-hash-avx2.h"
int64_t skein2_get_max64 ()
{

View File

@@ -1,8 +1,13 @@
#ifndef __SKEIN2GATE_H__
#define __SKEIN2_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#if defined(FOUR_WAY) && defined(__AVX2__)
#define SKEIN2_4WAY
#endif
#if defined(SKEIN2_4WAY)
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t* hashes_done );

View File

@@ -39,25 +39,6 @@
extern "C"{
#endif
void dump_sph_context( sph_u64 ptr, sph_u64 bcount, uint64_t* buf,
sph_u64 h0, sph_u64 h1, sph_u64 h2, sph_u64 h3, sph_u64 h4, sph_u64 h5,
sph_u64 h6, sph_u64 h7 )
{
//scalar
printf("sptr= %llu, bcount= %llu\n", ptr, bcount );
printf("sbuf: %016llx %016llx %016llx %016llx\n", *((uint64_t*)buf),
*((uint64_t*)buf+1), *((uint64_t*)buf+2), *((uint64_t*)buf+3) );
printf(" %016llx %016llx %016llx %016llx\n", *((uint64_t*)buf+4),
*((uint64_t*)buf+5), *((uint64_t*)buf+6), *((uint64_t*)buf+7) );
printf("sh:%016llx %016llx %016llx %016llx\n", h0, h1, h2, h3 );
printf(" %016llx %016llx %016llx %016llx\n", h4, h5, h6, h7 );
}
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN
#define SPH_SMALL_FOOTPRINT_SKEIN 1
#endif

162
algo/tribus/tribus-4way.c Normal file
View File

@@ -0,0 +1,162 @@
#include "tribus-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#if defined(__AVX2__) && !defined(NO_AES_NI)
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/echo/aes_ni/hash_api.h"
static __thread jh512_4way_context ctx_mid;
void tribus_hash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
jh512_4way_context ctx_jh;
keccak512_4way_context ctx_keccak;
hashState_echo ctx_echo;
memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
jh512_4way( &ctx_jh, input + (64<<2), 16 );
jh512_4way_close( &ctx_jh, vhash );
keccak512_4way_init( &ctx_keccak );
keccak512_4way( &ctx_keccak, vhash, 64 );
keccak512_4way_close( &ctx_keccak, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// hash echo serially
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx_echo, 512 );
update_final_echo( &ctx_echo, (BitSequence *) hash3,
(const BitSequence *) hash3, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_tribus_4way(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t n = pdata[19];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
// we need bigendian data...
for ( int i = 0; i < 20; i++ )
{
be32enc( &endiandata[i], pdata[i] );
}
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// precalc midstate
// doing it one way then then interleaving would be faster but too
// complicated tto interleave context.
jh512_4way_init( &ctx_mid );
jh512_4way( &ctx_mid, vdata, 64 );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
tribus_hash_4way( hash, vdata );
pdata[19] = n;
if ( ( !(hash[7] & mask) )
&& fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio(work, hash);
}
if ( ( !((hash+8)[7] & mask) )
&& fulltest (hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio(work, hash+8);
}
if ( ( !((hash+16)[7] & mask) )
&& fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio(work, hash+16);
}
if ( ( !((hash+24)[7] & mask) )
&& fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio(work, hash+24);
}
n += 4;
} while ( (num_found == 0) && ( n < max_nonce )
&& !work_restart[thr_id].restart);
break;
}
}
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

31
algo/tribus/tribus-gate.c Normal file
View File

@@ -0,0 +1,31 @@
#include "tribus-gate.h"
/*
bool tribus_thread_init()
{
sph_jh512_init( &tribus_ctx.jh );
sph_keccak512_init( &tribus_ctx.keccak );
#ifdef NO_AES_NI
sph_echo512_init( &tribus_ctx.echo );
#else
init_echo( &tribus_ctx.echo, 512 );
#endif
return true;
}
*/
bool register_tribus_algo( algo_gate_t* gate )
{
// gate->miner_thread_init = (void*)&tribus_thread_init;
gate->get_max64 = (void*)&get_max64_0x1ffff;
#if defined (TRIBUS_4WAY)
gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_tribus_4way;
gate->hash = (void*)&tribus_hash_4way;
#else
gate->miner_thread_init = (void*)&tribus_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_tribus;
gate->hash = (void*)&tribus_hash;
#endif
return true;
};

29
algo/tribus/tribus-gate.h Normal file
View File

@@ -0,0 +1,29 @@
#ifndef TRIBUS_GATE_H__
#define TRIBUS_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
#define TRIBUS_4WAY
#endif
#if defined(TRIBUS_4WAY)
void tribus_hash_4way( void *state, const void *input );
int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#else
void tribus_hash( void *state, const void *input );
int scanhash_tribus( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
bool tribus_thread_init();
#endif
#endif

View File

@@ -1,4 +1,4 @@
#include "algo-gate-api.h"
#include "tribus-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -25,6 +25,18 @@ typedef struct {
static __thread tribus_ctx_holder tribus_ctx;
bool tribus_thread_init()
{
sph_jh512_init( &tribus_ctx.jh );
sph_keccak512_init( &tribus_ctx.keccak );
#ifdef NO_AES_NI
sph_echo512_init( &tribus_ctx.echo );
#else
init_echo( &tribus_ctx.echo, 512 );
#endif
return true;
}
void tribus_hash(void *state, const void *input)
{
unsigned char hash[128] __attribute__ ((aligned (32)));
@@ -122,25 +134,4 @@ int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
return 0;
}
bool tribus_thread_init()
{
sph_jh512_init( &tribus_ctx.jh );
sph_keccak512_init( &tribus_ctx.keccak );
#ifdef NO_AES_NI
sph_echo512_init( &tribus_ctx.echo );
#else
init_echo( &tribus_ctx.echo, 512 );
#endif
return true;
}
bool register_tribus_algo( algo_gate_t* gate )
{
gate->miner_thread_init = (void*)&tribus_thread_init;
gate->optimizations = SSE2_OPT | AES_OPT;
gate->get_max64 = (void*)&get_max64_0x1ffff;
gate->scanhash = (void*)&scanhash_tribus;
gate->hash = (void*)&tribus_hash;
return true;
};

View File

@@ -0,0 +1,291 @@
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
/*
* This file contains some functions which implement the external data
* handling and padding for Merkle-Damgard hash functions which follow
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
*
* API: this file is meant to be included, not compiled as a stand-alone
* file. Some macros must be defined:
* RFUN name for the round function
* HASH "short name" for the hash function
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
* LE64 defined for little-endian, 64-bit based (no example yet)
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
* BLEN if defined, length of a message block (in bytes)
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
* SVAL if defined, reference to the context state information
*
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
* this is used for instance for Tiger, which works on 64-bit words but
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
* set, then only one word (64 bits) will be used to encode the input
* message length (in bits), otherwise two words will be used (as in
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
* not PLW1), four 64-bit words will be used to encode the message length
* (in bits). Note that regardless of those settings, only 64-bit message
* lengths are supported (in bits): messages longer than 2 Exabytes will be
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
* 2 millions Terabytes, which is huge).
*
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
* function. This is used for Tiger2, which is identical to Tiger except
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
* of the 0x01 from original Tiger).
*
* The RFUN function is invoked with two arguments, the first pointing to
* aligned data (as a "const void *"), the second being state information
* from the context structure. By default, this state information is the
* "val" field from the context, and this field is assumed to be an array
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
* from the context structure. The "val" field can have any type, except
* for the output encoding which assumes that it is an array of "sph_u32"
* values. By defining NO_OUTPUT, this last step is deactivated; the
* includer code is then responsible for writing out the hash result. When
* NO_OUTPUT is defined, the third parameter to the "close()" function is
* ignored.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
#undef SPH_BLEN
#undef SPH_WLEN
#if defined BE64 || defined LE64
#define SPH_BLEN 128U
#define SPH_WLEN 8U
#else
#define SPH_BLEN 64U
#define SPH_WLEN 4U
#endif
#ifdef BLEN
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif
#undef SPH_MAXPAD
#if defined PLW1
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined PLW4
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
#else
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
#endif
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifdef SVAL
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#else
#define SPH_VAL sc->val
#endif
#ifndef CLOSE_ONLY
#ifdef SPH_UPTR
static void
SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
#else
void
HASH ( void *cc, const void *data, size_t len )
#endif
{
SPH_XCAT( HASH, _context ) *sc;
__m256i *vdata = (__m256i*)data;
size_t ptr;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
while ( len > 0 )
{
size_t clen;
clen = SPH_BLEN - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( sc->buf + (ptr>>3), vdata, clen>>3 );
vdata = vdata + (clen>>3);
ptr += clen;
len -= clen;
if ( ptr == SPH_BLEN )
{
RFUN( sc->buf, SPH_VAL );
ptr = 0;
}
sc->count += clen;
}
}
#ifdef SPH_UPTR
void
HASH (void *cc, const void *data, size_t len)
{
SPH_XCAT(HASH, _context) *sc;
__m256i *vdata = (__m256i*)data;
unsigned ptr;
if ( len < (2 * SPH_BLEN) )
{
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
if ( ptr > 0 )
{
unsigned t;
t = SPH_BLEN - ptr;
SPH_XCAT( HASH, _short )( cc, data, t );
vdata = vdata + (t>>3);
len -= t;
}
SPH_XCAT( HASH, _short )( cc, data, len );
}
#endif
#endif
/*
* Perform padding and produce result. The context is NOT reinitialized
* by this function.
*/
static void
SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n,
void *dst, unsigned rnum )
{
SPH_XCAT(HASH, _context) *sc;
unsigned ptr, u;
sc = cc;
ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
uint64_t *b= (uint64_t*)sc->buf;
uint64_t *s= (uint64_t*)sc->state;
//printf("Vptr 1= %u\n", ptr);
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
#ifdef PW01
sc->buf[ptr>>3] = mm256_vec_epi64( 0x100 >> 8 );
// sc->buf[ptr++] = 0x100 >> 8;
#else
// need to overwrite exactly one byte
// sc->buf[ptr>>3] = _mm256_set_epi64x( 0, 0, 0, 0x80 );
sc->buf[ptr>>3] = mm256_vec_epi64( 0x80 );
// ptr++;
#endif
ptr += 8;
//printf("Vptr 2= %u\n", ptr);
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
if ( ptr > SPH_MAXPAD )
{
memset_zero_m256i( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
RFUN( sc->buf, SPH_VAL );
memset_zero_m256i( sc->buf, SPH_MAXPAD >> 3 );
}
else
{
memset_zero_m256i( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
}
#if defined BE64
#if defined PLW1
sc->buf[ SPH_MAXPAD>>3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
#elif defined PLW4
memset_zero_m256i( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
#else
sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
#endif // PLW
#else // LE64
#if defined PLW1
sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
#elif defined PLW4
sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_vec_epi64( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
mm256_vec_epi64( c->count >> 61 );
memset_zero_m256i( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
2 * SPH_WLEN );
#else
sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
mm256_vec_epi64( sc->count >> 61 );
#endif // PLW
#endif // LE64
//printf("Vptr 3= %u\n", ptr);
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[0], b[4], b[8], b[12] );
//printf("VBuf %016llx %016llx %016llx %016llx\n", b[16], b[20], b[24], b[28] );
RFUN( sc->buf, SPH_VAL );
//printf("Vptr after= %u\n", ptr);
//printf("VState %016llx %016llx %016llx %016llx\n", s[0], s[4], s[8], s[12] );
//printf("VState %016llx %016llx %016llx %016llx\n", s[16], s[20], s[24], s[28] );
#ifdef SPH_NO_OUTPUT
(void)dst;
(void)rnum;
(void)u;
#else
for ( u = 0; u < rnum; u ++ )
{
#if defined BE64
((__m256i*)dst)[u] = mm256_byteswap_epi64( sc->val[u] );
#else // LE64
((__m256i*)dst)[u] = sc->val[u];
#endif
}
#endif
}
static void
SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
{
SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
}

369
algo/whirlpool/md_helper.c Normal file
View File

@@ -0,0 +1,369 @@
/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
/*
* This file contains some functions which implement the external data
* handling and padding for Merkle-Damgard hash functions which follow
* the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
*
* API: this file is meant to be included, not compiled as a stand-alone
* file. Some macros must be defined:
* RFUN name for the round function
* HASH "short name" for the hash function
* BE32 defined for big-endian, 32-bit based (e.g. SHA-1)
* LE32 defined for little-endian, 32-bit based (e.g. MD5)
* BE64 defined for big-endian, 64-bit based (e.g. SHA-512)
* LE64 defined for little-endian, 64-bit based (no example yet)
* PW01 if defined, append 0x01 instead of 0x80 (for Tiger)
* BLEN if defined, length of a message block (in bytes)
* PLW1 if defined, length is defined on one 64-bit word only (for Tiger)
* PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL)
* SVAL if defined, reference to the context state information
*
* BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
* this is used for instance for Tiger, which works on 64-bit words but
* uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
* ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
* set, then only one word (64 bits) will be used to encode the input
* message length (in bits), otherwise two words will be used (as in
* SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
* not PLW1), four 64-bit words will be used to encode the message length
* (in bits). Note that regardless of those settings, only 64-bit message
* lengths are supported (in bits): messages longer than 2 Exabytes will be
* improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
* 2 millions Terabytes, which is huge).
*
* If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
* function. This is used for Tiger2, which is identical to Tiger except
* when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
* of the 0x01 from original Tiger).
*
* The RFUN function is invoked with two arguments, the first pointing to
* aligned data (as a "const void *"), the second being state information
* from the context structure. By default, this state information is the
* "val" field from the context, and this field is assumed to be an array
* of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
* from the context structure. The "val" field can have any type, except
* for the output encoding which assumes that it is an array of "sph_u32"
* values. By defining NO_OUTPUT, this last step is deactivated; the
* includer code is then responsible for writing out the hash result. When
* NO_OUTPUT is defined, the third parameter to the "close()" function is
* ignored.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
#undef SPH_XCAT
#define SPH_XCAT(a, b) SPH_XCAT_(a, b)
#undef SPH_XCAT_
#define SPH_XCAT_(a, b) a ## b
#undef SPH_BLEN
#undef SPH_WLEN
#if defined BE64 || defined LE64
#define SPH_BLEN 128U
#define SPH_WLEN 8U
#else
#define SPH_BLEN 64U
#define SPH_WLEN 4U
#endif
#ifdef BLEN
#undef SPH_BLEN
#define SPH_BLEN BLEN
#endif
#undef SPH_MAXPAD
#if defined PLW1
#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN)
#elif defined PLW4
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2))
#else
#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1))
#endif
#undef SPH_VAL
#undef SPH_NO_OUTPUT
#ifdef SVAL
#define SPH_VAL SVAL
#define SPH_NO_OUTPUT 1
#else
#define SPH_VAL sc->val
#endif
#ifndef CLOSE_ONLY
#ifdef SPH_UPTR
static void
SPH_XCAT(HASH, _short)(void *cc, const void *data, size_t len)
#else
void
SPH_XCAT(sph_, HASH)(void *cc, const void *data, size_t len)
#endif
{
SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
size_t current;
sc = cc;
#if SPH_64
current = (unsigned)sc->count & (SPH_BLEN - 1U);
#else
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif
while (len > 0) {
size_t clen;
#if !SPH_64
sph_u32 clow, clow2;
#endif
clen = SPH_BLEN - current;
if (clen > len)
clen = len;
memcpy(sc->buf + current, data, clen);
data = (const unsigned char *)data + clen;
current += clen;
len -= clen;
if (current == SPH_BLEN) {
RFUN(sc->buf, SPH_VAL);
current = 0;
}
#if SPH_64
sc->count += clen;
#else
clow = sc->count_low;
clow2 = SPH_T32(clow + clen);
sc->count_low = clow2;
if (clow2 < clow)
sc->count_high ++;
#endif
}
}
#ifdef SPH_UPTR
void
SPH_XCAT(sph_, HASH)(void *cc, const void *data, size_t len)
{
SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
unsigned current;
size_t orig_len;
#if !SPH_64
sph_u32 clow, clow2;
#endif
if (len < (2 * SPH_BLEN)) {
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
sc = cc;
#if SPH_64
current = (unsigned)sc->count & (SPH_BLEN - 1U);
#else
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif
if (current > 0) {
unsigned t;
t = SPH_BLEN - current;
SPH_XCAT(HASH, _short)(cc, data, t);
data = (const unsigned char *)data + t;
len -= t;
}
#if !SPH_UNALIGNED
if (((SPH_UPTR)data & (SPH_WLEN - 1U)) != 0) {
SPH_XCAT(HASH, _short)(cc, data, len);
return;
}
#endif
orig_len = len;
while (len >= SPH_BLEN) {
RFUN(data, SPH_VAL);
len -= SPH_BLEN;
data = (const unsigned char *)data + SPH_BLEN;
}
if (len > 0)
memcpy(sc->buf, data, len);
#if SPH_64
sc->count += (sph_u64)orig_len;
#else
clow = sc->count_low;
clow2 = SPH_T32(clow + orig_len);
sc->count_low = clow2;
if (clow2 < clow)
sc->count_high ++;
/*
* This code handles the improbable situation where "size_t" is
* greater than 32 bits, and yet we do not have a 64-bit type.
*/
orig_len >>= 12;
orig_len >>= 10;
orig_len >>= 10;
sc->count_high += orig_len;
#endif
}
#endif
#endif
/*
* Perform padding and produce result. The context is NOT reinitialized
* by this function.
*/
static void
SPH_XCAT(HASH, _addbits_and_close)(void *cc,
unsigned ub, unsigned n, void *dst, unsigned rnum)
{
SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
unsigned current, u;
#if !SPH_64
sph_u32 low, high;
#endif
sc = cc;
#if SPH_64
current = (unsigned)sc->count & (SPH_BLEN - 1U);
#else
current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
#endif
uint64_t *b= (uint64_t*)sc->buf;
uint64_t *s= (uint64_t*)sc->state;
// printf("Sptr 1= %u\n",current);
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
#ifdef PW01
sc->buf[current ++] = (0x100 | (ub & 0xFF)) >> (8 - n);
#else
{
unsigned z;
z = 0x80 >> n;
sc->buf[current ++] = ((ub & -z) | z) & 0xFF;
}
#endif
// printf("Sptr 2= %u\n",current);
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
if (current > SPH_MAXPAD) {
memset(sc->buf + current, 0, SPH_BLEN - current);
RFUN(sc->buf, SPH_VAL);
memset(sc->buf, 0, SPH_MAXPAD);
} else {
memset(sc->buf + current, 0, SPH_MAXPAD - current);
}
#if defined BE64
#if defined PLW1
sph_enc64be_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
#elif defined PLW4
memset(sc->buf + SPH_MAXPAD, 0, 2 * SPH_WLEN);
sph_enc64be_aligned(sc->buf + SPH_MAXPAD + 2 * SPH_WLEN,
sc->count >> 61);
sph_enc64be_aligned(sc->buf + SPH_MAXPAD + 3 * SPH_WLEN,
SPH_T64(sc->count << 3) + (sph_u64)n);
#else
sph_enc64be_aligned(sc->buf + SPH_MAXPAD, sc->count >> 61);
sph_enc64be_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN,
SPH_T64(sc->count << 3) + (sph_u64)n);
#endif
#elif defined LE64
#if defined PLW1
sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
// BUG!! should be PLW4
#elif defined PLW1
sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
sph_enc64le_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, sc->count >> 61);
memset(sc->buf + SPH_MAXPAD + 2 * SPH_WLEN, 0, 2 * SPH_WLEN);
#else
sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
sph_enc64le_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, sc->count >> 61);
#endif
#else
#if SPH_64
#ifdef BE32
sph_enc64be_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
#else
sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
SPH_T64(sc->count << 3) + (sph_u64)n);
#endif
#else
low = sc->count_low;
high = SPH_T32((sc->count_high << 3) | (low >> 29));
low = SPH_T32(low << 3) + (sph_u32)n;
#ifdef BE32
sph_enc32be(sc->buf + SPH_MAXPAD, high);
sph_enc32be(sc->buf + SPH_MAXPAD + SPH_WLEN, low);
#else
sph_enc32le(sc->buf + SPH_MAXPAD, low);
sph_enc32le(sc->buf + SPH_MAXPAD + SPH_WLEN, high);
#endif
#endif
#endif
// printf("Sptr 3= %u\n",current);
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
RFUN(sc->buf, SPH_VAL);
// printf("Sptr after= %u\n",current);
// printf("SState %016llx %016llx %016llx %016llx\n", s[0], s[1], s[2], s[3] );
// printf("SState %016llx %016llx %016llx %016llx\n", s[4], s[5], s[6], s[7] );
#ifdef SPH_NO_OUTPUT
(void)dst;
(void)rnum;
(void)u;
#else
for (u = 0; u < rnum; u ++) {
#if defined BE64
sph_enc64be((unsigned char *)dst + 8 * u, sc->val[u]);
#elif defined LE64
sph_enc64le((unsigned char *)dst + 8 * u, sc->val[u]);
#elif defined BE32
sph_enc32be((unsigned char *)dst + 4 * u, sc->val[u]);
#else
sph_enc32le((unsigned char *)dst + 4 * u, sc->val[u]);
#endif
}
#endif
}
static void
SPH_XCAT(HASH, _close)(void *cc, void *dst, unsigned rnum)
{
SPH_XCAT(HASH, _addbits_and_close)(cc, 0, 0, dst, rnum);
}

View File

@@ -3441,19 +3441,19 @@ ROUND_FUN(whirlpool1, old1)
#define RFUN whirlpool_round
#define HASH whirlpool
#include "algo/sha/md_helper.c"
#include "md_helper.c"
#undef RFUN
#undef HASH
#define RFUN whirlpool0_round
#define HASH whirlpool0
#include "algo/sha/md_helper.c"
#include "md_helper.c"
#undef RFUN
#undef HASH
#define RFUN whirlpool1_round
#define HASH whirlpool1
#include "algo/sha/md_helper.c"
#include "md_helper.c"
#undef RFUN
#undef HASH
@@ -3463,7 +3463,6 @@ sph_ ## name ## _close(void *cc, void *dst) \
{ \
sph_ ## name ## _context *sc; \
int i; \
\
name ## _close(cc, dst, 0); \
sc = cc; \
for (i = 0; i < 8; i ++) \

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,209 @@
/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
/**
* WHIRLPOOL interface.
*
* WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
* version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
* (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
* version, 2003, with a new diffusion matrix, also described as "plain
* WHIRLPOOL"). All three variants are implemented here.
*
* The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
* M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
* NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
*
* The current WHIRLPOOL specification and a reference implementation
* can be found on the WHIRLPOOL web page:
* http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_whirlpool.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_WHIRLPOOL_H__
#define SPH_WHIRLPOOL_H__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#if SPH_64
/**
* Output size (in bits) for WHIRLPOOL.
*/
#define SPH_SIZE_whirlpool 512
/**
* Output size (in bits) for WHIRLPOOL-0.
*/
#define SPH_SIZE_whirlpool0 512
/**
* Output size (in bits) for WHIRLPOOL-1.
*/
#define SPH_SIZE_whirlpool1 512
/**
* This structure is a context for WHIRLPOOL computations: it contains the
* intermediate values and some data from the last entered block. Once
* a WHIRLPOOL computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running WHIRLPOOL computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
unsigned char buf[64]; /* first field, for alignment */
sph_u64 state[8];
#if SPH_64
sph_u64 count;
#else
sph_u32 count_high, count_low;
#endif
#endif
} sph_whirlpool_context;
/**
* Initialize a WHIRLPOOL context. This process performs no memory allocation.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool_context</code>)
*/
void sph_whirlpool_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* plain WHIRLPOOL algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL context
* @param dst the destination buffer
*/
void sph_whirlpool_close(void *cc, void *dst);
/**
* WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL.
*/
typedef sph_whirlpool_context sph_whirlpool0_context;
#ifdef DOXYGEN_IGNORE
/**
* Initialize a WHIRLPOOL-0 context. This function is identical to
* <code>sph_whirlpool_init()</code>.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool0_context</code>)
*/
void sph_whirlpool0_init(void *cc);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_whirlpool0_init sph_whirlpool_init
#endif
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* WHIRLPOOL-0 algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool0(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL-0 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL-0 context
* @param dst the destination buffer
*/
void sph_whirlpool0_close(void *cc, void *dst);
/**
* WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL.
*/
typedef sph_whirlpool_context sph_whirlpool1_context;
#ifdef DOXYGEN_IGNORE
/**
* Initialize a WHIRLPOOL-1 context. This function is identical to
* <code>sph_whirlpool_init()</code>.
*
* @param cc the WHIRLPOOL context (pointer to a
* <code>sph_whirlpool1_context</code>)
*/
void sph_whirlpool1_init(void *cc);
#endif
#ifndef DOXYGEN_IGNORE
#define sph_whirlpool1_init sph_whirlpool_init
#endif
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing). This function applies the
* WHIRLPOOL-1 algorithm.
*
* @param cc the WHIRLPOOL context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_whirlpool1(void *cc, const void *data, size_t len);
/**
* Terminate the current WHIRLPOOL-1 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
* accomodate the result (64 bytes). The context is automatically
* reinitialized.
*
* @param cc the WHIRLPOOL-1 context
* @param dst the destination buffer
*/
void sph_whirlpool1_close(void *cc, void *dst);
#endif
#endif

View File

@@ -0,0 +1,131 @@
#include "whirlpool-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sph_whirlpool.h"
#include "whirlpool-hash-4way.h"
#if defined(__AVX2__)
static __thread whirlpool_4way_context whirl_mid;
void whirlpool_hash_4way( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
const int midlen = 64;
const int tail = 80 - midlen;
whirlpool_4way_context ctx;
memcpy( &ctx, &whirl_mid, sizeof whirl_mid );
whirlpool1_4way( &ctx, input + (midlen<<2), tail );
whirlpool1_4way_close( &ctx, vhash);
// whirlpool1_4way_init( &ctx );
// whirlpool1_4way( &ctx, input, 80 );
// whirlpool1_4way_close( &ctx, vhash);
whirlpool1_4way_init( &ctx );
whirlpool1_4way( &ctx, vhash, 64 );
whirlpool1_4way_close( &ctx, vhash);
whirlpool1_4way_init( &ctx );
whirlpool1_4way( &ctx, vhash, 64 );
whirlpool1_4way_close( &ctx, vhash);
whirlpool1_4way_init( &ctx );
whirlpool1_4way( &ctx, vhash, 64 );
whirlpool1_4way_close( &ctx, vhash);
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state , hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
unsigned long *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(128) endiandata[20];
uint32_t* pdata = work->data;
uint32_t* ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
// if (opt_benchmark)
// ((uint32_t*)ptarget)[7] = 0x0000ff;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
// midstate
whirlpool1_4way_init( &whirl_mid );
whirlpool1_4way( &whirl_mid, vdata, 64 );
do {
const uint32_t Htarg = ptarget[7];
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
whirlpool_hash_4way( hash, vdata );
pdata[19] = n;
if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio(work, hash);
}
if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -0,0 +1,15 @@
#include "whirlpool-gate.h"
bool register_whirlpool_algo( algo_gate_t* gate )
{
//#if defined (WHIRLPOOL_4WAY)
// gate->scanhash = (void*)&scanhash_whirlpool_4way;
// gate->hash = (void*)&whirlpool_hash_4way;
//#else
gate->scanhash = (void*)&scanhash_whirlpool;
gate->hash = (void*)&whirlpool_hash;
init_whirlpool_ctx();
//#endif
return true;
};

View File

@@ -0,0 +1,24 @@
#ifndef WHIRLPOOL_GATE_H__
#define WHIRLPOOL_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(FOUR_WAY) && defined(__AVX2__)
#define WHIRLPOOL_4WAY
#endif
//#if defined (WHIRLPOOL_4WAY)
//void whirlpool_hash_4way(void *state, const void *input);
//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
//#endif
void whirlpool_hash( void *state, const void *input );
int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,108 @@
/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
/**
* WHIRLPOOL interface.
*
* WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
* version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
* (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
* version, 2003, with a new diffusion matrix, also described as "plain
* WHIRLPOOL"). All three variants are implemented here.
*
* The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
* M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
* NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
*
* The current WHIRLPOOL specification and a reference implementation
* can be found on the WHIRLPOOL web page:
* http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_whirlpool.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef WHIRLPOOL_HASH_4WAY_H__
#define WHIRLPOOL_HASH_4WAY_H__
#ifdef __AVX2__
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for WHIRLPOOL.
*/
#define SPH_SIZE_whirlpool 512
/**
* Output size (in bits) for WHIRLPOOL-0.
*/
#define SPH_SIZE_whirlpool0 512
/**
* Output size (in bits) for WHIRLPOOL-1.
*/
#define SPH_SIZE_whirlpool1 512
typedef struct {
__m256i buf[8] __attribute__ ((aligned (64)));
__m256i state[8];
sph_u64 count;
} whirlpool_4way_context;
void whirlpool_4way_init( void *cc );
void whirlpool_4way( void *cc, const void *data, size_t len );
void whirlpool_4way_close( void *cc, void *dst );
/**
* WHIRLPOOL-0 uses the same structure than plain WHIRLPOOL.
*/
typedef whirlpool_4way_context whirlpool0_4way_context;
#define whirlpool0_4way_init whirlpool_4way_init
void whirlpool0_4way( void *cc, const void *data, size_t len );
void whirlpool0_4way_close( void *cc, void *dst );
/**
* WHIRLPOOL-1 uses the same structure than plain WHIRLPOOL.
*/
typedef whirlpool_4way_context whirlpool1_4way_context;
#define whirlpool1_4way_init whirlpool_4way_init
void whirlpool1_4way(void *cc, const void *data, size_t len);
void whirlpool1_4way_close(void *cc, void *dst);
#endif
#endif

View File

@@ -1,5 +1,4 @@
#include "algo-gate-api.h"
#include "whirlpool-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
@@ -58,7 +57,8 @@ void whirlpool_midstate( const void* input )
}
int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
int scanhash_whirlpool( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) endiandata[20];
uint32_t* pdata = work->data;
@@ -66,8 +66,8 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce - 1;
// if (opt_benchmark)
// ((uint32_t*)ptarget)[7] = 0x0000ff;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
@@ -83,7 +83,7 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
// work_set_target_ratio(work, vhash);
work_set_target_ratio(work, vhash);
*hashes_done = n - first_nonce + 1;
return true;
}
@@ -95,11 +95,3 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
return 0;
}
bool register_whirlpool_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_whirlpool;
gate->hash = (void*)&whirlpool_hash;
init_whirlpool_ctx();
return true;
};

View File

@@ -24,7 +24,8 @@ void whirlpoolx_hash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_whirlpoolx(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
int scanhash_whirlpoolx( int thr_id, struct work* work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t _ALIGN(128) endiandata[20];
uint32_t* pdata = work->data;
@@ -32,8 +33,8 @@ int scanhash_whirlpoolx(int thr_id, struct work* work, uint32_t max_nonce, unsig
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce - 1;
// if (opt_benchmark)
// ((uint32_t*)ptarget)[7] = 0x0000ff;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
@@ -47,7 +48,7 @@ int scanhash_whirlpoolx(int thr_id, struct work* work, uint32_t max_nonce, unsig
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
// work_set_target_ratio(work, vhash);
work_set_target_ratio(work, vhash);
*hashes_done = n - first_nonce + 1;
return true;
}

View File

@@ -1,3 +1,6 @@
#ifndef AVXDEFS_H__
#define AVXDEFS_H__
// Some tools to help using AVX and AVX2
// AVX support is required to include this header file, AVX2 optional.
@@ -39,7 +42,23 @@ uint8_t v8 [16];
#if defined (__AVX2__)
// AVX2 replacements for vectorized data
// AVX2 implementations of
// vector versions of common scalar functions
// vector handling, indexing, pointer arithmetic
// vector scalar conversion
// overlay (union) handling for all integer data types
// vectorize 64 bit data by replication.
// don't need to repeat the val 4 times.
inline __m256i mm256_vec_epi64( uint64_t val )
{
return _mm256_set_epi64x( val, val, val, val );
}
inline __m256i mm256_vec_epi32( uint32_t val )
{
return _mm256_set_epi32( val, val, val, val, val, val, val, val );
}
// n = number of __m256i (32 bytes)
inline void memset_zero_m256i( __m256i *dst, int n )
@@ -282,8 +301,8 @@ inline __m256i mm256_byteswap_epi32( __m256i x )
_mm256_set_epi32( 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000,
0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000 ) );
__m256i x0 = _mm256_slli_epi32( x, 24 ); // x0 = x << 24
x1 = _mm256_slli_epi32( x1, 8 ); // x1 = mask(x) << 8
x2 = _mm256_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8
x1 = _mm256_slli_epi32( x1, 8 ); // x1 = mask1(x) << 8
x2 = _mm256_srli_epi32( x2, 8 ); // x2 = mask2(x) >> 8
__m256i x3 = _mm256_srli_epi32( x, 24 ); // x3 = x >> 24
return _mm256_or_si256( _mm256_or_si256( x0, x1 ),
_mm256_or_si256( x2, x3 ) );
@@ -318,6 +337,11 @@ inline __m256i mm256_byteswap_epi64( __m256i x )
return x;
}
// vectorized version of ~ operator
#define mm256_bitnot( x ) \
_mm256_xor_si256( (x), _mm256_set_epi64x( 0xffffffffffffffff, \
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff ) )
#endif // AVX2
// AVX replacements for vectorized data
@@ -614,12 +638,65 @@ inline void m256_deinterleave_4x64( uint64_t *dst0, uint64_t *dst1,
for ( int i = 0; i < bit_len>>6; i++, s += 4 )
{
*(dst0+i) = *s;
*(dst1+i) = *(s+1);
*(dst2+i) = *(s+2);
*(dst3+i) = *(s+3);
*(dst1+i) = *(s+1);
*(dst2+i) = *(s+2);
*(dst3+i) = *(s+3);
}
}
// optimized versions using AVX2 code
// only 512 bit and 640 bit length is supported
// all data must be aligned to256 bits
// 640 bit data needs padding for overrun to 768
// no looping
inline void m256_interleave_4x64x( uint64_t *dst, uint64_t *src0,
uint64_t *src1, uint64_t *src2, uint64_t *src3, int bit_len )
{
__m256i* d = (__m256i*)dst;
d[0] = _mm256_set_epi64x( src3[0], src2[0], src1[0], src0[0] );
d[1] = _mm256_set_epi64x( src3[1], src2[1], src1[1], src0[1] );
d[2] = _mm256_set_epi64x( src3[2], src2[2], src1[2], src0[2] );
d[3] = _mm256_set_epi64x( src3[3], src2[3], src1[3], src0[3] );
d[4] = _mm256_set_epi64x( src3[4], src2[4], src1[4], src0[4] );
d[5] = _mm256_set_epi64x( src3[5], src2[5], src1[5], src0[5] );
d[6] = _mm256_set_epi64x( src3[6], src2[6], src1[6], src0[6] );
d[7] = _mm256_set_epi64x( src3[7], src2[7], src1[7], src0[7] );
if ( bit_len == 512 ) return;
d[8] = _mm256_set_epi64x( src3[8], src2[8], src1[8], src0[8] );
d[9] = _mm256_set_epi64x( src3[9], src2[9], src1[9], src0[9] );
}
inline void m256_deinterleave_4x64x( uint64_t *dst0, uint64_t *dst1,
uint64_t *dst2, uint64_t *dst3, uint64_t *src, int bit_len )
{
__m256i* d0 = (__m256i*)dst0;
__m256i* d1 = (__m256i*)dst1;
__m256i* d2 = (__m256i*)dst2;
__m256i* d3 = (__m256i*)dst3;
d0[0] = _mm256_set_epi64x( src[12], src[ 8], src[ 4], src[ 0] );
d1[0] = _mm256_set_epi64x( src[13], src[ 9], src[ 5], src[ 1] );
d2[0] = _mm256_set_epi64x( src[14], src[10], src[ 6], src[ 2] );
d3[0] = _mm256_set_epi64x( src[15], src[11], src[ 7], src[ 3] );
d0[1] = _mm256_set_epi64x( src[28], src[24], src[20], src[16] );
d1[1] = _mm256_set_epi64x( src[29], src[25], src[21], src[17] );
d2[1] = _mm256_set_epi64x( src[30], src[26], src[22], src[18] );
d3[1] = _mm256_set_epi64x( src[31], src[27], src[23], src[19] );
if ( bit_len == 512 ) return;
// null change to overrun area
d0[2] = _mm256_set_epi64x( dst0[44], dst0[40], src[36], src[32] );
d1[2] = _mm256_set_epi64x( dst1[45], dst1[41], src[37], src[33] );
d2[2] = _mm256_set_epi64x( dst2[46], dst2[42], src[38], src[34] );
d3[2] = _mm256_set_epi64x( dst3[47], dst3[43], src[39], src[35] );
}
// interleave 8 arrays of 32 bit elements for AVX2 processing
// bit_len must be multiple of 32
inline void m256_interleave_8x32( uint32_t *dst, uint32_t *src0,
@@ -718,7 +795,8 @@ inline void m128_interleave_4x32( uint32_t *dst, uint32_t *src0,
// deinterleave 4 arrays into individual buffers for scalarm processing
// bit_len must be multiple of 32
inline void m128_deinterleave_4x32( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *src, int bit_len )
uint32_t *dst2,uint32_t *dst3, uint32_t *src,
int bit_len )
{
uint32_t *s = src;
for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
@@ -730,4 +808,5 @@ inline void m128_deinterleave_4x32( uint32_t *dst0, uint32_t *dst1,
}
}
#endif

View File

@@ -18,7 +18,8 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4

View File

@@ -18,7 +18,8 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.3.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.4.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.3'
PACKAGE_STRING='cpuminer-opt 3.7.3'
PACKAGE_VERSION='3.7.4'
PACKAGE_STRING='cpuminer-opt 3.7.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.3 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.7.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.3:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.7.4:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.7.3
cpuminer-opt configure 3.7.4
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.3, which was
It was created by cpuminer-opt $as_me 3.7.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.7.3'
VERSION='3.7.4'
cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.7.3, which was
This file was extended by cpuminer-opt $as_me 3.7.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.7.3
cpuminer-opt config.status 3.7.4
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.3])
AC_INIT([cpuminer-opt], [3.7.4])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1815,14 +1815,30 @@ static void *miner_thread( void *userdata )
{
*algo_gate.get_nonceptr( work.data ) = work.nonces[n];
if ( !submit_work(mythr, &work) )
{
applog(LOG_WARNING, "Failed to submit share." );
break;
}
num_submitted++;
}
// must be a ine way algo, nonce is already in work data
#if FOUR_WAY
if (num_submitted>1)
applog(LOG_NOTICE, "4 WAY hash, %u nonces submitted," CL_MAG " BONUS!" CL_WHT, num_submitted);
else
applog(LOG_NOTICE, "4 WAY hash %u nonce submitted", num_submitted);
#endif
// must be a one way algo, nonce is already in work data
if ( !num_submitted )
{
if ( !submit_work(mythr, &work) )
{
applog(LOG_WARNING, "Failed to submir share.");
break;
}
#if FOUR_WAY
applog(LOG_NOTICE, "1 WAY hash 1 nonce submitted");
#endif
}
// prevent stale work in solo
@@ -1836,8 +1852,6 @@ static void *miner_thread( void *userdata )
}
}
// display hashrate
if (!opt_quiet)
{
char hc[16];
@@ -1846,7 +1860,6 @@ static void *miner_thread( void *userdata )
char hr_units[2] = {0,0};
double hashcount = thr_hashcount[thr_id];
double hashrate = thr_hashrates[thr_id];
//printf("display count= %.3f, tcount= %.3f, rate= %03f trate= %03f\n", hashcount, thr_hashcount[thr_id], hashrate,thr_hashrates[thr_id] );
if ( hashcount )
{
scale_hash_for_display( &hashcount, hc_units );

View File

@@ -673,19 +673,19 @@ Options:\n\
argon2\n\
axiom Shabal-256 MemoHash\n\
bastion\n\
blake Blake-256 (SFR)\n\
blake blake256r14 (SFR)\n\
blakecoin blake256r8\n\
blake2s Blake-2 S\n\
bmw BMW 256\n\
c11 Chaincoin\n\
cryptolight Cryptonight-light\n\
cryptonight cryptonote, Monero (XMR)\n\
decred Blake256r8dcr\n\
decred Blake256r14dcr\n\
deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\
drop Dropcoin\n\
fresh Fresh\n\
groestl dmd-gr, Groestl coin\n\
groestl Groestl coin\n\
heavy Heavy\n\
hmq1725 Espers\n\
hodl Hodlcoin\n\

View File

@@ -3,7 +3,15 @@
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA -DFOUR_WAY" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-4way.exe
make clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe
@@ -11,7 +19,7 @@ mv cpuminer.exe cpuminer-aes-avx2.exe
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe
@@ -19,7 +27,7 @@ mv cpuminer.exe cpuminer-aes-avx.exe
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe
@@ -27,7 +35,7 @@ mv cpuminer.exe cpuminer-aes-sse42.exe
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse42.exe
@@ -35,7 +43,7 @@ mv cpuminer.exe cpuminer-sse42.exe
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse2.exe

View File

@@ -18,7 +18,8 @@ rm -f config.status
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4