This commit is contained in:
Jay D Dee
2018-02-23 12:39:15 -05:00
parent 502ed0b1fe
commit 3c02653dbe
70 changed files with 3871 additions and 1848 deletions

View File

@@ -1,19 +1,18 @@
#include "blake-gate.h"
#if defined (BLAKE_4WAY)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
blake256r14_4way_context blake_ctx;
#if defined (BLAKE_4WAY)
blake256r14_4way_context blake_4w_ctx;
void blakehash_4way(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r14_4way_context ctx;
memcpy( &ctx, &blake_ctx, sizeof ctx );
memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
blake256r14_4way( &ctx, input + (64<<2), 16 );
blake256r14_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
@@ -31,7 +30,6 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
if (opt_benchmark)
@@ -39,15 +37,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r14_4way_init( &blake_ctx );
blake256r14_4way( &blake_ctx, vdata, 64 );
blake256r14_4way_init( &blake_4w_ctx );
blake256r14_4way( &blake_4w_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
@@ -55,34 +50,11 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
blakehash_4way( hash, vdata );
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
@@ -95,3 +67,77 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
#endif
#if defined(BLAKE_8WAY)
blake256r14_8way_context blake_8w_ctx;
void blakehash_8way( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r14_8way_context ctx;
memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
blake256r14_8way( &ctx, input + (64<<3), 16 );
blake256r14_8way_close( &ctx, vhash );
mm256_deinterleave_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
}
int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
if (opt_benchmark)
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake256r14_8way_init( &blake_8w_ctx );
blake256r14_8way( &blake_8w_ctx, vdata, 64 );
uint32_t *noncep = vdata + 152; // 19*8
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
be32enc( noncep +4, n+4 );
be32enc( noncep +5, n+5 );
be32enc( noncep +6, n+6 );
be32enc( noncep +7, n+7 );
pdata[19] = n;
blakehash_8way( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( (hash+i)[7] <= HTarget && fulltest( hash+i, ptarget ) )
{
found[i] = true;
num_found++;
nonces[i] = n+i;
work_set_target_ratio( work, hash+1 );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

View File

@@ -58,6 +58,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
// Blake-256
static const sph_u32 IV256[8] = {
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
@@ -67,6 +69,8 @@ static const sph_u32 IV256[8] = {
#if defined (__AVX2__)
// Blake-512
static const sph_u64 IV512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
@@ -78,7 +82,7 @@ static const sph_u64 IV512[8] = {
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
// Blake-256 4 & 8 way, Blake-512 4way
// Blake-256 4 & 8 way, Blake-512 4 way
static const unsigned sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -371,6 +375,8 @@ do { \
#if SPH_COMPACT_BLAKE_32
// Blake-256 4 way
#define ROUND_S_4WAY(r) do { \
GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
@@ -407,7 +413,7 @@ do { \
#if defined (__AVX2__)
// BLAKE256 8 WAY
// Blake-256 8 way
#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
do { \
@@ -487,6 +493,8 @@ do { \
#endif
// Blake-256 4 way
#define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
__m128i S0, S1, S2, S3; \
@@ -527,6 +535,7 @@ do { \
} while (0)
#if SPH_COMPACT_BLAKE_32
// not used
#define COMPRESS32_4WAY( rounds ) do { \
__m128i M[16]; \
@@ -778,7 +787,6 @@ do { \
S3 ), H7 ); \
} while (0)
// Blake-512 4 way
#define DECL_STATE64_4WAY \
@@ -967,6 +975,8 @@ do { \
#endif
// Blake-256 4 way
static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };
static void
@@ -988,52 +998,51 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
{
__m128i *vdata = (__m128i*)data;
__m128i *buf;
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_4WAY
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_4WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
{
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
}
READ_STATE32_4WAY(sc);
while ( len > 0 )
{
size_t clen;
READ_STATE32_4WAY(sc);
while ( len > 0 )
{
size_t clen;
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_4WAY(sc);
sc->ptr = ptr;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
COMPRESS32_4WAY( sc->rounds );
ptr = 0;
}
}
WRITE_STATE32_4WAY(sc);
sc->ptr = ptr;
}
static void
blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
union {
// union {
__m128i buf[16];
sph_u32 dummy;
} u;
// sph_u32 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
sph_u32 th, tl;
@@ -1041,7 +1050,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
buf[ptr>>2] = _mm_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -1060,26 +1069,26 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if ( ptr <= 52 )
{
memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
memset_zero_128( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
buf[52>>2] = _mm_or_si128( buf[52>>2],
_mm_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
*(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
memset_zero_128( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_128( u.buf, 56>>2 );
memset_zero_128( buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, u.buf, 64 );
buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
*(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
*(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
blake32_4way( sc, buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
@@ -1114,7 +1123,6 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
size_t ptr;
const int buf_size = 64; // number of elements, sizeof/4
DECL_STATE32_8WAY
buf = sc->buf;
ptr = sc->ptr;
if ( len < buf_size - ptr )
@@ -1153,10 +1161,10 @@ static void
blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_size_w32 )
{
union {
// union {
__m256i buf[16];
sph_u32 dummy;
} u;
// sph_u32 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
sph_u32 th, tl;
@@ -1164,7 +1172,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
u.buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
@@ -1183,26 +1191,26 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if ( ptr <= 52 )
{
memset_zero_256( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_or_si256( buf[52>>2],
_mm256_set1_epi32( 0x01000000UL ) );
*(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_256( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
memset_zero_256( u.buf, 56>>2 );
if (out_size_w32 == 8)
u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
*(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, u.buf, 64 );
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
*(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
*(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
blake32_8way( sc, buf, 64 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w32; k++ )
@@ -1274,10 +1282,10 @@ static void
blake64_4way_close( blake_4way_big_context *sc,
unsigned ub, unsigned n, void *dst, size_t out_size_w64)
{
union {
// union {
__m256i buf[16];
sph_u64 dummy;
} u;
// sph_u64 dummy;
// } u;
size_t ptr, k;
unsigned bit_len;
uint64_t z, zz;
@@ -1288,7 +1296,7 @@ blake64_4way_close( blake_4way_big_context *sc,
bit_len = ((unsigned)ptr << 3);
z = 0x80 >> n;
zz = ((ub & -z) | z) & 0xFF;
u.buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
@@ -1307,33 +1315,33 @@ blake64_4way_close( blake_4way_big_context *sc,
}
if ( ptr <= 104 )
{
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
_mm256_set1_epi64x( 0x0100000000000000ULL ) );
*(u.buf+(112>>3)) = mm256_bswap_64(
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_bswap_64(
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
memset_zero_256( u.buf, 112>>3 );
memset_zero_256( buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(u.buf+(112>>3)) = mm256_bswap_64(
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
*(buf+(112>>3)) = mm256_bswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_bswap_64(
*(buf+(120>>3)) = mm256_bswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
blake64_4way( sc, buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
@@ -1342,7 +1350,7 @@ blake64_4way_close( blake_4way_big_context *sc,
#endif
// Blake-256 4 way & 8 way
// Blake-256 4 way
// default 14 rounds, backward copatibility
void
@@ -1364,6 +1372,9 @@ blake256_4way_close(void *cc, void *dst)
}
#if defined(__AVX2__)
// Blake-256 8way
void
blake256_8way_init(void *cc)
{

View File

@@ -35,7 +35,7 @@
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__ 1
#ifdef __AVX__
@@ -117,11 +117,11 @@ void blake256r8_8way_close(void *cc, void *dst);
// Blake-512 4 way
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
} blake_4way_big_context;
typedef blake_4way_big_context blake512_4way_context;

134
algo/blake/blake2s-4way.c Normal file
View File

@@ -0,0 +1,134 @@
#include "blake2s-gate.h"
#include "blake2s-hash-4way.h"
#include <string.h>
#include <stdint.h>
#if defined(BLAKE2S_8WAY)
static __thread blake2s_8way_state blake2s_8w_ctx;
void blake2s_8way_hash( void *output, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake2s_8way_state ctx;
memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
blake2s_8way_update( &ctx, input + (64<<3), 16 );
blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
mm256_deinterleave_8x32( output, output+ 32, output+ 64, output+ 96,
output+128, output+160, output+192, output+224,
vhash, 256 );
}
int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) edata[20];
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8
swab32_array( edata, pdata, 20 );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
be32enc( noncep +4, n+4 );
be32enc( noncep +5, n+5 );
be32enc( noncep +6, n+6 );
be32enc( noncep +7, n+7 );
pdata[19] = n;
blake2s_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#elif defined(BLAKE2S_4WAY)
static __thread blake2s_4way_state blake2s_4w_ctx;
void blake2s_4way_hash( void *output, const void *input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake2s_4way_state ctx;
memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
blake2s_4way_update( &ctx, input + (64<<2), 16 );
blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
mm_deinterleave_4x32( output, output+32, output+64, output+96, vhash, 256 );
}
int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(64) edata[20];
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
pdata[19] = n;
blake2s_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

27
algo/blake/blake2s-gate.c Normal file
View File

@@ -0,0 +1,27 @@
#include "blake2s-gate.h"
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake2s_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake2s_algo( algo_gate_t* gate )
{
#if defined(BLAKE2S_8WAY)
gate->scanhash = (void*)&scanhash_blake2s_8way;
gate->hash = (void*)&blake2s_8way_hash;
#elif defined(BLAKE2S_4WAY)
gate->scanhash = (void*)&scanhash_blake2s_4way;
gate->hash = (void*)&blake2s_4way_hash;
#else
gate->scanhash = (void*)&scanhash_blake2s;
gate->hash = (void*)&blake2s_hash;
#endif
gate->get_max64 = (void*)&blake2s_get_max64;
gate->optimizations = AVX_OPT | AVX2_OPT;
return true;
};

35
algo/blake/blake2s-gate.h Normal file
View File

@@ -0,0 +1,35 @@
#ifndef __BLAKE2S_GATE_H__
#define __BLAKE2S_GATE_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#if defined(__AVX__)
#define BLAKE2S_4WAY
#endif
#if defined(__AVX2__)
#define BLAKE2S_8WAY
#endif
bool register_blake2s_algo( algo_gate_t* gate );
#if defined(BLAKE2S_8WAY)
void blake2s_8way_hash( void *state, const void *input );
int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#elif defined (BLAKE2S_4WAY)
void blake2s_4way_hash( void *state, const void *input );
int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#else
void blake2s_hash( void *state, const void *input );
int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
#endif

View File

@@ -0,0 +1,362 @@
/**
* BLAKE2 reference source code package - reference C implementations
*
* Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
*
* To the extent possible under law, the author(s) have dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide. This software is distributed without any warranty.
*
* You should have received a copy of the CC0 Public Domain Dedication along with
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#include "blake2s-hash-4way.h"
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#if defined(__AVX__)
static const uint32_t blake2s_IV[8] =
{
0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};
static const uint8_t blake2s_sigma[10][16] =
{
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
};
// define a constant for initial param.
int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
{
blake2s_nway_param P[1];
P->digest_length = outlen;
P->key_length = 0;
P->fanout = 1;
P->depth = 1;
P->leaf_length = 0;
*((uint64_t*)(P->node_offset)) = 0;
P->node_depth = 0;
P->inner_length = 0;
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_4way_state ) );
for( int i = 0; i < 8; ++i )
S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
return 0;
}
int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
{
__m128i m[16];
__m128i v[16];
memcpy_128( m, block, 16 );
memcpy_128( v, S->h, 8 );
v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
v[10] = _mm_set1_epi32( blake2s_IV[2] );
v[11] = _mm_set1_epi32( blake2s_IV[3] );
v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
_mm_set1_epi32( blake2s_IV[4] ) );
v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
_mm_set1_epi32( blake2s_IV[5] ) );
v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
_mm_set1_epi32( blake2s_IV[6] ) );
v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
_mm_set1_epi32( blake2s_IV[7] ) );
#define G4W(r,i,a,b,c,d) \
do { \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
d = mm_rotr_32( _mm_xor_si128( d, a ), 16 ); \
c = _mm_add_epi32( c, d ); \
b = mm_rotr_32( _mm_xor_si128( b, c ), 12 ); \
a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
d = mm_rotr_32( _mm_xor_si128( d, a ), 8 ); \
c = _mm_add_epi32( c, d ); \
b = mm_rotr_32( _mm_xor_si128( b, c ), 7 ); \
} while(0)
#define ROUND4W(r) \
do { \
G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
} while(0)
ROUND4W( 0 );
ROUND4W( 1 );
ROUND4W( 2 );
ROUND4W( 3 );
ROUND4W( 4 );
ROUND4W( 5 );
ROUND4W( 6 );
ROUND4W( 7 );
ROUND4W( 8 );
ROUND4W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
#undef G4W
#undef ROUND4W
return 0;
}
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
uint64_t inlen )
{
__m128i *input = (__m128i*)in;
__m128i *buf = (__m128i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
while( inlen > 0 )
{
size_t left = S->buflen;
if( inlen >= bsize - left )
{
memcpy_128( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
blake2s_4way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
inlen -= bsize;
}
else
{
memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
S->buflen += (size_t) inlen;
input += ( inlen>>2 );
inlen -= inlen;
}
}
return 0;
}
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
{
__m128i *buf = (__m128i*)S->buf;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_128( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
blake2s_4way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m128i( out, i ) = S->h[ i ];
return 0;
}
#if defined(__AVX2__)
int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
{
__m256i m[16];
__m256i v[16];
memcpy_256( m, block, 16 );
memcpy_256( v, S->h, 8 );
v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
v[10] = _mm256_set1_epi32( blake2s_IV[2] );
v[11] = _mm256_set1_epi32( blake2s_IV[3] );
v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
_mm256_set1_epi32( blake2s_IV[4] ) );
v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
_mm256_set1_epi32( blake2s_IV[5] ) );
v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
_mm256_set1_epi32( blake2s_IV[6] ) );
v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
_mm256_set1_epi32( blake2s_IV[7] ) );
#define G8W(r,i,a,b,c,d) \
do { \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
m[ blake2s_sigma[r][2*i+0] ] ); \
d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \
a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
m[ blake2s_sigma[r][2*i+1] ] ); \
d = mm256_rotr_32( _mm256_xor_si256( d, a ), 8 ); \
c = _mm256_add_epi32( c, d ); \
b = mm256_rotr_32( _mm256_xor_si256( b, c ), 7 ); \
} while(0)
#define ROUND8W(r) \
do { \
G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
G8W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
G8W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
G8W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
G8W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
G8W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
} while(0)
ROUND8W( 0 );
ROUND8W( 1 );
ROUND8W( 2 );
ROUND8W( 3 );
ROUND8W( 4 );
ROUND8W( 5 );
ROUND8W( 6 );
ROUND8W( 7 );
ROUND8W( 8 );
ROUND8W( 9 );
for( size_t i = 0; i < 8; ++i )
S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
#undef G8W
#undef ROUND8W
return 0;
}
int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
{
blake2s_nway_param P[1];
P->digest_length = outlen;
P->key_length = 0;
P->fanout = 1;
P->depth = 1;
P->leaf_length = 0;
*((uint64_t*)(P->node_offset)) = 0;
P->node_depth = 0;
P->inner_length = 0;
memset( P->salt, 0, sizeof( P->salt ) );
memset( P->personal, 0, sizeof( P->personal ) );
memset( S, 0, sizeof( blake2s_8way_state ) );
for( int i = 0; i < 8; ++i )
S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
uint32_t *p = ( uint32_t * )( P );
/* IV XOR ParamBlock */
for ( size_t i = 0; i < 8; ++i )
S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
return 0;
}
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
uint64_t inlen )
{
__m256i *input = (__m256i*)in;
__m256i *buf = (__m256i*)S->buf;
const int bsize = BLAKE2S_BLOCKBYTES;
while( inlen > 0 )
{
size_t left = S->buflen;
if( inlen >= bsize - left )
{
memcpy_256( buf + (left>>2), input, (bsize - left) >> 2 );
S->buflen += bsize - left;
S->t[0] += BLAKE2S_BLOCKBYTES;
S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );
blake2s_8way_compress( S, buf );
S->buflen = 0;
input += ( bsize >> 2 );
inlen -= bsize;
}
else
{
memcpy_256( buf + ( left>>2 ), input, inlen>>2 );
S->buflen += (size_t) inlen;
input += ( inlen>>2 );
inlen -= inlen;
}
}
return 0;
}
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
{
__m256i *buf = (__m256i*)S->buf;
S->t[0] += S->buflen;
S->t[1] += ( S->t[0] < S->buflen );
if ( S->last_node )
S->f[1] = ~0U;
S->f[0] = ~0U;
memset_zero_256( buf + ( S->buflen>>2 ),
( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
blake2s_8way_compress( S, buf );
for ( int i = 0; i < 8; ++i )
casti_m256i( out, i ) = S->h[ i ];
return 0;
}
#endif // __AVX2__
#if 0
int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
{
blake2s_state S[1];
/* Verify parameters */
if ( NULL == in ) return -1;
if ( NULL == out ) return -1;
if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */
if( keylen > 0 )
{
if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
}
else
{
if( blake2s_init( S, outlen ) < 0 ) return -1;
}
blake2s_update( S, ( uint8_t * )in, inlen );
blake2s_final( S, out, outlen );
return 0;
}
#endif
#endif // __AVX__

View File

@@ -0,0 +1,112 @@
/**
* BLAKE2 reference source code package - reference C implementations
*
* Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
*
* To the extent possible under law, the author(s) have dedicated all copyright
* and related and neighboring rights to this software to the public domain
* worldwide. This software is distributed without any warranty.
*
* You should have received a copy of the CC0 Public Domain Dedication along with
* this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
//#pragma once
#ifndef __BLAKE2S_HASH_4WAY_H__
#define __BLAKE2S_HASH_4WAY_H__ 1
#if defined(__AVX__)
#include "avxdefs.h"
#include <stddef.h>
#include <stdint.h>
#if defined(_MSC_VER)
#include <inttypes.h>
#define inline __inline
#define ALIGN(x) __declspec(align(x))
#else
#define ALIGN(x) __attribute__((aligned(x)))
#endif
#if defined(__cplusplus)
extern "C" {
#endif
enum blake2s_constant
{
BLAKE2S_BLOCKBYTES = 64,
BLAKE2S_OUTBYTES = 32,
BLAKE2S_KEYBYTES = 32,
BLAKE2S_SALTBYTES = 8,
BLAKE2S_PERSONALBYTES = 8
};
#pragma pack(push, 1)
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[BLAKE2S_SALTBYTES]; // 24
uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32
} blake2s_nway_param;
#pragma pack(pop)
ALIGN( 64 ) typedef struct __blake2s_4way_state
{
__m128i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 4 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
uint8_t last_node;
} blake2s_4way_state ;
int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
int blake2s_4way_update( blake2s_4way_state *S, const void *in,
uint64_t inlen );
int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
#if defined(__AVX2__)
ALIGN( 64 ) typedef struct __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ BLAKE2S_BLOCKBYTES * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
uint8_t last_node;
} blake2s_8way_state ;
int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
int blake2s_8way_update( blake2s_8way_state *S, const void *in,
uint64_t inlen );
int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
#endif
#if 0
// Simple API
// int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
// Direct Hash Mining Helpers
#define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */
#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
#endif
#if defined(__cplusplus)
}
#endif
#endif // __AVX__
#endif

View File

@@ -1,26 +1,29 @@
#include "algo-gate-api.h"
#include "blake2s-gate.h"
#include <string.h>
#include <stdint.h>
#include "sph-blake2s.h"
static __thread blake2s_state s_midstate;
static __thread blake2s_state s_ctx;
static __thread blake2s_state blake2s_ctx;
//static __thread blake2s_state s_ctx;
#define MIDLEN 76
void blake2s_hash(void *output, const void *input)
void blake2s_hash( void *output, const void *input )
{
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
blake2s_state blake2_ctx __attribute__ ((aligned (64)));
blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
blake2s_update(&blake2_ctx, input, 80);
blake2s_final(&blake2_ctx, hash, BLAKE2S_OUTBYTES);
unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
blake2s_state ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &blake2s_ctx, sizeof ctx );
blake2s_update( &ctx, input+64, 16 );
// blake2s_init(&ctx, BLAKE2S_OUTBYTES);
// blake2s_update(&ctx, input, 80);
blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );
memcpy(output, hash, 32);
}
/*
static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
{
s_ctx.buflen = MIDLEN;
@@ -28,7 +31,7 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
}
*/
int scanhash_blake2s(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
{
@@ -46,13 +49,12 @@ int scanhash_blake2s(int thr_id, struct work *work,
swab32_array( endiandata, pdata, 20 );
// midstate
blake2s_init(&s_midstate, BLAKE2S_OUTBYTES);
blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );
do {
be32enc(&endiandata[19], n);
blake2s_hash_end(hash64, endiandata);
blake2s_hash( hash64, endiandata );
if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
@@ -67,7 +69,7 @@ int scanhash_blake2s(int thr_id, struct work *work,
return 0;
}
/*
// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
int64_t blake2s_get_max64 ()
{
@@ -81,4 +83,4 @@ bool register_blake2s_algo( algo_gate_t* gate )
gate->get_max64 = (void*)&blake2s_get_max64;
return true;
};
*/

View File

@@ -1,21 +1,22 @@
#include "blakecoin-gate.h"
#if defined (BLAKECOIN_4WAY)
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
blake256r8_4way_context blakecoin_ctx;
#if defined (BLAKECOIN_4WAY)
blake256r8_4way_context blakecoin_4w_ctx;
void blakecoin_4way_hash(void *state, const void *input)
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
blake256r8_4way_context ctx;
memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
blake256r8_4way( &ctx, input + (64<<2), 16 );
blake256r8_4way_close( &ctx, vhash );
mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
}
@@ -31,58 +32,29 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
if (opt_benchmark)
if ( opt_benchmark )
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256r8_4way_init( &blakecoin_ctx );
blake256r8_4way( &blakecoin_ctx, vdata, 64 );
blake256r8_4way_init( &blakecoin_4w_ctx );
blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
blakecoin_4way_hash( hash, vdata );
pdata[19] = n;
blakecoin_4way_hash( hash, vdata );
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
work_set_target_ratio( work, hash );
}
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
work_set_target_ratio( work, hash+8 );
}
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
work_set_target_ratio( work, hash+16 );
}
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
work_set_target_ratio( work, hash+24 );
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
@@ -90,15 +62,76 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
// workaround to prevent flood of hash reports when nonce range exhasuted
// and thread is spinning waiting for new work
if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
{
*hashes_done = 0;
// sleep(1);
}
return num_found;
}
#endif
#if defined(BLAKECOIN_8WAY)
blake256r8_8way_context blakecoin_8w_ctx;
void blakecoin_8way_hash( void *state, const void *input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256r8_8way_context ctx;
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way( &ctx, input + (64<<3), 16 );
blake256r8_8way_close( &ctx, vhash );
mm256_deinterleave_8x32( state, state+ 32, state+ 64, state+ 96,
state+128, state+160, state+192, state+224,
vhash, 256 );
}
int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) edata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
uint32_t *noncep = vdata + 152; // 19*8
int num_found = 0;
if ( opt_benchmark )
HTarget = 0x7f;
// we need big endian data...
swab32_array( edata, pdata, 20 );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
blake256r8_8way_init( &blakecoin_8w_ctx );
blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep +1, n+1 );
be32enc( noncep +2, n+2 );
be32enc( noncep +3, n+3 );
be32enc( noncep +4, n+4 );
be32enc( noncep +5, n+5 );
be32enc( noncep +6, n+6 );
be32enc( noncep +7, n+7 );
pdata[19] = n;
blakecoin_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
{
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}

View File

@@ -8,55 +8,21 @@ int64_t blakecoin_get_max64 ()
// return 0x3fffffLL;
}
// Blakecoin 4 way hashes so fast it runs out of nonces.
// This is an attempt to solve this but the result may be
// to rehash old nonces until new work is received.
void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
uint32_t *end_nonce_ptr, bool clean_job )
{
uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
// if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
// algo_gate.stratum_gen_work( &stratum, g_work );
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
|| ( *nonceptr >= *end_nonce_ptr )
|| ( ( work->job_id != g_work->job_id ) && clean_job ) )
/*
if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
&& ( clean_job || ( *nonceptr >= *end_nonce_ptr )
|| ( work->job_id != g_work->job_id ) ) )
*/
{
work_free( work );
work_copy( work, g_work );
*nonceptr = 0xffffffffU / opt_n_threads * thr_id;
if ( opt_randomize )
*nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
*end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
// try incrementing the xnonce to chsnge the data
// for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
}
else
++(*nonceptr);
}
// vanilla uses default gen merkle root, otherwise identical to blakecoin
bool register_vanilla_algo( algo_gate_t* gate )
{
#if defined(BLAKECOIN_4WAY)
// four_way_not_tested();
#if defined(BLAKECOIN_8WAY)
gate->scanhash = (void*)&scanhash_blakecoin_8way;
gate->hash = (void*)&blakecoin_8way_hash;
#elif defined(BLAKECOIN_4WAY)
gate->scanhash = (void*)&scanhash_blakecoin_4way;
gate->hash = (void*)&blakecoin_4way_hash;
// gate->get_new_work = (void*)&bc4w_get_new_work;
// blakecoin_4way_init( &blake_4way_init_ctx );
#else
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
// blakecoin_init( &blake_init_ctx );
#endif
gate->optimizations = AVX2_OPT;
gate->optimizations = AVX_OPT | AVX2_OPT;
gate->get_max64 = (void*)&blakecoin_get_max64;
return true;
}

View File

@@ -1,12 +1,21 @@
#ifndef __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
#if defined(__AVX__)
#define BLAKECOIN_4WAY
#endif
#if defined(__AVX2__)
#define BLAKECOIN_8WAY
#endif
#if defined (BLAKECOIN_8WAY)
void blakecoin_8way_hash(void *state, const void *input);
int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
#if defined (BLAKECOIN_4WAY)
void blakecoin_4way_hash(void *state, const void *input);

View File

@@ -38,7 +38,6 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
// copy to buffer guaranteed to be aligned.
@@ -52,7 +51,6 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
do {
found[0] = found[1] = found[2] = found[3] = false;
* noncep = n;
*(noncep+1) = n+1;
*(noncep+2) = n+2;
@@ -60,35 +58,11 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
decred_hash_4way( hash, vdata );
if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
{
work_set_target_ratio( work, hash );
found[0] = true;
num_found++;
nonces[0] = n;
pdata[DECRED_NONCE_INDEX] = n;
}
if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
{
work_set_target_ratio( work, hash+8 );
found[1] = true;
num_found++;
nonces[1] = n+1;
}
if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
{
work_set_target_ratio( work, hash+16 );
found[2] = true;
num_found++;
nonces[2] = n+2;
}
if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
{
work_set_target_ratio( work, hash+24 );
found[3] = true;
num_found++;
nonces[3] = n+3;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)

View File

@@ -111,12 +111,8 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found = 0;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
uint32_t *noncep = vdata + 73; // 9*8 + 1
// uint32_t _ALIGN(32) hash64[8];
// uint32_t _ALIGN(32) endiandata[32];
@@ -150,47 +146,19 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
{
uint32_t mask = masks[m];
do {
found[0] = found[1] = found[2] = found[3] = false;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
pentablakehash_4way( hash, vdata );
// return immediately on nonce found, only one submit
if ( ( !(hash[7] & mask) ) && fulltest( hash, ptarget ) )
for ( int i = 0; i < 4; i++ )
if ( !( (hash+(i<<3))[7] & mask )
&& fulltest( hash+(i<<3), ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( (! ((hash+8)[7] & mask) ) && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( ( !((hash+16)[7] & mask) ) && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n;
*hashes_done = n - first_nonce + 1;
return 1;
}
if ( ( !((hash+24)[7] & mask) ) && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n;
*hashes_done = n - first_nonce + 1;
return 1;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
}
n += 4;