/* cpuminer-opt-gpu/algo/yespower/yespower-4way.c */
/*-
* Copyright 2009 Colin Percival
* Copyright 2013-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*
* This is a proof-of-work focused fork of yescrypt, including reference and
* cut-down implementation of the obsolete yescrypt 0.5 (based off its first
* submission to PHC back in 2014) and a new proof-of-work specific variation
* known as yespower 1.0. The former is intended as an upgrade for
* cryptocurrencies that already use yescrypt 0.5 and the latter may be used
* as a further upgrade (hard fork) by those and other cryptocurrencies. The
* version of the algorithm to use is requested through parameters, allowing
* both algorithms to co-exist in client and miner implementations (such as
* in preparation for a hard fork).
*
* This file is an experimental 8-way AVX2 adaptation of the yespower
* reference implementation. The reference code is deliberately mostly not
* optimized, and this port inherits its structure; the data-dependent S-box
* addressing in pwxform_8way is still unfinished (see the comments there).
* For production use, prefer yespower-opt.c.
*/
/*
#warning "This reference implementation is deliberately mostly not optimized. Use yespower-opt.c instead unless you're testing (against) the reference implementation on purpose."
*/
#include <errno.h>
#include <immintrin.h>   /* AVX2 intrinsics, _mm_malloc/_mm_free */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "algo/sha/hmac-sha256-hash-4way.h"
#include "yespower.h"
#if defined(__AVX2__)
static void blkcpy_8way( __m256i *dst, const __m256i *src, size_t count )
{
do {
*dst++ = *src++;
} while (--count);
}
static void blkxor_8way( __m256i *dst, const __m256i *src, size_t count )
{
do {
*dst++ ^= *src++;
} while (--count);
}
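/*
 * Per-lane variants of the helpers above. These are minimal sketches for the
 * data-dependent addressing in smix1/smix2 below: Integerify yields a
 * different index in each of the 8 lanes, so V_j must be accessed lane by
 * lane. They assume the interleaved 8x32 layout used throughout this file,
 * where 32-bit word w of lane l is the l-th uint32_t of the w-th vector.
 */
static void blkcpy_lane_8way( __m256i *dst, const __m256i *src,
                              size_t count, int lane )
{
   uint32_t *d = (uint32_t*)dst;
   const uint32_t *s = (const uint32_t*)src;
   do {
      d[lane] = s[lane];
      d += 8;
      s += 8;
   } while (--count);
}
static void blkxor_lane_8way( __m256i *dst, const __m256i *src,
                              size_t count, int lane )
{
   uint32_t *d = (uint32_t*)dst;
   const uint32_t *s = (const uint32_t*)src;
   do {
      d[lane] ^= s[lane];
      d += 8;
      s += 8;
   } while (--count);
}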
/**
* salsa20(B):
* Apply the Salsa20 core to the provided block.
*/
static void salsa20_8way( __m256i B[16], uint32_t rounds )
{
__m256i x[16];
size_t i;
/* SIMD unshuffle */
for ( i = 0; i < 16; i++ )
x[i * 5 % 16] = B[i];
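// The i*5 mod 16 permutation converts between the SIMD-shuffled storage
// order and standard Salsa20 word order:
//   0->0  1->5  2->10 3->15 4->4  5->9  6->14 7->3
//   8->8  9->13 10->2 11->7 12->12 13->1 14->6 15->11
// Its inverse is i*13 mod 16, since 5*13 = 65 = 1 (mod 16).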
for ( i = 0; i < rounds; i += 2 )
{
#define R( a, b, c ) mm256_rol_32( _mm256_add_epi32( a, b ), c )
/* Operate on columns */
x[ 4] = _mm256_xor_si256( x[ 4], R( x[ 0], x[12], 7 ) );
x[ 8] = _mm256_xor_si256( x[ 8], R( x[ 4], x[ 0], 9 ) );
x[12] = _mm256_xor_si256( x[12], R( x[ 8], x[ 4], 13 ) );
x[ 0] = _mm256_xor_si256( x[ 0], R( x[12], x[ 8], 18 ) );
x[ 9] = _mm256_xor_si256( x[ 9], R( x[ 5], x[ 1], 7 ) );
x[13] = _mm256_xor_si256( x[13], R( x[ 9], x[ 5], 9 ) );
x[ 1] = _mm256_xor_si256( x[ 1], R( x[13], x[ 9], 13 ) );
x[ 5] = _mm256_xor_si256( x[ 5], R( x[ 1], x[13], 18 ) );
x[14] = _mm256_xor_si256( x[14], R( x[10], x[ 6], 7 ) );
x[ 2] = _mm256_xor_si256( x[ 2], R( x[14], x[10], 9 ) );
x[ 6] = _mm256_xor_si256( x[ 6], R( x[ 2], x[14], 13 ) );
x[10] = _mm256_xor_si256( x[10], R( x[ 6], x[ 2], 18 ) );
x[ 3] = _mm256_xor_si256( x[ 3], R( x[15], x[11], 7 ) );
x[ 7] = _mm256_xor_si256( x[ 7], R( x[ 3], x[15], 9 ) );
x[11] = _mm256_xor_si256( x[11], R( x[ 7], x[ 3], 13 ) );
x[15] = _mm256_xor_si256( x[15], R( x[11], x[ 7], 18 ) );
/* Operate on rows */
x[ 1] = _mm256_xor_si256( x[ 1], R( x[ 0], x[ 3], 7 ) );
x[ 2] = _mm256_xor_si256( x[ 2], R( x[ 1], x[ 0], 9 ) );
x[ 3] = _mm256_xor_si256( x[ 3], R( x[ 2], x[ 1], 13 ) );
x[ 0] = _mm256_xor_si256( x[ 0], R( x[ 3], x[ 2], 18 ) );
x[ 6] = _mm256_xor_si256( x[ 6], R( x[ 5], x[ 4], 7 ) );
x[ 7] = _mm256_xor_si256( x[ 7], R( x[ 6], x[ 5], 9 ) );
x[ 4] = _mm256_xor_si256( x[ 4], R( x[ 7], x[ 6], 13 ) );
x[ 5] = _mm256_xor_si256( x[ 5], R( x[ 4], x[ 7], 18 ) );
x[11] = _mm256_xor_si256( x[11], R( x[10], x[ 9], 7 ) );
x[ 8] = _mm256_xor_si256( x[ 8], R( x[11], x[10], 9 ) );
x[ 9] = _mm256_xor_si256( x[ 9], R( x[ 8], x[11], 13 ) );
x[10] = _mm256_xor_si256( x[10], R( x[ 9], x[ 8], 18 ) );
x[12] = _mm256_xor_si256( x[12], R( x[15], x[14], 7 ) );
x[13] = _mm256_xor_si256( x[13], R( x[12], x[15], 9 ) );
x[14] = _mm256_xor_si256( x[14], R( x[13], x[12], 13 ) );
x[15] = _mm256_xor_si256( x[15], R( x[14], x[13], 18 ) );
#undef R
}
/* SIMD shuffle */
for (i = 0; i < 16; i++)
B[i] = _mm256_add_epi32( B[i], x[i * 5 % 16] );
}
/**
* blockmix_salsa(B):
* Compute B = BlockMix_{salsa20, 1}(B). The input B must be 128 bytes in
* length.
*/
static void blockmix_salsa_8way( __m256i *B, uint32_t rounds )
{
__m256i X[16];
size_t i;
/* 1: X <-- B_{2r - 1} */
blkcpy_8way( X, &B[16], 16 );
/* 2: for i = 0 to 2r - 1 do */
for ( i = 0; i < 2; i++ )
{
/* 3: X <-- H(X xor B_i) */
blkxor_8way( X, &B[i * 16], 16 );
salsa20_8way( X, rounds );
/* 4: Y_i <-- X */
/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
blkcpy_8way( &B[i * 16], X, 16 );
}
}
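// Note: this salsa BlockMix is only used with r = 1 (the S-box fill in
// smix1_8way), so B is exactly two 64-byte sub-blocks (16 vectors each) and
// step 6's even/odd reordering of the Y_i is the identity.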
/*
* These are tunable, but they must meet certain constraints and are part of
* what defines a yespower version.
*/
#define PWXsimple 2
#define PWXgather 4
/* Version 0.5 */
#define PWXrounds_0_5 6
#define Swidth_0_5 8
/* Version 1.0 */
#define PWXrounds_1_0 3
#define Swidth_1_0 11
/* Derived values. Not tunable on their own. */
#define PWXbytes (PWXgather * PWXsimple * 8)
#define PWXwords (PWXbytes / sizeof(uint32_t))
#define rmin ((PWXbytes + 127) / 128)
/* Runtime derived values. Not tunable on their own. */
#define Swidth_to_Sbytes1(Swidth) ((1 << Swidth) * PWXsimple * 8)
#define Swidth_to_Smask(Swidth) (((1 << Swidth) - 1) * PWXsimple * 8)
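/*
 * Worked example for version 1.0 (Swidth = 11, PWXsimple = 2):
 *   Swidth_to_Sbytes1(11) = 2048 * 2 * 8 = 32 KiB per S-box
 *                           (x3 S-boxes = 96 KiB total),
 *   Swidth_to_Smask(11)   = 2047 * 16  = 0x7ff0,
 * i.e. the mask selects a 16-byte-aligned offset within one S-box.
 * For version 0.5 (Swidth = 8): 4 KiB per S-box (x2 = 8 KiB) and
 * Smask = 255 * 16 = 0xff0.
 */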
typedef struct {
__m256i (*S0)[2], (*S1)[2], (*S2)[2];
__m256i *S;
yespower_version_t version;
uint32_t salsa20_rounds;
uint32_t PWXrounds, Swidth, Sbytes, Smask;
size_t w;
} pwxform_8way_ctx_t __attribute__ ((aligned (128)));
/**
* pwxform(B):
* Transform the provided block using the provided S-boxes.
*/
static void pwxform_8way( __m256i *B, pwxform_8way_ctx_t *ctx )
{
__m256i (*X)[PWXsimple][2] = (__m256i (*)[PWXsimple][2])B;
__m256i (*S0)[2] = ctx->S0, (*S1)[2] = ctx->S1, (*S2)[2] = ctx->S2;
__m256i Smask = _mm256_set1_epi32( ctx->Smask );
size_t w = ctx->w;
size_t i, j, k;
/* 1: for i = 0 to PWXrounds - 1 do */
for ( i = 0; i < ctx->PWXrounds; i++ )
{
/* 2: for j = 0 to PWXgather - 1 do */
for ( j = 0; j < PWXgather; j++ )
{
         // xl/xh hold lo(B_{j,0}) and hi(B_{j,0}) for all 8 lanes.
         __m256i xl = X[j][0][0];
         __m256i xh = X[j][0][1];
         __m256i (*p0)[2], (*p1)[2];
         /* 3: p0 <-- (lo(B_{j,0}) & Smask) / (PWXsimple * 8) */
         /* 4: p1 <-- (hi(B_{j,0}) & Smask) / (PWXsimple * 8) */
         // UNFINISHED: the scalar pointer arithmetic below does not
         // vectorize, because each lane selects its own S-box entry. The
         // lookup needs per-lane gathers (see the sketch after this
         // function); as written, p0/p1 are never assigned, so the loads
         // below read through uninitialized pointers.
         /*
         p0 = S0 + (xl & Smask) / sizeof(*S0);
         p1 = S1 + (xh & Smask) / sizeof(*S1);
         */
/* 5: for k = 0 to PWXsimple - 1 do */
for ( k = 0; k < PWXsimple; k++ )
{
            // Widen the interleaved 32-bit words to 64-bit values:
            // s00/s10 hold lanes 0-3, s01/s11 hold lanes 4-7.
__m256i x0, x1, s00, s01, s10, s11;
__m128i *p0k = (__m128i*)p0[k];
__m128i *p1k = (__m128i*)p1[k];
s00 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p0k[0] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p0k[2] ), 32 ) );
s01 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p0k[1] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p0k[3] ), 32 ) );
s10 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p1k[0] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p1k[2] ), 32 ) );
s11 = _mm256_add_epi64( _mm256_cvtepu32_epi64( p1k[1] ),
_mm256_slli_epi64( _mm256_cvtepu32_epi64( p1k[3] ), 32 ) );
__m128i *xx = (__m128i*)X[j][k];
x0 = _mm256_mul_epu32( _mm256_cvtepu32_epi64( xx[0] ),
_mm256_cvtepu32_epi64( xx[2] ) );
x1 = _mm256_mul_epu32( _mm256_cvtepu32_epi64( xx[1] ),
_mm256_cvtepu32_epi64( xx[3] ) );
x0 = _mm256_add_epi64( x0, s00 );
x1 = _mm256_add_epi64( x1, s01 );
x0 = _mm256_xor_si256( x0, s10 );
x1 = _mm256_xor_si256( x1, s11 );
X[j][k][0] = x0;
X[j][k][1] = x1;
}
if ( ctx->version != YESPOWER_0_5 &&
( i == 0 || j < PWXgather / 2 ) )
{
if ( j & 1 )
{
for ( k = 0; k < PWXsimple; k++ )
{
S1[w][0] = X[j][k][0];
S1[w][1] = X[j][k][1];
w++;
}
}
else
{
for ( k = 0; k < PWXsimple; k++ )
{
S0[w + k][0] = X[j][k][0];
S0[w + k][1] = X[j][k][1];
}
}
}
}
}
if ( ctx->version != YESPOWER_0_5 )
{
/* 14: (S0, S1, S2) <-- (S2, S0, S1) */
ctx->S0 = S2;
ctx->S1 = S0;
ctx->S2 = S1;
/* 15: w <-- w mod 2^Swidth */
ctx->w = w & ( ( 1 << ctx->Swidth ) * PWXsimple - 1 );
}
}
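/*
 * A minimal sketch of the missing per-lane S-box lookup, under the layout
 * assumption used throughout this file: 32-bit word w of lane l lives at
 * ((uint32_t*)S)[w*8 + l], so a per-lane byte offset off within an S-box
 * maps to gather index 2*off + 16*k + l for the low word of the k-th 64-bit
 * entry, and +8 for the high word. The helper name is hypothetical; this is
 * an illustration, not a drop-in replacement for the disabled code above.
 */
static inline void pwxform_sbox_gather_8way( const __m256i *S0, __m256i xl,
                       uint32_t smask, int k, __m256i *s_03, __m256i *s_47 )
{
   const __m256i lane = _mm256_set_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
   // Per-lane byte offsets into the S-box, then gather indices (in 32-bit
   // word units) for the low and high halves of the 64-bit entry.
   __m256i off = _mm256_and_si256( xl, _mm256_set1_epi32( smask ) );
   __m256i ilo = _mm256_add_epi32( _mm256_slli_epi32( off, 1 ),
                 _mm256_add_epi32( lane, _mm256_set1_epi32( 16 * k ) ) );
   __m256i ihi = _mm256_add_epi32( ilo, _mm256_set1_epi32( 8 ) );
   __m256i lo  = _mm256_i32gather_epi32( (const int*)S0, ilo, 4 );
   __m256i hi  = _mm256_i32gather_epi32( (const int*)S0, ihi, 4 );
   // Widen to 64-bit (hi << 32 | lo): lanes 0-3 in *s_03, lanes 4-7 in *s_47.
   *s_03 = _mm256_add_epi64(
         _mm256_cvtepu32_epi64( _mm256_castsi256_si128( lo ) ),
         _mm256_slli_epi64(
               _mm256_cvtepu32_epi64( _mm256_castsi256_si128( hi ) ), 32 ) );
   *s_47 = _mm256_add_epi64(
         _mm256_cvtepu32_epi64( _mm256_extracti128_si256( lo, 1 ) ),
         _mm256_slli_epi64(
               _mm256_cvtepu32_epi64( _mm256_extracti128_si256( hi, 1 ) ), 32 ) );
}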
/**
* blockmix_pwxform(B, ctx, r):
* Compute B = BlockMix_pwxform{salsa20, ctx, r}(B). The input B must be
* 128r bytes in length.
*/
static void blockmix_pwxform_8way( __m256i *B, pwxform_8way_ctx_t *ctx,
                                   size_t r )
{
__m256i X[PWXwords];
size_t r1, i;
/* Convert 128-byte blocks to PWXbytes blocks */
/* 1: r_1 <-- 128r / PWXbytes */
r1 = 128 * r / PWXbytes;
/* 2: X <-- B'_{r_1 - 1} */
blkcpy_8way( X, &B[ (r1 - 1) * PWXwords ], PWXwords );
/* 3: for i = 0 to r_1 - 1 do */
for ( i = 0; i < r1; i++ )
{
/* 4: if r_1 > 1 */
if ( r1 > 1 )
{
/* 5: X <-- X xor B'_i */
blkxor_8way( X, &B[ i * PWXwords ], PWXwords );
}
/* 7: X <-- pwxform(X) */
pwxform_8way( X, ctx );
/* 8: B'_i <-- X */
blkcpy_8way( &B[ i * PWXwords ], X, PWXwords );
}
/* 10: i <-- floor((r_1 - 1) * PWXbytes / 64) */
i = ( r1 - 1 ) * PWXbytes / 64;
/* 11: B_i <-- H(B_i) */
salsa20_8way( &B[i * 16], ctx->salsa20_rounds );
#if 1 /* No-op with our current pwxform settings, but do it to make sure */
/* 12: for i = i + 1 to 2r - 1 do */
for ( i++; i < 2 * r; i++ )
{
/* 13: B_i <-- H(B_i xor B_{i-1}) */
blkxor_8way( &B[i * 16], &B[ (i - 1) * 16 ], 16 );
salsa20_8way( &B[i * 16], ctx->salsa20_rounds );
}
#endif
}
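/*
 * With PWXgather = 4 and PWXsimple = 2, PWXbytes = 4 * 2 * 8 = 64, so
 * r_1 = 128r / 64 = 2r: pwxform blocks and 64-byte salsa blocks coincide,
 * and step 10 always lands on the last block (i = 2r - 1), which is why the
 * loop at step 12 is a no-op with these settings.
 */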
// Integerify is data-dependent addressing: each of the 8 lanes yields its
// own index, which smix1/smix2 below handle lane by lane.
/**
* integerify(B, r):
* Return the result of parsing B_{2r-1} as a little-endian integer.
*/
static __m256i integerify8( const __m256i *B, size_t r )
{
/*
* Our 32-bit words are in host byte order. Also, they are SIMD-shuffled, but
* we only care about the least significant 32 bits anyway.
*/
const __m256i *X = &B[ (2 * r - 1) * 16 ];
return X[0];
}
/**
* p2floor(x):
* Largest power of 2 not greater than argument.
*/
static uint32_t p2floor8( uint32_t x )
{
uint32_t y;
while ( ( y = x & (x - 1) ) )
x = y;
return x;
}
/**
* wrap(x, i):
* Wrap x to the range 0 to i-1.
*/
static uint32_t wrap8( uint32_t x, uint32_t i )
{
   uint32_t n = p2floor8( i );
return ( x & (n - 1) ) + (i - n);
}
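// Example: wrap8(1000, 600): p2floor8(600) = 512, so the result is
// (1000 & 511) + (600 - 512) = 488 + 88 = 576, which lies in [88, 599].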
/**
* smix1(B, r, N, V, X, ctx):
* Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage X must be 128r bytes in length.
*/
static void smix1_8way( __m256i *B, size_t r, uint32_t N,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx )
{
size_t s = 32 * r;
   uint32_t i;
size_t k;
/* 1: X <-- B */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
X[ k * 16 + i ] = B[ k * 16 + ( i * 5 % 16 ) ];
if ( ctx->version != YESPOWER_0_5 )
{
for ( k = 1; k < r; k++ )
{
blkcpy_8way( &X[k * 32], &X[ (k - 1) * 32 ], 32 );
blockmix_pwxform_8way( &X[k * 32], ctx, 1 );
}
}
/* 2: for i = 0 to N - 1 do */
for ( i = 0; i < N; i++ )
{
/* 3: V_i <-- X */
blkcpy_8way( &V[i * s], X, s );
      if ( i > 1 )
      {
         /* j <-- Wrap(Integerify(X), i) */
         // Integerify yields a different index in each of the 8 lanes, so
         // the xor with V_j must be applied lane by lane.
         uint32_t j8[8] __attribute__((aligned(32)));
         _mm256_store_si256( (__m256i*)j8, integerify8( X, r ) );
         /* X <-- X xor V_j */
         for ( int l = 0; l < 8; l++ )
            blkxor_lane_8way( X, &V[ wrap8( j8[l], i ) * s ], s, l );
      }
/* 4: X <-- H(X) */
if ( V != ctx->S )
blockmix_pwxform_8way( X, ctx, r );
else
blockmix_salsa_8way( X, ctx->salsa20_rounds );
}
/* B' <-- X */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
B[ k * 16 + ( i * 5 % 16 ) ] = X[ k * 16 + i ];
}
/**
* smix2(B, r, N, Nloop, V, X, ctx):
* Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in
* length; the temporary storage V must be 128rN bytes in length; the temporary
* storage X must be 128r bytes in length. The value N must be a power of 2
* greater than 1.
*/
static void smix2_8way( __m256i *B, size_t r, uint32_t N, uint32_t Nloop,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx )
{
size_t s = 32 * r;
   uint32_t i;
size_t k;
/* X <-- B */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
X[ k * 16 + i ] = B[ k * 16 + ( i * 5 % 16 ) ];
/* 6: for i = 0 to N - 1 do */
   for ( i = 0; i < Nloop; i++ )
   {
      /* 7: j <-- Integerify(X) mod N */
      // As in smix1_8way, each lane selects its own j, so the xor and the
      // write-back are done lane by lane.
      uint32_t j8[8] __attribute__((aligned(32)));
      _mm256_store_si256( (__m256i*)j8, integerify8( X, r ) );
      for ( int l = 0; l < 8; l++ )
      {
         const uint32_t j = j8[l] & (N - 1);
         /* 8.1: X <-- X xor V_j */
         blkxor_lane_8way( X, &V[j * s], s, l );
         /* V_j <-- X */
         if ( Nloop != 2 )
            blkcpy_lane_8way( &V[j * s], X, s, l );
      }
      /* 8.2: X <-- H(X) */
      blockmix_pwxform_8way( X, ctx, r );
   }
/* 10: B' <-- X */
for ( k = 0; k < 2 * r; k++ )
for ( i = 0; i < 16; i++ )
B[ k * 16 + ( i * 5 % 16 ) ] = X[ k * 16 + i ];
}
/**
 * smix(B, r, N, V, X, ctx):
 * Compute B = SMix_r(B, N). The input B must be 128r bytes in length; the
 * temporary storage V must be 128rN bytes in length; the temporary storage
 * X must be 128r bytes in length. The value N must be a power of 2 and at
 * least 16.
 */
static void smix_8way( __m256i *B, size_t r, uint32_t N,
__m256i *V, __m256i *X, pwxform_8way_ctx_t *ctx)
{
uint32_t Nloop_all = (N + 2) / 3; /* 1/3, round up */
uint32_t Nloop_rw = Nloop_all;
Nloop_all++; Nloop_all &= ~(uint32_t)1; /* round up to even */
   if ( ctx->version == YESPOWER_0_5 )
   {
      Nloop_rw &= ~(uint32_t)1; /* round down to even */
   }
   else
   {
      Nloop_rw++; Nloop_rw &= ~(uint32_t)1; /* round up to even */
   }
smix1_8way( B, 1, ctx->Sbytes / 128, ctx->S, X, ctx );
smix1_8way( B, r, N, V, X, ctx );
smix2_8way( B, r, N, Nloop_rw /* must be > 2 */, V, X, ctx );
smix2_8way( B, r, N, Nloop_all - Nloop_rw /* 0 or 2 */, V, X, ctx );
}
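/*
 * Example for N = 2048: Nloop_all = (2048 + 2) / 3 = 683, rounded up to 684.
 * Version 0.5: Nloop_rw = 682 (rounded down), so the final smix2 pass runs
 * 2 iterations; version 1.0: Nloop_rw = 684 and the final pass runs 0.
 */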
/**
 * yespower_8way(local, src, srclen, params, dst, thrid):
 * Compute 8 interleaved lanes of yespower(src[0 .. srclen - 1], N, r), to be
 * checked for "< target".
 *
 * Return 1 on success; or -1 on error.
 */
int yespower_8way( yespower_local_t *local, const __m256i *src, size_t srclen,
const yespower_params_t *params, yespower_8way_binary_t *dst,
int thrid )
{
yespower_version_t version = params->version;
uint32_t N = params->N;
uint32_t r = params->r;
const uint8_t *pers = params->pers;
size_t perslen = params->perslen;
int retval = -1;
   size_t B_size, V_size;   /* per-lane sizes in bytes */
   __m256i *B, *V, *X, *S;
pwxform_8way_ctx_t ctx;
__m256i sha256[8];
/* Sanity-check parameters */
if ( (version != YESPOWER_0_5 && version != YESPOWER_1_0 ) ||
N < 1024 || N > 512 * 1024 || r < 8 || r > 32 ||
(N & (N - 1)) != 0 || r < rmin ||
(!pers && perslen) )
{
errno = EINVAL;
return -1;
}
   /* Allocate memory. B_size and V_size are per-lane byte counts; the
      interleaved 8-way buffers are 8x larger and need 32-byte alignment
      for the AVX2 accesses, hence _mm_malloc. */
   B_size = (size_t)128 * r;
   V_size = B_size * N;
   if ( (V = _mm_malloc( 8 * V_size, 32 )) == NULL )
      return -1;
   if ( (B = _mm_malloc( 8 * B_size, 32 )) == NULL )
      goto free_V;
   if ( (X = _mm_malloc( 8 * B_size, 32 )) == NULL )
      goto free_B;
ctx.version = version;
if (version == YESPOWER_0_5) {
ctx.salsa20_rounds = 8;
ctx.PWXrounds = PWXrounds_0_5;
ctx.Swidth = Swidth_0_5;
ctx.Sbytes = 2 * Swidth_to_Sbytes1(ctx.Swidth);
} else {
ctx.salsa20_rounds = 2;
ctx.PWXrounds = PWXrounds_1_0;
ctx.Swidth = Swidth_1_0;
ctx.Sbytes = 3 * Swidth_to_Sbytes1(ctx.Swidth);
}
   /* ctx.Sbytes is per lane; the interleaved buffer holds 8 lanes. */
   if ( (S = _mm_malloc( 8 * (size_t)ctx.Sbytes, 32 )) == NULL )
      goto free_X;
ctx.S = S;
ctx.S0 = (__m256i (*)[2])S;
ctx.S1 = ctx.S0 + (1 << ctx.Swidth) * PWXsimple;
ctx.S2 = ctx.S1 + (1 << ctx.Swidth) * PWXsimple;
ctx.Smask = Swidth_to_Smask(ctx.Swidth);
ctx.w = 0;
   /* Pre-hash the input for all 8 lanes */
   sha256_8way_full( sha256, src, srclen );
   // Broadcast pers to all 8 lanes, one 32-bit word per vector; the fixed
   // 128-vector buffer caps perslen at 512 bytes per lane.
   __m256i vpers[128];
   if ( pers && perslen )
   {
      uint32_t pwords[128] = {0};   // zero-pads the final partial word
      memcpy( pwords, pers, perslen );
      for ( size_t i = 0; i < ( perslen + 3 ) / 4; i++ )
         vpers[i] = _mm256_set1_epi32( pwords[i] );
   }
   /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */
   // Version 0.5 salts with the input itself; 1.0 salts with pers (or an
   // empty salt when pers is absent).
   if ( version == YESPOWER_0_5 )
      pbkdf2_sha256_8way( B, B_size, sha256, sizeof(sha256), src, srclen, 1 );
   else
      pbkdf2_sha256_8way( B, B_size, sha256, sizeof(sha256), vpers,
                          pers ? perslen : 0, 1 );
blkcpy_8way( sha256, B, sizeof(sha256) / sizeof(sha256[0] ) );
/* 3: B_i <-- MF(B_i, N) */
smix_8way( B, r, N, V, X, &ctx );
if ( version == YESPOWER_0_5 )
{
/* 5: DK <-- PBKDF2(P, B, 1, dkLen) */
pbkdf2_sha256_8way( dst, sizeof(*dst), sha256, sizeof(sha256),
B, B_size, 1 );
if ( pers )
{
hmac_sha256_8way_full( dst, sizeof(*dst), vpers, perslen, sha256 );
sha256_8way_full( dst, sha256, sizeof(sha256) );
}
}
   else
      // The final 64 bytes of each lane are the last 16 interleaved
      // vectors of B (B_size/4 vectors per lane in total).
      hmac_sha256_8way_full( dst, B + B_size / 4 - 16, 64,
                             sha256, sizeof(sha256) );
/* Success! */
retval = 1;
/* Free memory */
   _mm_free( S );
free_X:
   _mm_free( X );
free_B:
   _mm_free( B );
free_V:
   _mm_free( V );
return retval;
}
int yespower_8way_tls( const __m256i *src, size_t srclen,
   const yespower_params_t *params, yespower_8way_binary_t *dst, int thrid )
{
   /* The reference implementation doesn't use thread-local storage */
   return yespower_8way( NULL, src, srclen, params, dst, thrid );
}
int yespower_init_local8( yespower_local_t *local )
{
/* The reference implementation doesn't use the local structure */
local->base = local->aligned = NULL;
local->base_size = local->aligned_size = 0;
return 0;
}
int yespower_free_local8( yespower_local_t *local )
{
/* The reference implementation frees its memory in yespower() */
(void)local; /* unused */
return 0;
}
int yespower_8way_hash( const char *input, char *output, uint32_t len,
                        int thrid )
{
   return yespower_8way_tls( (const __m256i*)input, len, &yespower_params,
                             (yespower_8way_binary_t*)output, thrid );
}
int scanhash_yespower_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8*8];
uint32_t _ALIGN(128) vdata[20*8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
   for ( int k = 0; k < 19; k++ )
      be32enc( &endiandata[k], pdata[k] );
   endiandata[19] = n;
   // Broadcast the 80-byte header into the 8 interleaved lanes; intrlv_8x32
   // is assumed from cpuminer-opt's simd-utils (8-lane 32-bit interleave).
   intrlv_8x32( vdata, endiandata, endiandata, endiandata, endiandata,
                endiandata, endiandata, endiandata, endiandata, 640 );
// do sha256 prehash
SHA256_Init( &sha256_prehash_ctx );
SHA256_Update( &sha256_prehash_ctx, endiandata, 64 );
   do {
      // Write this iteration's 8 nonces into word 19 of each lane
      // (word w of lane l lives at vdata[w*8 + l]).
      for ( int l = 0; l < 8; l++ )
         vdata[ 19*8 + l ] = n + l;
      if ( yespower_8way_hash( (char*)vdata, (char*)hash, 80, thr_id ) )
         for ( int l = 0; l < 8; l++ )
         {
            // Deinterleave lane l's 256-bit hash for the target check;
            // extr_lane_8x32 is assumed from simd-utils.
            uint32_t _ALIGN(64) lane_hash[8];
            extr_lane_8x32( lane_hash, hash, l, 256 );
            if unlikely( valid_hash( lane_hash, ptarget ) && !opt_benchmark )
            {
               be32enc( pdata+19, n + l );
               submit_solution( work, lane_hash, mythr );
            }
         }
      n += 8;
   } while ( n < last_nonce && !work_restart[thr_id].restart );
   *hashes_done = n - first_nonce;
   pdata[19] = n;
return 0;
}
#endif // AVX2