Compare commits

...

8 Commits

Author SHA1 Message Date
Jay D Dee
71d6b97ee8 v3.9.3.1 2019-06-13 21:15:58 -04:00
Jay D Dee
b2331375a3 v3.9.2.5 2019-06-13 11:20:27 -04:00
Jay D Dee
7fec680835 v3.9.2.4 2019-06-07 23:30:38 -04:00
Jay D Dee
1b0a5aadf6 v3.9.2.3 2019-06-05 12:20:04 -04:00
Jay D Dee
0a3c52810e v3.9.2.2 2019-06-04 17:14:03 -04:00
Jay D Dee
4d4386a374 v3.9.2.1 2019-06-04 16:56:44 -04:00
Jay D Dee
ce259b915a v3.9.2 2019-06-03 21:36:33 -04:00
Jay D Dee
02202ab803 v3.9.1.1 2019-05-31 13:20:12 -04:00
111 changed files with 8012 additions and 8880 deletions

View File

@@ -68,7 +68,8 @@ cpuminer_SOURCES = \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256-hash-4way.c \
algo/bmw/bmw512-hash-4way.c \
algo/bmw/bmw256.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
@@ -162,10 +163,13 @@ cpuminer_SOURCES = \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha2-hash-4way.c \
algo/sha/sha256_hash_11way.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
@@ -262,7 +266,7 @@ cpuminer_SOURCES = \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower.c \
algo/yespower/sha256.c \
algo/yespower/sha256_p.c \
algo/yespower/yespower-opt.c
disable_flags =

View File

@@ -12,7 +12,7 @@ the software, don't use it.
Choose the exe that best matches you CPU's features or use trial and
error to find the fastest one that doesn't crash. Pay attention to
the features listed at cpuminer startup to ensure you are mining at
optimum speed using all the available features.
optimum speed using the best available features.
Architecture names and compile options used are only provided for Intel
Core series. Even the newest Pentium and Celeron CPUs are often missing
@@ -22,8 +22,6 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Changes in v3.8.4 may have improved compatibility with some of these CPUs.
Exe name Compile flags Arch name

View File

@@ -33,11 +33,55 @@ Requirements
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.
64 bit Linux or Windows operating system. Apple and Android are not supported.
Change Log
----------
v3.9.3.1
Skippped v3.9.3 due to misidentification of v3.9.2.5 as v3.9.3.
Fixed x16r algo 25% invalid share reject rate. The bug may have also
affected other algos.
v3.9.2.5
Fixed 2 regressions: hodl AES detection, x16r invalid shares with AVX2.
More restructuring.
v3.9.2.4
Yet another affinity fix. Hopefully the last one.
v3.9.2.3
Another cpu-affinity fix.
Disabled test code that fails to compile on some CPUs with limited
AVX512 capabilities.
v3.9.2.2
Fixed some day one cpu-affinity issues.
v3.9.2
Added sha256q algo.
Yespower now uses openssl SHA256, but no observable hash rate increase
on Ryzen.
Ongoing rearchitecting.
Lyra2z now hashes 8-way on CPUs with AVX2.
Lyra2 (all including phi2) now runs optimized code with SSE2.
v3.9.1.1
Fixed lyra2v3 AVX and below.
Compiling on Windows using Cygwin now works. Simply use "./build.sh"
just like on Linux. It isn't portable therefore the binaries package will
continue to use the existing procedure.
The Cygwin procedure will be documented in more detail later and will
include a list of packages that need to be installed.
v3.9.1
Fixed AVX2 version of anime algo.

View File

@@ -210,6 +210,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
@@ -344,9 +345,9 @@ const char* const algo_alias_map[][2] =
{ NULL, NULL }
};
// if arg is a valid alias for a known algo it is updated with the proper name.
// No validation of the algo or alias is done, It is the responsinility of the
// calling function to validate the algo after return.
// if arg is a valid alias for a known algo it is updated with the proper
// name. No validation of the algo or alias is done, It is the responsinility
// of the calling function to validate the algo after return.
void get_algo_alias( char** algo_or_alias )
{
int i;
@@ -361,3 +362,22 @@ void get_algo_alias( char** algo_or_alias )
#undef ALIAS
#undef PROPER
// only for parallel when there are lanes.
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr, int lane )
{
work_set_target_ratio( work, hash );
if ( submit_work( thr, work ) )
{
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr->id, lane );
return true;
}
else
applog( LOG_WARNING, "Failed to submit share." );
return false;
}

View File

@@ -2,8 +2,7 @@
#include <stdbool.h>
#include <stdint.h>
#include "miner.h"
#include "avxdefs.h"
#include "interleave.h"
#include "simd-utils.h"
/////////////////////////////
////
@@ -196,8 +195,9 @@ void four_way_not_tested();
int null_scanhash();
// The one and only, a callback for scanhash.
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr, int lane );
bool submit_work( struct thr_info *thr, const struct work *work_in );
// displays warning

View File

@@ -45,7 +45,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_blake256 256

View File

@@ -16,7 +16,7 @@
#if defined(__SSE4_2__)
#include "avxdefs.h"
#include "simd-utils.h"
#include <stddef.h>
#include <stdint.h>

View File

@@ -41,15 +41,18 @@ extern "C"{
#endif
#include <stddef.h>
#ifdef __AVX2__
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_bmw256 256
#define SPH_SIZE_bmw512 512
#if defined(__SSE2__)
// BMW-256 4 way 32
typedef struct {
__m128i buf[64];
__m128i H[16];
@@ -59,6 +62,60 @@ typedef struct {
typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif // __SSE2__
#if defined(__AVX2__)
// BMW-256 8 way 32
typedef struct {
__m256i buf[64];
__m256i H[16];
size_t ptr;
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_8way_small_context __attribute__ ((aligned (64)));
typedef bmw_8way_small_context bmw256_8way_context;
void bmw256_8way_init( bmw256_8way_context *ctx );
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#endif
#if defined(__SSE2__)
// BMW-512 2 way 64
typedef struct {
__m128i buf[16];
__m128i H[16];
size_t ptr;
uint64_t bit_count;
} bmw_2way_big_context __attribute__ ((aligned (64)));
typedef bmw_2way_big_context bmw512_2way_context;
void bmw512_2way_init( bmw512_2way_context *ctx );
void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
#endif // __SSE2__
#if defined(__AVX2__)
// BMW-512 4 way 64
typedef struct {
__m256i buf[16];
__m256i H[16];
@@ -68,14 +125,6 @@ typedef struct {
typedef bmw_4way_big_context bmw512_4way_context;
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void bmw512_4way_init(void *cc);
@@ -86,10 +135,10 @@ void bmw512_4way_close(void *cc, void *dst);
void bmw512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#endif // __AVX2__
#ifdef __cplusplus
}
#endif
#endif
#endif // BMW_HASH_H__

File diff suppressed because it is too large Load Diff

1109
algo/bmw/bmw512-hash-4way.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@
#if defined(__AVX2__)
#include <stdint.h>
#include "avxdefs.h"
#include "simd-utils.h"
// 2x128, 2 way parallel SSE2

View File

@@ -13,7 +13,7 @@
#include <stdbool.h>
#include <unistd.h>
#include <memory.h>
#include "avxdefs.h"
#include "simd-utils.h"
#include <stdio.h>
// The result of hashing 10 rounds of initial data which is params and

View File

@@ -11,6 +11,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
#define SPH_FUGUE_NOCOPY 1
static const sph_u32 IV224[] = {
SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),

View File

@@ -12,7 +12,7 @@
#include <memory.h>
#include "hash-groestl.h"
#include "miner.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifndef NO_AES_NI

View File

@@ -9,7 +9,7 @@
#include <memory.h>
#include "hash-groestl256.h"
#include "miner.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifndef NO_AES_NI

View File

@@ -40,7 +40,7 @@
#if defined (__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -69,7 +69,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_haval256_5 256

View File

@@ -156,7 +156,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
bool register_hodl_algo( algo_gate_t* gate )
{
#if defined(__AES__)
#if !defined(__AES__)
applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
return false;
#endif

View File

@@ -11,6 +11,10 @@
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"

View File

@@ -8,6 +8,10 @@
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "immintrin.h"

View File

@@ -44,7 +44,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_jh256 256

View File

@@ -44,7 +44,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#define SPH_SIZE_keccak256 256

View File

@@ -91,7 +91,7 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
/*
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
@@ -106,7 +106,7 @@ static const sph_u64 RC[] = {
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
*/
#define kekDECL_STATE \
sph_u64 keca00, keca01, keca02, keca03, keca04; \
sph_u64 keca10, keca11, keca12, keca13, keca14; \
@@ -756,6 +756,20 @@ static const sph_u64 RC[] = {
* tested faster saving space
*/
#define KECCAK_F_1600_ do { \
static const sph_u64 RC[] = { \
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \
}; \
int j; \
for (j = 0; j < 24; j += 4) { \
KF_ELT( 0, 1, RC[j + 0]); \
@@ -791,7 +805,7 @@ static const sph_u64 RC[] = {
/* load initial constants */
#define KEC_I
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
/*
unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
*/
@@ -799,6 +813,7 @@ static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0
/* load hash for loop */
#define KEC_U \
do { \
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
/*memcpy(hashbuf, hash, 64); */ \
memcpy(hash + 64, keczword, 8); \
} while (0);

View File

@@ -24,7 +24,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
0UL, 0UL, 0UL, 0xffffffffUL )

View File

@@ -24,7 +24,7 @@
#include <immintrin.h>
#include "algo/sha/sha3-defs.h"
#include "avxdefs.h"
#include "simd-utils.h"
/* The length of digests*/
#define DIGEST_BIT_LEN_224 224

View File

@@ -20,7 +20,7 @@
#include <string.h>
#include <emmintrin.h>
#include "avxdefs.h"
#include "simd-utils.h"
#include "luffa_for_sse2.h"
#define MULT2(a0,a1) do \

View File

@@ -44,10 +44,11 @@ void allium_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash32 );
mm256_reinterleave_4x64( vhash64, vhash32, 256 );
mm256_rintrlv_4x32_4x64( vhash64, vhash32, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
@@ -67,73 +68,64 @@ void allium_4way_hash( void *state, const void *input )
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
sizeof(hashState_groestl256) );
update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
}
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
allium_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
if ( fulltest( hash+(lane<<3), ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -70,7 +70,7 @@ void allium_hash(void *state, const void *input)
}
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
@@ -80,6 +80,7 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x3ffff;

View File

@@ -1,6 +1,43 @@
#include "lyra2-gate.h"
// huge pages
//
// Use MAP_PRIVATE instead
// In register algo:
// replace thread safe whole matrix with a char**
// alloc huge pages matrixsize * threads
// make pointers to each thread to each thread, creating an
// array[thread][matrix].
// Each thread can create its own matrix pointer:
// my_matrix = the matrix + ( thread_id * matrix_size )
//
// Compiler version check?
// Fallback?
//
// create a generic utility to map & unmap huge pages.
// ptr = malloc_huge( size );
// Yespower wrapper checks for 64 byte alignment, seems unnecessary as
// it should be aligned to the page boundary. It may be desireable to
// have the matrix size rounded up if necessary to something bigger
// than 64 byte, say 4 kbytes a small page size.
// Define some constants for indivual parameters and matrix size for
// each algo. Use the parameter constants where apropriate.
// Convert algos that don't yet do so to use dynamic alllocation.
// Alloc huge pages globally. If ok each thread will create a pointer to
// its chunk. If fail each thread will use use _mm_alloc for itself.
// BLOCK_LEN_BYTES is 768.
#define LYRA2REV3_NROWS 4
#define LYRA2REV3_NCOLS 4
/*
#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
(LYRA2REV3_NROWS)*8)
*/
#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)
__thread uint64_t* l2v3_wholeMatrix;
bool lyra2rev3_thread_init()

View File

@@ -5,7 +5,9 @@
#include <stdint.h>
#include "lyra2.h"
#if defined(__AVX2__)
//#if defined(__AVX2__)
#if defined(__SSE2__)
#define LYRA2REV3_4WAY
#endif
@@ -43,25 +45,25 @@ bool register_lyra2rev2_algo( algo_gate_t* gate );
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_4way_ctx();
#else
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_ctx();
#endif
/////////////////////////
#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
// #define LYRA2Z_8WAY
#define LYRA2Z_8WAY
#endif
@@ -71,21 +73,21 @@ bool init_lyra2rev2_ctx();
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
#elif defined(LYRA2Z_4WAY)
void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_4way_thread_init();
#else
void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_thread_init();
#endif
@@ -102,14 +104,14 @@ bool lyra2z_thread_init();
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2h_4way_thread_init();
#else
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2h_thread_init();
#endif
@@ -126,14 +128,14 @@ bool register_allium_algo( algo_gate_t* gate );
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_4way_ctx();
#else
void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_ctx();
#endif
@@ -146,7 +148,7 @@ bool register_phi2_algo( algo_gate_t* gate );
void phi2_hash( void *state, const void *input );
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_phi2_ctx();
#endif // LYRA2_GATE_H__

View File

@@ -236,7 +236,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
/*
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
@@ -566,7 +566,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
#if defined(__AVX2__)
memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
#elif defined(__SSE4_2__)
#elif defined(__SSE2__)
memset_zero_128( (__m128i*)wholeMatrix, i>>4 );
#else
memset( wholeMatrix, 0, i );

View File

@@ -36,66 +36,53 @@ void lyra2h_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
LYRA2Z( lyra2h_4way_matrix, state, 32, hash0, 32, hash0, 32,
16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+32, 32, hash1, 32, hash1,
32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+64, 32, hash2, 32, hash2,
32, 16, 16, 16 );
LYRA2Z( lyra2h_4way_matrix, state+96, 32, hash3, 32, hash3,
32, 16, 16, 16 );
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep= vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
lyra2h_4way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( &edata[19], n );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2h_4way_hash( hash, vdata );
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -36,7 +36,7 @@ void lyra2h_hash( void *state, const void *input )
}
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -45,6 +45,7 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
ptarget[7] = 0x0000ff;

View File

@@ -6,7 +6,7 @@
#include "algo/keccak/sph_keccak.h"
#include "lyra2.h"
#include "algo-gate-api.h"
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif
@@ -81,8 +81,8 @@ void lyra2re_hash(void *state, const void *input)
memcpy(state, hashA, 32);
}
int scanhash_lyra2re(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -91,6 +91,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );

View File

@@ -42,10 +42,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input + (64<<2), 16 );
blake256_4way_close( &ctx.blake, vhash );
mm256_reinterleave_4x64( vhash64, vhash, 256 );
mm256_rintrlv_4x32_4x64( vhash64, vhash, 256 );
keccak256_4way( &ctx.keccak, vhash64, 32 );
keccak256_4way_close( &ctx.keccak, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
cubehashInit( &ctx.cube, 256, 16, 32 );
@@ -60,10 +62,12 @@ void lyra2rev2_4way_hash( void *state, const void *input )
LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
mm256_intrlv_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
skein256_4way( &ctx.skein, vhash64, 32 );
skein256_4way_close( &ctx.skein, vhash64 );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
@@ -74,60 +78,55 @@ void lyra2rev2_4way_hash( void *state, const void *input )
cubehashInit( &ctx.cube, 256, 16, 32 );
cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, vhash );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
blake256_4way_init( &l2v2_4way_ctx.blake );
blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2rev2_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -73,7 +73,7 @@ void lyra2rev2_hash( void *state, const void *input )
}
int scanhash_lyra2rev2(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -82,6 +82,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;

View File

@@ -35,7 +35,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
blake256_4way( &ctx.blake, input, 80 );
blake256_4way_close( &ctx.blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -55,10 +55,9 @@ void lyra2rev3_4way_hash( void *state, const void *input )
LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
bmw256_4way( &ctx.bmw, vhash, 32 );
bmw256_4way_close( &ctx.bmw, state );
}
int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -66,7 +65,6 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
@@ -74,22 +72,13 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
int num_found = 0;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
// Need big endian data
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
do
{
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -100,22 +89,14 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
for ( int lane = 0; lane < 4; lane++ ) if ( hash7[lane] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -36,66 +36,51 @@ void lyra2z_4way_hash( void *state, const void *input )
blake256_4way( &ctx_blake, input + (64*4), 16 );
blake256_4way_close( &ctx_blake, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
LYRA2Z( lyra2z_4way_matrix, state , 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+32, 32, hash1, 32, hash1, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+64, 32, hash2, 32, hash2, 32, 8, 8, 8 );
LYRA2Z( lyra2z_4way_matrix, state+96, 32, hash3, 32, hash3, 32, 8, 8, 8 );
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
lyra2z_4way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2z_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif
@@ -134,8 +119,8 @@ void lyra2z_8way_hash( void *state, const void *input )
blake256_8way( &ctx_blake, input + (64*8), 16 );
blake256_8way_close( &ctx_blake, vhash );
mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
mm256_dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -150,64 +135,49 @@ void lyra2z_8way_hash( void *state, const void *input )
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash1, 32 );
memcpy( state+160, hash2, 32 );
memcpy( state+192, hash3, 32 );
memcpy( state+224, hash1, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) edata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_8x32( vdata, pdata );
lyra2z_8way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( noncep+4, n+4 );
be32enc( noncep+5, n+5 );
be32enc( noncep+6, n+6 );
be32enc( noncep+7, n+7 );
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}

View File

@@ -3,7 +3,7 @@
#include "lyra2-gate.h"
#include "lyra2.h"
#include "algo/blake/sph_blake.h"
#include "avxdefs.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z_matrix;
@@ -44,7 +44,7 @@ void lyra2z_hash( void *state, const void *input )
}
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -53,6 +53,7 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
ptarget[7] = 0x0000ff;

View File

@@ -1,7 +1,7 @@
#include <memory.h>
#include "algo-gate-api.h"
#include "lyra2.h"
#include "avxdefs.h"
#include "simd-utils.h"
__thread uint64_t* lyra2z330_wholeMatrix;
@@ -16,39 +16,46 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
}
int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if (opt_benchmark)
ptarget[7] = 0x0000ff;
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
if (opt_benchmark)
ptarget[7] = 0x0000ff;
do {
be32enc(&endiandata[19], nonce);
lyra2z330_hash( hash, endiandata, work->height );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
do
{
be32enc(&endiandata[19], nonce);
lyra2z330_hash( hash, endiandata, work->height );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) && !opt_benchmark )
{
work_set_target_ratio(work, hash);
pdata[19] = nonce;
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d",
accepted_share_count + rejected_share_count + 1,
mythr->id );
else
applog( LOG_WARNING, "Failed to submit share." );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void lyra2z330_set_target( struct work* work, double job_diff )

View File

@@ -50,11 +50,11 @@ void phi2_hash(void *state, const void *input)
unsigned char _ALIGN(128) hashA[64];
unsigned char _ALIGN(128) hashB[64];
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
phi2_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
phi2_has_roots ? 144 : 80 );
LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 );
LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
@@ -63,17 +63,17 @@ void phi2_hash(void *state, const void *input)
sph_jh512_close( &ctx.jh, (void*)hash );
if ( hash[0] & 1 )
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
{
sph_gost512( &ctx.gost, (const void*)hash, 64 );
sph_gost512_close( &ctx.gost, (void*)hash );
}
else
{
else
{
#if defined(__AES__)
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo1, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
update_final_echo ( &ctx.echo2, (BitSequence *)hash,
(const BitSequence *)hash, 512 );
#else
sph_echo512( &ctx.echo1, (const void*)hash, 64 );
sph_echo512_close( &ctx.echo1, (void*)hash );
@@ -92,42 +92,50 @@ void phi2_hash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_phi2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[36];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[36];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
phi2_has_roots = false;
for ( int i=0; i < 36; i++ )
{
be32enc(&endiandata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
phi2_has_roots = false;
for (int i=0; i < 36; i++) {
be32enc(&endiandata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
do {
be32enc( &endiandata[19], n );
phi2_hash( hash, endiandata );
do {
be32enc(&endiandata[19], n);
phi2_hash(hash, endiandata);
if (hash[7] < Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
{
pdata[19] = n;
work_set_target_ratio( work, hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
accepted_share_count + rejected_share_count + 1,
thr_id );
else
applog( LOG_WARNING, "Failed to submit share." );
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 1;
}
n++;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
} while ( n < max_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -51,7 +51,7 @@ inline void initState( uint64_t State[/*16*/] )
state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
blake2b_IV[5], blake2b_IV[4] );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
@@ -137,7 +137,7 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
//Squeezes remaining bytes
memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
const int len_m128i = len / 16;
const int fullBlocks = len_m128i / BLOCK_LEN_M128I;
@@ -205,7 +205,7 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
@@ -273,7 +273,7 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)In;
@@ -355,7 +355,7 @@ inline void reducedSqueezeRow0( uint64_t* State, uint64_t* rowOut,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
@@ -494,7 +494,7 @@ inline void reducedDuplexRow1( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i state0 = _mm_load_si128( state );
@@ -694,7 +694,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined (__SSE4_2__)
#elif defined (__SSE2__)
__m128i* in = (__m128i*)rowIn;
__m128i* inout = (__m128i*)rowInOut;
@@ -713,9 +713,9 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
__m128i* state = (__m128i*)State;
// For the last round in this function not optimized for AVX
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
// uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
// uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
// uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
for ( i = 0; i < nCols; i++ )
{
@@ -750,6 +750,28 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
out[4] = _mm_xor_si128( state[4], in[4] );
out[5] = _mm_xor_si128( state[5], in[5] );
__m128i t0, t1;
t0 = _mm_srli_si128( state[0], 8 );
t1 = _mm_srli_si128( state[1], 8 );
inout[0] = _mm_xor_si128( inout[0],
_mm_or_si128( _mm_slli_si128( state[0], 8 ),
_mm_srli_si128( state[5], 8 ) ) );
inout[1] = _mm_xor_si128( inout[1],
_mm_or_si128( _mm_slli_si128( state[1], 8 ), t0 ) );
t0 = _mm_srli_si128( state[2], 8 );
inout[2] = _mm_xor_si128( inout[2],
_mm_or_si128( _mm_slli_si128( state[2], 8 ), t1 ) );
t1 = _mm_srli_si128( state[3], 8 );
inout[3] = _mm_xor_si128( inout[3],
_mm_or_si128( _mm_slli_si128( state[3], 8 ), t0 ) );
t0 = _mm_srli_si128( state[4], 8 );
inout[4] = _mm_xor_si128( inout[4],
_mm_or_si128( _mm_slli_si128( state[4], 8 ), t1 ) );
inout[5] = _mm_xor_si128( inout[5],
_mm_or_si128( _mm_slli_si128( state[5], 8 ), t0 ) );
/*
ptrWordInOut[0] ^= State[11];
ptrWordInOut[1] ^= State[0];
ptrWordInOut[2] ^= State[1];
@@ -768,7 +790,7 @@ inline void reducedDuplexRowSetup( uint64_t *State, uint64_t *rowIn,
ptrWordIn += BLOCK_LEN_INT64;
//Output: goes to previous column
ptrWordOut -= BLOCK_LEN_INT64;
*/
inout += BLOCK_LEN_M128I;
in += BLOCK_LEN_M128I;
out -= BLOCK_LEN_M128I;
@@ -930,7 +952,7 @@ inline void reducedDuplexRow( uint64_t *State, uint64_t *rowIn,
_mm256_store_si256( (__m256i*)State + 2, state2 );
_mm256_store_si256( (__m256i*)State + 3, state3 );
#elif defined(__SSE4_2__)
#elif defined (__SSE2__)
__m128i* state = (__m128i*)State;
__m128i* in = (__m128i*)rowIn;

View File

@@ -23,7 +23,7 @@
#define SPONGE_H_
#include <stdint.h>
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
@@ -59,7 +59,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
// returns void, updates all args
#define G_4X64(a,b,c,d) \
a = _mm256_add_epi64( a, b ); \
d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
c = _mm256_add_epi64( c, d ); \
b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
a = _mm256_add_epi64( a, b ); \
@@ -108,7 +108,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol1x64_256( s2, s3 ); \
mm128_ror1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_rol1x64_256( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
@@ -132,7 +132,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
#endif // AVX2
#endif // AVX2 else SSE2
// Scalar
//Blake2b's G function

View File

@@ -6,7 +6,7 @@
#if defined(__SSE4_2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct
{

View File

@@ -30,7 +30,7 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined(__SSE4_2__)
#if defined(__SSE2__)
#include <stddef.h>
#include <string.h>
@@ -716,4 +716,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
}
#endif // __AVX2__
#endif // __SSE4_2__
#endif // __SSE2__

View File

@@ -42,9 +42,10 @@
#include <stddef.h>
#include "sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#if defined(__SSE4_2__)
#if defined(__SSE2__)
//#if defined(__SSE4_2__)
//#define SPH_SIZE_sha256 256
@@ -60,6 +61,26 @@ void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
/*
// SHA-256 7 way hybrid
// Combines SSE, MMX and scalar data to do 8 + 2 + 1 parallel.
typedef struct {
__m128i bufx[64>>2];
__m128i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_7way_context;
void sha256_7way_init( sha256_7way_context *ctx );
void sha256_7way( sha256_7way_context *ctx, const void *datax,
void *datay, void *dataz, size_t len );
void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
void *dstz );
*/
#if defined (__AVX2__)
// SHA-256 8 way
@@ -88,6 +109,24 @@ void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
#endif
#endif
#endif
// SHA-256 11 way hybrid
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
typedef struct {
__m256i bufx[64>>2];
__m256i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_11way_context;
void sha256_11way_init( sha256_11way_context *ctx );
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len );
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
void *dstz );
#endif // __AVX2__
#endif // __SSE2__
#endif // SHA256_4WAY_H__

View File

@@ -0,0 +1,538 @@
#if 0
#include <stddef.h>
#include <string.h>
#include "sha2-hash-4way.h"
#if defined(__AVX2__)
// naming convention for variables and macros
// VARx: AVX2 8 way 32 bit
// VARy: MMX 2 way 32 bit
// VARz: scalar integer 32 bit
static const uint32_t H256[8] =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
static const uint32_t K256[64] =
{
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define CHx(X, Y, Z) \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define CHy(X, Y, Z) \
_mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )
#define CHz(X, Y, Z) ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )
#define MAJx(X, Y, Z) \
_mm256_or_si256( _mm256_and_si256( X, Y ), \
_mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
#define MAJy(X, Y, Z) \
_mm_or_si64( _mm_and_si64( X, Y ), \
_mm_and_si64( _mm_or_si64( X, Y ), Z ) )
#define MAJz(X, Y, Z) ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )
#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,2), mm256_ror_32(x,13) ), _mm256_srli_epi32(x,22) )
#define BSG2_0y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,2), mm64_ror_32(x,13) ), _mm_srli_pi32(x,22) )
#define BSG2_0z(x) ( u32_ror_32(x,2) ^ u32_ror_32(x,13) ^ ((x)>>22) )
#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,6), mm256_ror_32(x,11) ), _mm256_srli_epi32(x,25) )
#define BSG2_1y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,6), mm64_ror_32(x,11) ), _mm_srli_pi32(x,25) )
#define BSG2_1z(x) ( u32_ror_32(x,6) ^ u32_ror_32(x,11) ^ ((x)>>25) )
#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) )
#define SSG2_0y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )
#define SSG2_0z(x) (( u32_ror_32(x,7) ^ u32_ror_32(x,18) ) ^ ((x)>>3) )
#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( \
mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )
#define SSG2_1y(x) \
_mm_xor_si64( _mm_xor_si64( \
mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )
#define SSG2_1z(x) ( u32_ror_32(x,17) ^ u32_ror_32(x,19) ^ ((x)>>10) )
#define SHA2x_MEXP( a, b, c, d ) \
_mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )
#define SHA2y_MEXP( a, b, c, d ) \
_mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )
#define SHA2z_MEXP( a, b, c, d ) \
( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )
#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
do { \
__m256i T1x, T2x; \
__m64 T1y, T2y; \
uint32_t T1z, T2z; \
T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
_mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
_mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
_mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
_mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
Dx = _mm256_add_epi32( Dx, T1x ); \
Dy = _mm_add_pi32( Dy, T1y ); \
Dz = Dz + T1z; \
Hx = _mm256_add_epi32( T1x, T2x ); \
Hy = _mm_add_pi32( T1y, T2y ); \
Hz = T1z + T2z; \
} while (0)
void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
uint32_t *inz, uint32_t rz[8] )
{
__m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
__m256i Wx[16];
__m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
__m64 Wy[16];
uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
uint32_t Wz[16];
Wx[ 0] = mm256_bswap_32( inx[ 0] );
Wy[ 0] = mm64_bswap_32( iny[ 0] );
Wz[ 0] = bswap_32( inz[ 0] );
Wx[ 1] = mm256_bswap_32( inx[ 1] );
Wy[ 1] = mm64_bswap_32( iny[ 1] );
Wz[ 1] = bswap_32( inz[ 1] );
Wx[ 2] = mm256_bswap_32( inx[ 2] );
Wy[ 2] = mm64_bswap_32( iny[ 2] );
Wz[ 2] = bswap_32( inz[ 2] );
Wx[ 3] = mm256_bswap_32( inx[ 3] );
Wy[ 3] = mm64_bswap_32( iny[ 3] );
Wz[ 3] = bswap_32( inz[ 3] );
Wx[ 4] = mm256_bswap_32( inx[ 4] );
Wy[ 4] = mm64_bswap_32( iny[ 4] );
Wz[ 4] = bswap_32( inz[ 4] );
Wx[ 5] = mm256_bswap_32( inx[ 5] );
Wy[ 5] = mm64_bswap_32( iny[ 5] );
Wz[ 5] = bswap_32( inz[ 5] );
Wx[ 6] = mm256_bswap_32( inx[ 6] );
Wy[ 6] = mm64_bswap_32( iny[ 6] );
Wz[ 6] = bswap_32( inz[ 6] );
Wx[ 7] = mm256_bswap_32( inx[ 7] );
Wy[ 7] = mm64_bswap_32( iny[ 7] );
Wz[ 7] = bswap_32( inz[ 7] );
Wx[ 8] = mm256_bswap_32( inx[ 8] );
Wy[ 8] = mm64_bswap_32( iny[ 8] );
Wz[ 8] = bswap_32( inz[ 8] );
Wx[ 9] = mm256_bswap_32( inx[ 9] );
Wy[ 9] = mm64_bswap_32( iny[ 9] );
Wz[ 9] = bswap_32( inz[ 9] );
Wx[10] = mm256_bswap_32( inx[10] );
Wy[10] = mm64_bswap_32( iny[10] );
Wz[10] = bswap_32( inz[10] );
Wx[11] = mm256_bswap_32( inx[11] );
Wy[11] = mm64_bswap_32( iny[11] );
Wz[11] = bswap_32( inz[11] );
Wx[12] = mm256_bswap_32( inx[12] );
Wy[12] = mm64_bswap_32( iny[12] );
Wz[12] = bswap_32( inz[12] );
Wx[13] = mm256_bswap_32( inx[13] );
Wy[13] = mm64_bswap_32( iny[13] );
Wz[13] = bswap_32( inz[13] );
Wx[14] = mm256_bswap_32( inx[14] );
Wy[14] = mm64_bswap_32( iny[14] );
Wz[14] = bswap_32( inz[14] );
Wx[15] = mm256_bswap_32( inx[15] );
Wy[15] = mm64_bswap_32( iny[15] );
Wz[15] = bswap_32( inz[15] );
Ax = rx[0]; Ay = ry[0]; Az = rz[0];
Bx = rx[1]; By = ry[1]; Bz = rz[1];
Cx = rx[2]; Cy = ry[2]; Cz = rz[2];
Dx = rx[3]; Dy = ry[3]; Dz = rz[3];
Ex = rx[4]; Ey = ry[4]; Ez = rz[4];
Fx = rx[5]; Fy = ry[5]; Fz = rz[5];
Gx = rx[6]; Gy = ry[6]; Gz = rz[6];
Hx = rx[7]; Hy = ry[7]; Hz = rz[7];
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, 0 );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, 0 );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, 0 );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, 0 );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, 0 );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, 0 );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, 0 );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, 0 );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, 0 );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, 0 );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, 0 );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, 0 );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, 0 );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, 0 );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, 0 );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, 0 );
for ( int j = 16; j < 64; j += 16 )
{
Wx[ 0] = SHA2x_MEXP( 14, 9, 1, 0 );
Wy[ 0] = SHA2y_MEXP( 14, 9, 1, 0 );
Wz[ 0] = SHA2z_MEXP( 14, 9, 1, 0 );
Wx[ 1] = SHA2x_MEXP( 15, 10, 2, 1 );
Wy[ 1] = SHA2y_MEXP( 15, 10, 2, 1 );
Wz[ 1] = SHA2z_MEXP( 15, 10, 2, 1 );
Wx[ 2] = SHA2x_MEXP( 0, 11, 3, 2 );
Wy[ 2] = SHA2y_MEXP( 0, 11, 3, 2 );
Wz[ 2] = SHA2z_MEXP( 0, 11, 3, 2 );
Wx[ 3] = SHA2x_MEXP( 1, 12, 4, 3 );
Wy[ 3] = SHA2y_MEXP( 1, 12, 4, 3 );
Wz[ 3] = SHA2z_MEXP( 1, 12, 4, 3 );
Wx[ 4] = SHA2x_MEXP( 2, 13, 5, 4 );
Wy[ 4] = SHA2y_MEXP( 2, 13, 5, 4 );
Wz[ 4] = SHA2z_MEXP( 2, 13, 5, 4 );
Wx[ 5] = SHA2x_MEXP( 3, 14, 6, 5 );
Wy[ 5] = SHA2y_MEXP( 3, 14, 6, 5 );
Wz[ 5] = SHA2z_MEXP( 3, 14, 6, 5 );
Wx[ 6] = SHA2x_MEXP( 4, 15, 7, 6 );
Wy[ 6] = SHA2y_MEXP( 4, 15, 7, 6 );
Wz[ 6] = SHA2z_MEXP( 4, 15, 7, 6 );
Wx[ 7] = SHA2x_MEXP( 5, 0, 8, 7);
Wy[ 7] = SHA2y_MEXP( 5, 0, 8, 7);
Wz[ 7] = SHA2z_MEXP( 5, 0, 8, 7);
Wx[ 8] = SHA2x_MEXP( 6, 1, 9, 8);
Wy[ 8] = SHA2y_MEXP( 6, 1, 9, 8);
Wz[ 8] = SHA2z_MEXP( 6, 1, 9, 8);
Wx[ 9] = SHA2x_MEXP( 7, 2, 10, 9 );
Wy[ 9] = SHA2y_MEXP( 7, 2, 10, 9);
Wz[ 9] = SHA2z_MEXP( 7, 2, 10, 9);
Wx[10] = SHA2x_MEXP( 8, 3, 11, 10 );
Wy[10] = SHA2y_MEXP( 8, 3, 11, 10);
Wz[10] = SHA2z_MEXP( 8, 3, 11, 10);
Wx[11] = SHA2x_MEXP( 9, 4, 12, 11);
Wy[11] = SHA2y_MEXP( 9, 4, 12, 11);
Wz[11] = SHA2z_MEXP( 9, 4, 12, 11 );
Wx[12] = SHA2x_MEXP( 10, 5, 13, 12 );
Wy[12] = SHA2y_MEXP( 10, 5, 13, 12 );
Wz[12] = SHA2z_MEXP( 10, 5, 13, 12 );
Wx[13] = SHA2x_MEXP( 11, 6, 14, 13 );
Wy[13] = SHA2y_MEXP( 11, 6, 14, 13 );
Wz[13] = SHA2z_MEXP( 11, 6, 14, 13 );
Wx[14] = SHA2x_MEXP( 12, 7, 15, 14 );
Wy[14] = SHA2y_MEXP( 12, 7, 15, 14 );
Wz[14] = SHA2z_MEXP( 12, 7, 15, 14 );
Wx[15] = SHA2x_MEXP( 13, 8, 0, 15 );
Wy[15] = SHA2y_MEXP( 13, 8, 0, 15 );
Wz[15] = SHA2z_MEXP( 13, 8, 0, 15 );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 0, j );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 1, j );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 2, j );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 3, j );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 4, j );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 5, j );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 6, j );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 7, j );
SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx,
Ay, By, Cy, Dy, Ey, Fy, Gy, Hy,
Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, 8, j );
SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx,
Hy, Ay, By, Cy, Dy, Ey, Fy, Gy,
Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz, 9, j );
SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx,
Gy, Hy, Ay, By, Cy, Dy, Ey, Fy,
Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j );
SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex,
Fy, Gy, Hy, Ay, By, Cy, Dy, Ey,
Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j );
SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx,
Ey, Fy, Gy, Hy, Ay, By, Cy, Dy,
Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j );
SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx,
Dy, Ey, Fy, Gy, Hy, Ay, By, Cy,
Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j );
SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx,
Cy, Dy, Ey, Fy, Gy, Hy, Ay, By,
Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j );
SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax,
By, Cy, Dy, Ey, Fy, Gy, Hy, Ay,
Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j );
}
rx[0] = _mm256_add_epi32( rx[0], Ax );
ry[0] = _mm_add_pi32( ry[0], Ay );
rz[0] = rz[0]+ Az;
rx[1] = _mm256_add_epi32( rx[1], Bx );
ry[1] = _mm_add_pi32( ry[1], By );
rz[1] = rz[1]+ Bz;
rx[2] = _mm256_add_epi32( rx[2], Cx );
ry[2] = _mm_add_pi32( ry[2], Cy );
rz[3] = rz[3]+ Dz;
rx[4] = _mm256_add_epi32( rx[4], Ex );
ry[4] = _mm_add_pi32( ry[4], Ey );
rz[4] = rz[4]+ Ez;
rx[5] = _mm256_add_epi32( rx[5], Fx );
ry[5] = _mm_add_pi32( ry[5], Fy );
rz[5] = rz[5]+ Fz;
rx[6] = _mm256_add_epi32( rx[6], Gx );
ry[6] = _mm_add_pi32( ry[6], Gy );
rz[6] = rz[6]+ Gz;
rx[7] = _mm256_add_epi32( rx[7], Hx );
ry[7] = _mm_add_pi32( ry[7], Hy );
rz[7] = rz[7]+ Hz;
}
void sha256_11way_init( sha256_11way_context *ctx )
{
ctx->count_high = ctx->count_low = 0;
ctx->valx[0] = _mm256_set1_epi32( H256[0] );
ctx->valy[0] = _mm_set1_pi32( H256[0] );
ctx->valx[1] = _mm256_set1_epi32( H256[0] );
ctx->valy[1] = _mm_set1_pi32( H256[0] );
ctx->valx[2] = _mm256_set1_epi32( H256[0] );
ctx->valy[2] = _mm_set1_pi32( H256[0] );
ctx->valx[3] = _mm256_set1_epi32( H256[0] );
ctx->valy[3] = _mm_set1_pi32( H256[0] );
ctx->valx[4] = _mm256_set1_epi32( H256[0] );
ctx->valy[4] = _mm_set1_pi32( H256[0] );
ctx->valx[5] = _mm256_set1_epi32( H256[0] );
ctx->valy[5] = _mm_set1_pi32( H256[0] );
ctx->valx[6] = _mm256_set1_epi32( H256[0] );
ctx->valy[6] = _mm_set1_pi32( H256[0] );
ctx->valx[7] = _mm256_set1_epi32( H256[0] );
ctx->valy[7] = _mm_set1_pi32( H256[0] );
memcpy( ctx->valz, H256, 32 );
}
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len )
{
__m256i *vdatax = (__m256i*) datax;
__m64 *vdatay = (__m64*) datay;
uint32_t *idataz = (uint32_t*)dataz;
size_t ptr;
const int buf_size = 64;
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
while ( len > 0 )
{
size_t clen;
uint32_t clow, clow2;
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_256( ctx->bufx + (ptr>>2), vdatax + (ptr>>2), clen>>2 );
memcpy_m64( ctx->bufy + (ptr>>2), vdatay + (ptr>>2), clen>>2 );
memcpy ( ctx->bufz + ptr, idataz + ptr, clen );
ptr += clen;
len -= clen;
if ( ptr == buf_size )
{
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
ptr = 0;
}
clow = ctx->count_low;
clow2 = clow + clen;
ctx->count_low = clow2;
if ( clow2 < clow )
ctx->count_high++;
}
}
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
void *dstz)
{
unsigned ptr, u;
uint32_t low, high;
const int buf_size = 64;
const int pad = buf_size - 8;
ptr = (unsigned)ctx->count_low & (buf_size - 1U);
ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
ctx->bufz[ ptr>>2 ] = 0x80;
ptr += 4;
if ( ptr > pad )
{
memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
memset_zero_m64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0, (buf_size - ptr) >> 2 );
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
memset_zero_256( ctx->bufx, pad >> 2 );
memset_zero_m64( ctx->bufy, pad >> 2 );
memset( ctx->bufz, 0, pad >> 2 );
}
else
{
memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
memset_zero_m64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
memset( ctx->bufz + (ptr>>2), 0, (pad - ptr) >> 2 );
}
low = ctx->count_low;
high = (ctx->count_high << 3) | (low >> 29);
low = low << 3;
ctx->bufx[ pad >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( high ) );
ctx->bufy[ pad >> 2 ] =
mm64_bswap_32( _mm_set1_pi32( high ) );
ctx->bufz[ pad >> 2 ] =
bswap_32( high );
ctx->bufx[ ( pad+4 ) >> 2 ] =
mm256_bswap_32( _mm256_set1_epi32( low ) );
ctx->bufy[ ( pad+4 ) >> 2 ] =
mm64_bswap_32( _mm_set1_pi32( low ) );
ctx->bufz[ ( pad+4 ) >> 2 ] =
bswap_32( low );
sha256_11way_round( ctx->bufx, ctx->valx,
ctx->bufy, ctx->valy,
ctx->bufz, ctx->valz );
for ( u = 0; u < 8; u ++ )
{
casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
casti_m64 ( dsty, u ) = mm64_bswap_32( ctx->valy[u] );
((uint32_t*)dstz)[u] = bswap_32( ctx->valz[u] );
}
}
#endif
#endif // 0

188
algo/sha/sha256q-4way.c Normal file
View File

@@ -0,0 +1,188 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha2-hash-4way.h"
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
void sha256q_8way_hash( void* output, const void* input )
{
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
sha256_8way_context ctx;
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
sha256_8way( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
// Need big endian data
mm256_bswap_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
sha256q_8way_hash( hash, vdata );
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
void sha256q_4way_hash( void* output, const void* input )
{
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx;
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
sha256_4way( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
mm128_bswap_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256q_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif

113
algo/sha/sha256q.c Normal file
View File

@@ -0,0 +1,113 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <openssl/sha.h>
static __thread SHA256_CTX sha256q_ctx __attribute__ ((aligned (64)));
void sha256q_midstate( const void* input )
{
SHA256_Init( &sha256q_ctx );
SHA256_Update( &sha256q_ctx, input, 64 );
}
void sha256q_hash( void* output, const void* input )
{
uint32_t _ALIGN(64) hash[16];
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
SHA256_CTX ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
SHA256_Update( &ctx, input + midlen, tail );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
SHA256_Init( &ctx );
SHA256_Update( &ctx, hash, 32 );
SHA256_Final( (unsigned char*)hash, &ctx );
memcpy( output, hash, 32 );
}
int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
#ifdef _MSC_VER
uint32_t __declspec(align(32)) hash64[8];
#else
uint32_t hash64[8] __attribute__((aligned(32)));
#endif
uint32_t endiandata[32];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = {
0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000
};
uint32_t masks[] = {
0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0
};
// we need bigendian data...
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
sha256q_midstate( endiandata );
for ( int m = 0; m < 6; m++ )
{
if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do {
pdata[19] = ++n;
be32enc(&endiandata[19], n);
sha256q_hash( hash64, endiandata );
if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
{
work_set_target_ratio( work, hash64 );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
accepted_share_count + rejected_share_count + 1,
thr_id );
else
applog( LOG_WARNING, "Failed to submit share." );
*hashes_done = n - first_nonce + 1;
}
} while ( n < max_nonce && !work_restart[thr_id].restart );
break;
}
}
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -5,6 +5,137 @@
#include <stdio.h>
#include "sha2-hash-4way.h"
#if defined(SHA256T_11WAY)
static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
const void *inpy, const void*inpz )
{
uint32_t hashx[8*8] __attribute__ ((aligned (64)));
uint32_t hashy[8*2] __attribute__ ((aligned (64)));
uint32_t hashz[8] __attribute__ ((aligned (64)));
sha256_11way_context ctx;
const void *inpx64 = inpx+(64<<3);
const void *inpy64 = inpy+(64<<1);
const void *inpz64 = inpz+ 64;
memcpy( &ctx, &sha256_ctx11, sizeof ctx );
sha256_11way_update( &ctx, inpx64, inpy64, inpz64, 16 );
sha256_11way_close( &ctx, hashx, hashy, hashz );
sha256_11way_init( &ctx );
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
sha256_11way_close( &ctx, hashx, hashy, hashz );
sha256_11way_init( &ctx );
sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
sha256_11way_close( &ctx, outx, outy, outz );
}
int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t datax[20*8] __attribute__ ((aligned (64)));
uint32_t datay[20*2] __attribute__ ((aligned (32)));
uint32_t dataz[20] __attribute__ ((aligned (32)));
uint32_t hashx[8*8] __attribute__ ((aligned (32)));
uint32_t hashy[8*2] __attribute__ ((aligned (32)));
uint32_t hashz[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
__m256i *noncex = (__m256i*) datax + 19;
__m64 *noncey = (__m64*) datay + 19;
uint32_t *noncez = (uint32_t*)dataz + 19;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
int i;
const uint64_t htmax[] = { 0,
0xF,
0xFF,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
// Use dataz (scalar) to stage bswapped data for the vectors.
casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_intrlv_8x32( datax, dataz, dataz, dataz, dataz,
dataz, dataz, dataz, dataz, 640 );
mm64_interleave_2x32( datay, dataz, dataz, 640 );
sha256_11way_init( &sha256_ctx11 );
sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncex = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
*noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
*noncez = bswap_32( n+10 );
pdata[19] = n;
sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );
if ( opt_benchmark ) { n += 11; continue; }
hash7 = &(hashx[7<<3]);
for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
{
// deinterleave hash for lane
mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + i;
submit_solution( work, lane_hash, mythr, i );
}
}
hash7 = &(hashy[7<<1]);
for( i = 0; i < 2; i++ ) if ( !(hash7[ 0] & mask ) )
{
mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + 8 + i;
submit_solution( work, lane_hash, mythr, i+8 );
}
}
if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
{
pdata[19] = n+10;
submit_solution( work, hashz, mythr, 10 );
}
n += 11;
} while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#endif
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
@@ -25,15 +156,15 @@ void sha256t_8way_hash( void* output, const void* input )
sha256_8way_init( &ctx );
sha256_8way( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
}
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -56,12 +187,7 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
0 };
// Need big endian data
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_8x32( vdata, pdata );
sha256_8way_init( &sha256_ctx8 );
sha256_8way( &sha256_ctx8, vdata, 64 );
@@ -70,45 +196,32 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
*noncev = mm256_bswap_32( _mm256_set_epi32(
n+7,n+6,n+5,n+4,n+3,n+2,n+1,n ) );
pdata[19] = n;
sha256t_8way_hash( hash, vdata );
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(SHA256T_4WAY)
#endif
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
@@ -128,7 +241,6 @@ void sha256t_4way_hash( void* output, const void* input )
sha256_4way_init( &ctx );
sha256_4way( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
}
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -136,9 +248,8 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -153,20 +264,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFF,
0xFFFF,
0x10000000 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
const uint32_t masks[] = { 0xFFFFFFFF,
0xFFFFFFF0,
0xFFFFFF00,
0xFFFFF000,
0xFFFF0000,
0 };
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
mm128_bswap_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way( &sha256_ctx4, vdata, 64 );
@@ -175,7 +280,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t mask = masks[m];
do {
*noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
pdata[19] = n;
pdata[19] = n;
sha256t_4way_hash( hash, vdata );
@@ -183,26 +288,16 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( !( hash7[ lane ] & mask ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -3,15 +3,15 @@
bool register_sha256t_algo( algo_gate_t* gate )
{
#if defined(SHA256T_8WAY)
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash;
#elif defined(SHA256T_4WAY)
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_4way;
gate->hash = (void*)&sha256t_4way_hash;
#else
gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
#endif
@@ -19,3 +19,19 @@ bool register_sha256t_algo( algo_gate_t* gate )
return true;
}
bool register_sha256q_algo( algo_gate_t* gate )
{
#if defined(SHA256T_4WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q_4way;
gate->hash = (void*)&sha256q_4way_hash;
#else
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;
#endif
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

View File

@@ -6,7 +6,7 @@
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define SHA256T_4WAY
#endif
#if defined(__AVX2__)
@@ -14,26 +14,35 @@
#endif
#endif
bool register_blake2s_algo( algo_gate_t* gate );
bool register_sha256t_algo( algo_gate_t* gate );
bool register_sha256q_algo( algo_gate_t* gate );
#if defined(SHA256T_8WAY)
void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_8way_hash( void *output, const void *input );
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#elif defined (SHA256T_4WAY)
#if defined(SHA256T_4WAY)
void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha256q_4way_hash( void *output, const void *input );
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha256t_hash( void *output, const void *input );
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha256q_hash( void *output, const void *input );
int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -5,8 +5,6 @@
#include <stdio.h>
#include <openssl/sha.h>
#if !defined(SHA256T_4WAY)
static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
void sha256t_midstate( const void* input )
@@ -72,8 +70,11 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
};
// we need bigendian data...
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
sha256t_midstate( endiandata );
@@ -89,7 +90,13 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
return true;
work_set_target_ratio( work, hash64 );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
accepted_share_count + rejected_share_count + 1,
thr_id );
else
applog( LOG_WARNING, "Failed to submit share." );
}
} while ( n < max_nonce && !work_restart[thr_id].restart );
break;
@@ -100,4 +107,3 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
#endif

View File

@@ -40,7 +40,7 @@
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -346,7 +346,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen >> 1;
len -= (clen >> 1);
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
@@ -365,16 +365,8 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
}
uint32_t vp = ptr>>5;
// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
casti_m256i( buf, vp ) = m256_zero;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles a vector.
// Count is misaligned to 16 bits and straddles 2 vectors.
// Use u32 overlay to stage then u16 to load buf.
union
{
@@ -387,6 +379,18 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
if ( vp == 0 ) // empty buf, xevan.
{
casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
memset_zero_256( (__m256i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else // half full buf, everyone else.
{
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
}
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(

View File

@@ -3,7 +3,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct {
unsigned char buf[128<<1];

View File

@@ -36,7 +36,7 @@
#ifdef __AES__
#include "sph_shavite.h"
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C"{

View File

@@ -5,7 +5,7 @@
#if defined(__AVX2__)
#include "avxdefs.h"
#include "simd-utils.h"
typedef struct {
uint32_t A[ 32*2 ] __attribute__((aligned(64)));

View File

@@ -17,13 +17,13 @@ void skeinhash_4way( void *state, const void *input )
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash64 );
mm256_reinterleave_4x32( vhash32, vhash64, 512 );
mm256_rintrlv_4x64_4x32( vhash32, vhash64, 512 );
sha256_4way_init( &ctx_sha256 );
sha256_4way( &ctx_sha256, vhash32, 64 );
sha256_4way_close( &ctx_sha256, state );
mm128_deinterleave_4x32( state, state+32, state+64, state+96,
mm128_dintrlv_4x32( state, state+32, state+64, state+96,
vhash32, 256 );
}
@@ -48,7 +48,7 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
swab32_array( edata, pdata, 20 );
mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
mm256_intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep = vdata + 73; // 9*8 + 1

View File

@@ -49,7 +49,7 @@ extern "C"{
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
// Output size in bits
#define SPH_SIZE_skein256 256

View File

@@ -59,7 +59,7 @@
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "avxdefs.h"
#include "simd-utils.h"
#ifdef __cplusplus
extern "C" {

View File

@@ -52,7 +52,7 @@
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#include "simd-utils.h"
/**
* Output size (in bits) for WHIRLPOOL.

View File

@@ -49,10 +49,10 @@ void polytimos_4way_hash( void *output, const void *input )
// Need to convert from 64 bit interleaved to 32 bit interleaved.
uint32_t vhash32[16*4];
mm256_reinterleave_4x32( vhash32, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhash32, vhash, 512 );
shabal512_4way( &ctx.shabal, vhash32, 64 );
shabal512_4way_close( &ctx.shabal, vhash32 );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash32, 512 );
update_final_echo ( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *)hash0, 512 );
@@ -66,13 +66,13 @@ void polytimos_4way_hash( void *output, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_2x128( vhash, hash0, hash1, 512 );
mm256_intrlv_2x128( vhash, hash0, hash1, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
mm256_interleave_2x128( vhash, hash2, hash3, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhash, 512 );
mm256_intrlv_2x128( vhash, hash2, hash3, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
mm256_deinterleave_2x128( hash2, hash3, vhash, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhash, 512 );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );

View File

@@ -293,7 +293,7 @@ void x16r_4way_hash( void* output, const void* input )
}
int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t hash[4*16] __attribute__ ((aligned (64)));
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
@@ -303,6 +303,7 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1

View File

@@ -35,7 +35,7 @@ void x16s_getAlgoString( const uint8_t* prevblock, char *output )
bool register_x16r_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
init_x16r_4way_ctx();
// init_x16r_4way_ctx();
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else
@@ -52,7 +52,7 @@ bool register_x16r_algo( algo_gate_t* gate )
bool register_x16s_algo( algo_gate_t* gate )
{
#if defined (X16R_4WAY)
init_x16r_4way_ctx();
// init_x16r_4way_ctx();
gate->scanhash = (void*)&scanhash_x16r_4way;
gate->hash = (void*)&x16r_4way_hash;
#else

View File

@@ -2,7 +2,7 @@
#define X16R_GATE_H__ 1
#include "algo-gate-api.h"
#include "avxdefs.h"
#include "simd-utils.h"
#include <stdint.h>
#if defined(__AVX2__) && defined(__AES__)
@@ -41,7 +41,7 @@ bool register_x16s_algo( algo_gate_t* gate );
void x16r_4way_hash( void *state, const void *input );
int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x16r_4way_ctx();
@@ -50,7 +50,7 @@ void init_x16r_4way_ctx();
void x16r_hash( void *state, const void *input );
int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_x16r_ctx();

View File

@@ -184,7 +184,7 @@ void x16r_hash( void* output, const void* input )
}
int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash32[8];
uint32_t _ALIGN(128) endiandata[20];
@@ -192,6 +192,7 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);

View File

@@ -299,7 +299,7 @@ extern void hmq1725hash(void *state, const void *input)
}
int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t endiandata[32] __attribute__((aligned(64)));
uint32_t hash64[8] __attribute__((aligned(64)));
@@ -307,6 +307,7 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
//const uint32_t Htarg = ptarget[7];
//we need bigendian data...

View File

@@ -25,7 +25,8 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
union _sonoa_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
@@ -43,31 +44,10 @@ typedef struct {
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} sonoa_4way_ctx_holder;
sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64)));
void init_sonoa_4way_ctx()
{
blake512_4way_init( &sonoa_4way_ctx.blake );
bmw512_4way_init( &sonoa_4way_ctx.bmw );
init_groestl( &sonoa_4way_ctx.groestl, 64 );
skein512_4way_init( &sonoa_4way_ctx.skein );
jh512_4way_init( &sonoa_4way_ctx.jh );
keccak512_4way_init( &sonoa_4way_ctx.keccak );
luffa_2way_init( &sonoa_4way_ctx.luffa, 512 );
cube_2way_init( &sonoa_4way_ctx.cube, 512, 16, 32 );
shavite512_2way_init( &sonoa_4way_ctx.shavite );
simd_2way_init( &sonoa_4way_ctx.simd, 512 );
init_echo( &sonoa_4way_ctx.echo, 512 );
hamsi512_4way_init( &sonoa_4way_ctx.hamsi );
sph_fugue512_init( &sonoa_4way_ctx.fugue );
shabal512_4way_init( &sonoa_4way_ctx.shabal );
sph_whirlpool_init( &sonoa_4way_ctx.whirlpool );
sha512_4way_init( &sonoa_4way_ctx.sha512 );
haval256_5_4way_init( &sonoa_4way_ctx.haval );
};
typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
void sonoa_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -77,19 +57,21 @@ void sonoa_4way_hash( void *state, const void *input )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
sonoa_4way_context_overlay ctx;
// 1
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -98,38 +80,46 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
@@ -144,13 +134,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 2
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -161,7 +151,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -175,7 +165,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -197,8 +187,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -213,87 +203,90 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
@@ -307,13 +300,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 4
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -324,7 +317,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -338,7 +331,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -360,8 +353,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -376,13 +369,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -397,18 +390,19 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm256_reinterleave_4x64( vhashB, vhash, 512 );
mm256_rintrlv_4x32_4x64( vhashB, vhash, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhashB, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -423,8 +417,8 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_2x128( vhashA, hash0, hash1, 512 );
mm256_interleave_2x128( vhashB, hash2, hash3, 512 );
mm256_intrlv_2x128( vhashA, hash0, hash1, 512 );
mm256_intrlv_2x128( vhashB, hash2, hash3, 512 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
@@ -432,19 +426,19 @@ void sonoa_4way_hash( void *state, const void *input )
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
// 5
mm256_reinterleave_2x128_4x64( vhash, vhashA, vhashB, 512 );
mm256_rintrlv_2x128_4x64( vhash, vhashA, vhashB, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhashB, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -455,7 +449,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -469,7 +463,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -491,8 +485,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -507,13 +501,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -528,14 +522,15 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
@@ -550,13 +545,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 6
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -567,7 +562,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -581,7 +576,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -603,8 +598,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -619,13 +614,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -640,13 +635,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -661,12 +656,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -683,13 +679,13 @@ void sonoa_4way_hash( void *state, const void *input )
// 7
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
@@ -700,7 +696,7 @@ void sonoa_4way_hash( void *state, const void *input )
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
@@ -714,7 +710,7 @@ void sonoa_4way_hash( void *state, const void *input )
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -736,8 +732,8 @@ void sonoa_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
@@ -752,13 +748,13 @@ void sonoa_4way_hash( void *state, const void *input )
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, 512 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
@@ -773,13 +769,13 @@ void sonoa_4way_hash( void *state, const void *input )
sph_fugue512( &ctx.fugue, hash3, 64 );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
@@ -794,17 +790,17 @@ void sonoa_4way_hash( void *state, const void *input )
sph_whirlpool( &ctx.whirlpool, hash3, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -814,15 +810,11 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
// uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -831,19 +823,13 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
sonoa_4way_hash( hash, vdata );
@@ -852,21 +838,19 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -3,7 +3,7 @@
bool register_sonoa_algo( algo_gate_t* gate )
{
#if defined (SONOA_4WAY)
init_sonoa_4way_ctx();
// init_sonoa_4way_ctx();
gate->scanhash = (void*)&scanhash_sonoa_4way;
gate->hash = (void*)&sonoa_4way_hash;
#else

View File

@@ -17,7 +17,7 @@ void sonoa_4way_hash( void *state, const void *input );
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_4way_ctx();
//void init_sonoa_4way_ctx();
#endif

View File

@@ -14,7 +14,6 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -25,7 +24,6 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
//typedef struct {
union _x17_4way_context_overlay
{
blake512_4way_context blake;
@@ -48,30 +46,6 @@ union _x17_4way_context_overlay
};
typedef union _x17_4way_context_overlay x17_4way_context_overlay;
/*
x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
void init_x17_4way_ctx()
{
blake512_4way_init( &x17_4way_ctx.blake );
bmw512_4way_init( &x17_4way_ctx.bmw );
init_groestl( &x17_4way_ctx.groestl, 64 );
skein512_4way_init( &x17_4way_ctx.skein );
jh512_4way_init( &x17_4way_ctx.jh );
keccak512_4way_init( &x17_4way_ctx.keccak );
luffa_2way_init( &x17_4way_ctx.luffa, 512 );
cube_2way_init( &x17_4way_ctx.cube, 512, 16, 32 );
shavite512_2way_init( &x17_4way_ctx.shavite );
simd_2way_init( &x17_4way_ctx.simd, 512 );
init_echo( &x17_4way_ctx.echo, 512 );
hamsi512_4way_init( &x17_4way_ctx.hamsi );
sph_fugue512_init( &x17_4way_ctx.fugue );
shabal512_4way_init( &x17_4way_ctx.shabal );
sph_whirlpool_init( &x17_4way_ctx.whirlpool );
sha512_4way_init( &x17_4way_ctx.sha512 );
haval256_5_4way_init( &x17_4way_ctx.haval );
};
*/
void x17_4way_hash( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
@@ -82,7 +56,6 @@ void x17_4way_hash( void *state, const void *input )
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
x17_4way_context_overlay ctx;
// memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) );
// 1 Blake parallel 4 way 64 bit
blake512_4way_init( &ctx.blake );
@@ -95,7 +68,7 @@ void x17_4way_hash( void *state, const void *input )
bmw512_4way_close( &ctx.bmw, vhash );
// Serialize
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 3 Groestl
init_groestl( &ctx.groestl, 64 );
@@ -108,7 +81,7 @@ void x17_4way_hash( void *state, const void *input )
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
// Parallellize
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
// 4 Skein parallel 4 way 64 bit
skein512_4way_init( &ctx.skein );
@@ -126,7 +99,7 @@ void x17_4way_hash( void *state, const void *input )
keccak512_4way_close( &ctx.keccak, vhash );
// 7 Luffa parallel 2 way 128 bit
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
@@ -151,8 +124,8 @@ void x17_4way_hash( void *state, const void *input )
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, 512 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, 512 );
// 11 Echo serial
init_echo( &ctx.echo, 512 );
@@ -169,13 +142,13 @@ void x17_4way_hash( void *state, const void *input )
(const BitSequence *) hash3, 512 );
// 12 Hamsi parallel 4 way 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
// 13 Fugue serial
sph_fugue512_init( &ctx.fugue );
@@ -192,13 +165,13 @@ void x17_4way_hash( void *state, const void *input )
sph_fugue512_close( &ctx.fugue, hash3 );
// 14 Shabal, parallel 4 way 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
// 15 Whirlpool serial
sph_whirlpool_init( &ctx.whirlpool );
@@ -215,19 +188,18 @@ void x17_4way_hash( void *state, const void *input )
sph_whirlpool_close( &ctx.whirlpool, hash3 );
// 16 SHA512 parallel 64 bit
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_rintrlv_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -237,13 +209,10 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__((aligned(64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
@@ -253,42 +222,33 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
0xFFFFF000, 0xFFFF0000, 0 };
// Need big endian data
casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
mm256_bswap_intrlv80_4x64( vdata, pdata );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
uint32_t mask = masks[ m ];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
*noncev );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
x17_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
for ( int lane = 0; lane < 4; lane++ )
if ( ( hash7[ lane ] & mask ) == 0 )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -12,8 +12,9 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -24,16 +25,17 @@
#include "algo/sha/sha2-hash-4way.h"
#include "algo/haval/haval-hash-4way.h"
typedef struct {
blake512_4way_context blake;
union _xevan_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
@@ -42,39 +44,8 @@ typedef struct {
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} xevan_4way_ctx_holder;
xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context xevan_blake_4way_mid
__attribute__ ((aligned (64)));
void init_xevan_4way_ctx()
{
blake512_4way_init(&xevan_4way_ctx.blake);
bmw512_4way_init( &xevan_4way_ctx.bmw );
init_groestl( &xevan_4way_ctx.groestl, 64 );
skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak);
luffa_2way_init( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite );
simd_2way_init( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 );
hamsi512_4way_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue );
shabal512_4way_init( &xevan_4way_ctx.shabal );
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
sha512_4way_init( &xevan_4way_ctx.sha512 );
haval256_5_4way_init( &xevan_4way_ctx.haval );
};
void xevan_4way_blake512_midstate( const void* input )
{
memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &xevan_blake_4way_mid, input, 64 );
}
typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
void xevan_4way_hash( void *output, const void *input )
{
@@ -83,343 +54,320 @@ void xevan_4way_hash( void *output, const void *input )
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
uint64_t vhash32[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
const int dataLen = 128;
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));
// parallel way
memcpy( &ctx.blake, &xevan_blake_4way_mid,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
// parallel 4 way
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close(&ctx.blake, vhash);
memset( &vhash[8<<2], 0, 64<<2 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
// Parallel
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
// Parallel 4way 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
// Serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
haval256_5_4way( &ctx.haval, vhash32, dataLen );
haval256_5_4way_close( &ctx.haval, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, vhashA );
mm256_rintrlv_4x32_4x64( vhash, vhashA, dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
mm256_rintrlv_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
mm256_dintrlv_2x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_dintrlv_2x128( hash2, hash3, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm128_intrlv_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm128_dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
mm256_intrlv_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
haval256_5_4way( &ctx.haval, vhash32, dataLen );
mm256_rintrlv_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, output );
}
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t vdata[24*4] __attribute__ ((aligned (64)));
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
xevan_4way_blake512_midstate( vdata );
mm256_bswap_intrlv80_4x64( vdata, pdata );
do {
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_intrlv_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev );
xevan_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( hash7[ lane ] <= Htarg )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -8,7 +8,7 @@ void xevan_set_target( struct work* work, double job_diff )
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
init_xevan_4way_ctx();
// init_xevan_4way_ctx();
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else

View File

@@ -15,16 +15,16 @@ bool register_xevan_algo( algo_gate_t* gate );
void xevan_4way_hash( void *state, const void *input );
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_xevan_4way_ctx();
//void init_xevan_4way_ctx();
#endif
void xevan_hash( void *state, const void *input );
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_xevan_ctx();

View File

@@ -230,13 +230,14 @@ void xevan_hash(void *output, const void *input)
memcpy(output, hash, 32);
}
int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
@@ -248,8 +249,7 @@ int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
xevan_blake512_midstate( endiandata );
xevan_blake512_midstate( endiandata );
do {
be32enc(&endiandata[19], nonce);
xevan_hash(hash, endiandata);

View File

@@ -290,7 +290,7 @@ SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
{
unsigned char pad[64];
unsigned char khash[32];
@@ -326,7 +326,7 @@ HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
@@ -335,7 +335,7 @@ HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
{
unsigned char ihash[32];
@@ -361,7 +361,7 @@ void
PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX PShctx, hctx;
HMAC_SHA256_CTX_Y PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint8_t ivec[4];
@@ -370,8 +370,8 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update(&PShctx, salt, saltlen);
HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
@@ -379,18 +379,18 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
HMAC_SHA256_Update(&hctx, ivec, 4);
HMAC_SHA256_Final(U, &hctx);
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
HMAC_SHA256_Update_Y(&hctx, ivec, 4);
HMAC_SHA256_Final_Y(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init(&hctx, passwd, passwdlen);
HMAC_SHA256_Update(&hctx, U, 32);
HMAC_SHA256_Final(U, &hctx);
HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&hctx, U, 32);
HMAC_SHA256_Final_Y(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)

View File

@@ -49,14 +49,14 @@ typedef struct HMAC_SHA256Context {
typedef struct HMAC_SHA256Context {
SHA256_CTX ictx;
SHA256_CTX octx;
} HMAC_SHA256_CTX;
} HMAC_SHA256_CTX_Y;
void SHA256_Init_Y(SHA256_CTX_Y *);
void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
void HMAC_SHA256_Final(unsigned char [32], HMAC_SHA256_CTX *);
void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):

View File

@@ -1354,14 +1354,14 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
if ((t || flags) && buflen == sizeof(sha256)) {
/* Compute ClientKey */
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
HMAC_SHA256_CTX_Y ctx;
HMAC_SHA256_Init_Y(&ctx, buf, buflen);
if ( yescrypt_client_key )
HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key,
HMAC_SHA256_Update_Y( &ctx, (uint8_t*)yescrypt_client_key,
yescrypt_client_key_len );
else
HMAC_SHA256_Update( &ctx, salt, saltlen );
HMAC_SHA256_Final(sha256, &ctx);
HMAC_SHA256_Update_Y( &ctx, salt, saltlen );
HMAC_SHA256_Final_Y(sha256, &ctx);
}
/* Compute StoredKey */
{

View File

@@ -383,7 +383,7 @@ void yescrypthash(void *output, const void *input)
}
int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -393,6 +393,7 @@ int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);

View File

@@ -1,646 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
/*
* Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
* (uint8_t) in big-endian form.
*/
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
/* Encode vector, two words at a time. */
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
}
/*
* Decode a big-endian length len*8 vector of (uint8_t) into a length
* len*2 vector of (uint32_t).
*/
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
/* Decode vector, two words at a time. */
do {
dst[0] = be32dec(&src[0]);
dst[1] = be32dec(&src[4]);
src += 8;
dst += 2;
} while (--len);
}
/* SHA256 round constants. */
static const uint32_t Krnd[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
h += S1(e) + Ch(e, f, g) + k; \
d += h; \
h += S0(a) + Maj(a, b, c);
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, ii) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i + ii] + Krnd[i + ii])
/* Message schedule computation */
#define MSCH(W, ii, i) \
W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
for (i = 0; i < 64; i += 16) {
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
if (i == 48)
break;
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
static const uint8_t PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Add padding and terminating bit-count. */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = (ctx->count >> 3) & 0x3f;
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if (r < 56) {
/* Pad to 56 mod 64. */
memcpy(&ctx->buf[r], PAD, 56 - r);
} else {
/* Finish the current block and mix. */
memcpy(&ctx->buf[r], PAD, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset(&ctx->buf[0], 0, 56);
}
/* Add the terminating bit-count. */
be64enc(&ctx->buf[56], ctx->count);
/* Mix in the final block. */
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
}
/* Magic initialization constants. */
static const uint32_t initial_state[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void
SHA256_Init(SHA256_CTX * ctx)
{
/* Zero bits processed so far. */
ctx->count = 0;
/* Initialize state. */
memcpy(ctx->state, initial_state, sizeof(initial_state));
}
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
uint32_t r;
const uint8_t * src = in;
/* Return immediately if we have nothing to do. */
if (len == 0)
return;
/* Number of bytes left in the buffer from previous updates. */
r = (ctx->count >> 3) & 0x3f;
/* Update number of bits. */
ctx->count += (uint64_t)(len) << 3;
/* Handle the case where we don't need to perform any transforms. */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block. */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks. */
while (len >= 64) {
SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
src += 64;
len -= 64;
}
/* Copy left over data into buffer. */
memcpy(ctx->buf, src, len);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72])
{
/* Add padding. */
SHA256_Pad(ctx, tmp32);
/* Write the hash. */
be32enc_vect(digest, ctx->state, 4);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Final(digest, ctx, tmp32);
/* Clear the context state. */
insecure_memzero(ctx, sizeof(SHA256_CTX));
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
SHA256_CTX ctx;
uint32_t tmp32[72];
SHA256_Init(&ctx);
_SHA256_Update(&ctx, in, len, tmp32);
_SHA256_Final(digest, &ctx, tmp32);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(SHA256_CTX));
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
uint8_t khash[static restrict 32])
{
const uint8_t * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init(&ctx->ictx);
_SHA256_Update(&ctx->ictx, K, Klen, tmp32);
_SHA256_Final(khash, &ctx->ictx, tmp32);
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init(&ctx->ictx);
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
_SHA256_Update(&ctx->ictx, pad, 64, tmp32);
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init(&ctx->octx);
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
_SHA256_Update(&ctx->octx, pad, 64, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
uint32_t tmp32[72];
uint8_t pad[64];
uint8_t khash[32];
/* Call the real function. */
_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(khash, 32);
insecure_memzero(pad, 64);
}
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
/* Feed data to the inner SHA256 operation. */
_SHA256_Update(&ctx->ictx, in, len, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_HMAC_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
/* Finish the inner SHA256 operation. */
_SHA256_Final(ihash, &ctx->ictx, tmp32);
/* Feed the inner hash to the outer SHA256 operation. */
_SHA256_Update(&ctx->octx, ihash, 32, tmp32);
/* Finish the outer SHA256 operation. */
_SHA256_Final(digest, &ctx->octx, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
uint32_t tmp32[72];
uint8_t ihash[32];
/* Call the real function. */
_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(ihash, 32);
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
uint32_t tmp32[72];
uint8_t tmp8[96];
_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
_HMAC_SHA256_Update(&ctx, in, len, tmp32);
_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(tmp8, 96);
}
/* Add padding and terminating bit-count, but don't invoke Transform yet. */
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
uint32_t tmp32[static restrict 72])
{
uint32_t r;
r = (ctx->count >> 3) & 0x3f;
if (r >= 56)
return -1;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be64enc(len, ctx->count);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
_SHA256_Update(ctx, PAD, 56 - r, tmp32);
/* Add the terminating bit-count. */
ctx->buf[63] = len[7];
_SHA256_Update(ctx, len, 7, tmp32);
return 0;
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
oldcount = hctx.ictx.count & (0x3f << 3);
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(hctx.octx.buf, u.state, 4);
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(&buf[i * 32], u.state, 4);
}
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}

View File

@@ -1,680 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
#include "avxdefs.h"
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
/*
* Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
* (uint8_t) in big-endian form.
*/
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
/* Encode vector, two words at a time. */
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
}
/*
* Decode a big-endian length len*8 vector of (uint8_t) into a length
* len*2 vector of (uint32_t).
*/
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
/* Decode vector, two words at a time. */
do {
dst[0] = be32dec(&src[0]);
dst[1] = be32dec(&src[4]);
src += 8;
dst += 2;
} while (--len);
}
/* SHA256 round constants. */
static const uint32_t Krnd[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
#if 0 //defined(__SHA__)
// ABEF = _mm_sha256rnds2_epu32( CDGH, ABEF, k )
//_mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)
// b = { ABEF } a = { CDGH }
//
//a = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8],
// S[(70 - i) % 8], S[(71 - i) % 8] );
//b = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8],
// S[(68 - i) % 8], S[(69 - i) % 8] );
//k = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] )
// _mm_sha256rnds2_epu32(a,b,k)
#define RNDr( S, W, i, ii ) do \
{ \
uint32_t abef[4]; \
__m128i ABEF = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8] ); \
__m128i CDGH = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8] ); \
__m128i K = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] ); \
casti_m128i( abef, 0 ) = _mm_sha256rnds2_epu32( CDGH, ABEF, K ); \
S[(66 - i) % 8] = abef[3]; \
S[(67 - i) % 8] = abef[2]; \
S[(64 - i) % 8] = abef[1]; \
S[(65 - i) % 8] = abef[0]; \
} while(0)
#else
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
h += S1(e) + Ch(e, f, g) + k; \
d += h; \
h += S0(a) + Maj(a, b, c);
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, ii) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i + ii] + Krnd[i + ii])
#endif
/* Message schedule computation */
#define MSCH(W, ii, i) \
W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
for (i = 0; i < 64; i += 16) {
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
if (i == 48)
break;
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
static const uint8_t PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Add padding and terminating bit-count. */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = (ctx->count >> 3) & 0x3f;
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if (r < 56) {
/* Pad to 56 mod 64. */
memcpy(&ctx->buf[r], PAD, 56 - r);
} else {
/* Finish the current block and mix. */
memcpy(&ctx->buf[r], PAD, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset(&ctx->buf[0], 0, 56);
}
/* Add the terminating bit-count. */
be64enc(&ctx->buf[56], ctx->count);
/* Mix in the final block. */
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
}
/* Magic initialization constants. */
static const uint32_t initial_state[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void
SHA256_Init(SHA256_CTX * ctx)
{
/* Zero bits processed so far. */
ctx->count = 0;
/* Initialize state. */
memcpy(ctx->state, initial_state, sizeof(initial_state));
}
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
uint32_t r;
const uint8_t * src = in;
/* Return immediately if we have nothing to do. */
if (len == 0)
return;
/* Number of bytes left in the buffer from previous updates. */
r = (ctx->count >> 3) & 0x3f;
/* Update number of bits. */
ctx->count += (uint64_t)(len) << 3;
/* Handle the case where we don't need to perform any transforms. */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block. */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks. */
while (len >= 64) {
SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
src += 64;
len -= 64;
}
/* Copy left over data into buffer. */
memcpy(ctx->buf, src, len);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72])
{
/* Add padding. */
SHA256_Pad(ctx, tmp32);
/* Write the hash. */
be32enc_vect(digest, ctx->state, 4);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Final(digest, ctx, tmp32);
/* Clear the context state. */
insecure_memzero(ctx, sizeof(SHA256_CTX));
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
SHA256_CTX ctx;
uint32_t tmp32[72];
SHA256_Init(&ctx);
_SHA256_Update(&ctx, in, len, tmp32);
_SHA256_Final(digest, &ctx, tmp32);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(SHA256_CTX));
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
uint8_t khash[static restrict 32])
{
const uint8_t * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init(&ctx->ictx);
_SHA256_Update(&ctx->ictx, K, Klen, tmp32);
_SHA256_Final(khash, &ctx->ictx, tmp32);
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init(&ctx->ictx);
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
_SHA256_Update(&ctx->ictx, pad, 64, tmp32);
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init(&ctx->octx);
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
_SHA256_Update(&ctx->octx, pad, 64, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
uint32_t tmp32[72];
uint8_t pad[64];
uint8_t khash[32];
/* Call the real function. */
_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(khash, 32);
insecure_memzero(pad, 64);
}
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
/* Feed data to the inner SHA256 operation. */
_SHA256_Update(&ctx->ictx, in, len, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_HMAC_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
/* Finish the inner SHA256 operation. */
_SHA256_Final(ihash, &ctx->ictx, tmp32);
/* Feed the inner hash to the outer SHA256 operation. */
_SHA256_Update(&ctx->octx, ihash, 32, tmp32);
/* Finish the outer SHA256 operation. */
_SHA256_Final(digest, &ctx->octx, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
uint32_t tmp32[72];
uint8_t ihash[32];
/* Call the real function. */
_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(ihash, 32);
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
uint32_t tmp32[72];
uint8_t tmp8[96];
_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
_HMAC_SHA256_Update(&ctx, in, len, tmp32);
_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(tmp8, 96);
}
/* Add padding and terminating bit-count, but don't invoke Transform yet. */
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
uint32_t tmp32[static restrict 72])
{
uint32_t r;
r = (ctx->count >> 3) & 0x3f;
if (r >= 56)
return -1;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be64enc(len, ctx->count);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
_SHA256_Update(ctx, PAD, 56 - r, tmp32);
/* Add the terminating bit-count. */
ctx->buf[63] = len[7];
_SHA256_Update(ctx, len, 7, tmp32);
return 0;
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
oldcount = hctx.ictx.count & (0x3f << 3);
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(hctx.octx.buf, u.state, 4);
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(&buf[i * 32], u.state, 4);
}
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}

View File

@@ -1,672 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
/*
* Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
* (uint8_t) in big-endian form.
*/
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
/* Encode vector, two words at a time. */
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
}
/*
* Decode a big-endian length len*8 vector of (uint8_t) into a length
* len*2 vector of (uint32_t).
*/
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
/* Decode vector, two words at a time. */
do {
dst[0] = be32dec(&src[0]);
dst[1] = be32dec(&src[4]);
src += 8;
dst += 2;
} while (--len);
}
#if 0
/* SHA256 round constants. */
static const uint32_t Krnd[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
h += S1(e) + Ch(e, f, g) + k; \
d += h; \
h += S0(a) + Maj(a, b, c);
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, ii) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i + ii] + Krnd[i + ii])
/* Message schedule computation */
#define MSCH(W, ii, i) \
W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + s0(W[i + ii + 1]) + W[i + ii]
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
for (i = 0; i < 64; i += 16) {
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
if (i == 48)
break;
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
#endif
static const uint8_t PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Add padding and terminating bit-count. */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = (ctx->count >> 3) & 0x3f;
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if (r < 56) {
/* Pad to 56 mod 64. */
memcpy(&ctx->buf[r], PAD, 56 - r);
} else {
/* Finish the current block and mix. */
memcpy(&ctx->buf[r], PAD, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset(&ctx->buf[0], 0, 56);
}
/* Add the terminating bit-count. */
be64enc(&ctx->buf[56], ctx->count);
/* Mix in the final block. */
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
}
#if 0
/* Magic initialization constants. */
static const uint32_t initial_state[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void
SHA256_Init(SHA256_CTX * ctx)
{
/* Zero bits processed so far. */
ctx->count = 0;
/* Initialize state. */
memcpy(ctx->state, initial_state, sizeof(initial_state));
}
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
uint32_t r;
const uint8_t * src = in;
/* Return immediately if we have nothing to do. */
if (len == 0)
return;
/* Number of bytes left in the buffer from previous updates. */
r = (ctx->count >> 3) & 0x3f;
/* Update number of bits. */
ctx->count += (uint64_t)(len) << 3;
/* Handle the case where we don't need to perform any transforms. */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block. */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks. */
while (len >= 64) {
SHA256_Transform(ctx->state, src, &tmp32[0], &tmp32[64]);
src += 64;
len -= 64;
}
/* Copy left over data into buffer. */
memcpy(ctx->buf, src, len);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72])
{
/* Add padding. */
SHA256_Pad(ctx, tmp32);
/* Write the hash. */
be32enc_vect(digest, ctx->state, 4);
}
/* Wrapper function for intermediate-values sanitization. */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
uint32_t tmp32[72];
/* Call the real function. */
_SHA256_Final(digest, ctx, tmp32);
/* Clear the context state. */
insecure_memzero(ctx, sizeof(SHA256_CTX));
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
#endif
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
SHA256_CTX ctx;
uint32_t tmp32[72];
SHA256_Init(&ctx);
SHA256_Update(&ctx, in, len);
SHA256_Final(digest, &ctx);
// _SHA256_Update(&ctx, in, len, tmp32);
// _SHA256_Final(digest, &ctx, tmp32);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(SHA256_CTX));
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
uint8_t khash[static restrict 32])
{
const uint8_t * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init(&ctx->ictx);
SHA256_Update(&ctx->ictx, K, Klen);
SHA256_Final(khash, &ctx->ictx);
// _SHA256_Update(&ctx->ictx, K, Klen, tmp32);
// _SHA256_Final(khash, &ctx->ictx, tmp32);
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init(&ctx->ictx);
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
SHA256_Update(&ctx->ictx, pad, 64);
// _SHA256_Update(&ctx->ictx, pad, 64, tmp32);
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init(&ctx->octx);
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
SHA256_Update(&ctx->octx, pad, 64);
// _SHA256_Update(&ctx->octx, pad, 64, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
uint32_t tmp32[72];
uint8_t pad[64];
uint8_t khash[32];
/* Call the real function. */
_HMAC_SHA256_Init(ctx, _K, Klen, tmp32, pad, khash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(khash, 32);
insecure_memzero(pad, 64);
}
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
uint32_t tmp32[static restrict 72])
{
/* Feed data to the inner SHA256 operation. */
SHA256_Update(&ctx->ictx, in, len);
// _SHA256_Update(&ctx->ictx, in, len, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
uint32_t tmp32[72];
/* Call the real function. */
_HMAC_SHA256_Update(ctx, in, len, tmp32);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
/* Finish the inner SHA256 operation. */
_SHA256_Final(ihash, &ctx->ictx, tmp32);
/* Feed the inner hash to the outer SHA256 operation. */
_SHA256_Update(&ctx->octx, ihash, 32, tmp32);
/* Finish the outer SHA256 operation. */
_SHA256_Final(digest, &ctx->octx, tmp32);
// _SHA256_Final(ihash, &ctx->ictx, tmp32);
// _SHA256_Update(&ctx->octx, ihash, 32, tmp32);
// _SHA256_Final(digest, &ctx->octx, tmp32);
}
/* Wrapper function for intermediate-values sanitization. */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
uint32_t tmp32[72];
uint8_t ihash[32];
/* Call the real function. */
_HMAC_SHA256_Final(digest, ctx, tmp32, ihash);
/* Clean the stack. */
insecure_memzero(tmp32, 288);
insecure_memzero(ihash, 32);
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
uint32_t tmp32[72];
uint8_t tmp8[96];
_HMAC_SHA256_Init(&ctx, K, Klen, tmp32, &tmp8[0], &tmp8[64]);
_HMAC_SHA256_Update(&ctx, in, len, tmp32);
_HMAC_SHA256_Final(digest, &ctx, tmp32, &tmp8[0]);
/* Clean the stack. */
insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(tmp8, 96);
}
/* Add padding and terminating bit-count, but don't invoke Transform yet. */
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
uint32_t tmp32[static restrict 72])
{
uint32_t r;
r = (ctx->count >> 3) & 0x3f;
if (r >= 56)
return -1;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be64enc(len, ctx->count);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
SHA256_Update(ctx, PAD, 56 - r, tmp);
/* Add the terminating bit-count. */
ctx->buf[63] = len[7];
SHA256_Update(ctx, len, 7, tmp);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
// _SHA256_Update(ctx, PAD, 56 - r, tmp32);
/* Add the terminating bit-count. */
// ctx->buf[63] = len[7];
// _SHA256_Update(ctx, len, 7, tmp32);
return 0;
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
oldcount = hctx.ictx.count & (0x3f << 3);
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf );
be32enc_vect(hctx.octx.buf, u.state, 4);
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf );
// SHA256_Transform(u.state, hctx.ictx.buf,
// &tmp32[0], &tmp32[64]);
// be32enc_vect(hctx.octx.buf, u.state, 4);
// memcpy(u.state, hctx.octx.state, sizeof(u.state));
// SHA256_Transform(u.state, hctx.octx.buf,
// &tmp32[0], &tmp32[64]);
be32enc_vect(&buf[i * 32], u.state, 4);
}
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}

View File

@@ -1,129 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _SHA256_H_
#define _SHA256_H_
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Use #defines in order to avoid namespace collisions with anyone else's
* SHA256 code (e.g., the code in OpenSSL).
*/
#define SHA256_Init libcperciva_SHA256_Init
#define SHA256_Update libcperciva_SHA256_Update
#define SHA256_Final libcperciva_SHA256_Final
#define SHA256_Buf libcperciva_SHA256_Buf
#define SHA256_CTX libcperciva_SHA256_CTX
#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init
#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update
#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final
#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf
#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX
/* Context structure for SHA256 operations. */
typedef struct {
uint32_t state[8];
uint64_t count;
uint8_t buf[64];
} SHA256_CTX;
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void SHA256_Init(SHA256_CTX *);
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
void SHA256_Update(SHA256_CTX *, const void *, size_t);
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void SHA256_Final(uint8_t[32], SHA256_CTX *);
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void SHA256_Buf(const void *, size_t, uint8_t[32]);
/* Context structure for HMAC-SHA256 operations. */
typedef struct {
SHA256_CTX ictx;
SHA256_CTX octx;
} HMAC_SHA256_CTX;
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *);
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#ifdef __cplusplus
}
#endif
#endif /* !_SHA256_H_ */

View File

@@ -1,134 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _SHA256_H_
#define _SHA256_H_
#include <stddef.h>
#include <stdint.h>
#include <openssl.sha>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Use #defines in order to avoid namespace collisions with anyone else's
* SHA256 code (e.g., the code in OpenSSL).
*/
/*
#define SHA256_Init libcperciva_SHA256_Init
#define SHA256_Update libcperciva_SHA256_Update
#define SHA256_Final libcperciva_SHA256_Final
#define SHA256_CTX libcperciva_SHA256_CTX
*/
#define SHA256_Buf libcperciva_SHA256_Buf
#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init
#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update
#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final
#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf
#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX
#if 0
/* Context structure for SHA256 operations. */
typedef struct {
uint32_t state[8];
uint64_t count;
uint8_t buf[64];
} SHA256_CTX;
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void SHA256_Init(SHA256_CTX *);
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
void SHA256_Update(SHA256_CTX *, const void *, size_t);
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void SHA256_Final(uint8_t[32], SHA256_CTX *);
#endif
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void SHA256_Buf(const void *, size_t, uint8_t[32]);
/* Context structure for HMAC-SHA256 operations. */
typedef struct {
SHA256_CTX ictx;
SHA256_CTX octx;
} HMAC_SHA256_CTX;
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *);
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#ifdef __cplusplus
}
#endif
#endif /* !_SHA256_H_ */

218
algo/yespower/sha256_p.c Normal file
View File

@@ -0,0 +1,218 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "sysendian.h"
#include "sha256_p.h"
#include "compat.h"
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1;
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, k) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + k)
/*
static unsigned char PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
*/
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
SHA256_CTX ctx;
SHA256_Init( &ctx );
SHA256_Update( &ctx, in, len );
SHA256_Final( digest, &ctx );
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init( &ctx, K, Klen );
HMAC_SHA256_Update( &ctx, in, len );
HMAC_SHA256_Final( digest, &ctx );
}
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init( HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen )
{
unsigned char pad[64];
unsigned char khash[32];
const unsigned char * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init( &ctx->ictx );
SHA256_Update( &ctx->ictx, K, Klen );
SHA256_Final( khash, &ctx->ictx );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init( &ctx->ictx );
memset( pad, 0x36, 64 );
for ( i = 0; i < Klen; i++ )
pad[i] ^= K[i];
SHA256_Update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init( &ctx->octx );
memset(pad, 0x5c, 64);
for ( i = 0; i < Klen; i++ )
pad[i] ^= K[i];
SHA256_Update( &ctx->octx, pad, 64 );
/* Clean the stack. */
//memset(khash, 0, 32);
}
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
SHA256_Update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx )
{
unsigned char ihash[32];
/* Finish the inner SHA256 operation. */
SHA256_Final( ihash, &ctx->ictx );
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
SHA256_Final( digest, &ctx->octx );
/* Clean the stack. */
//memset(ihash, 0, 32);
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint8_t ivec[4];
size_t i, clen;
uint64_t j;
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
HMAC_SHA256_Update(&hctx, ivec, 4);
HMAC_SHA256_Final(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init(&hctx, passwd, passwdlen);
HMAC_SHA256_Update(&hctx, U, 32);
HMAC_SHA256_Final(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
//memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
}

View File

@@ -1,496 +0,0 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "sysendian.h"
#include "sha256_p.h"
#include "compat.h"
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static void
be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
{
size_t i;
for (i = 0; i < len / 4; i++)
be32enc(dst + i * 4, src[i]);
}
/*
* Decode a big-endian length len vector of (unsigned char) into a length
* len/4 vector of (uint32_t). Assumes len is a multiple of 4.
*/
static void
be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
{
size_t i;
for (i = 0; i < len / 4; i++)
dst[i] = be32dec(src + i * 4);
}
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1;
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, k) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + k)
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform_p(uint32_t * state, const unsigned char block[64])
{
uint32_t _ALIGN(128) W[64], S[8];
uint32_t t0, t1;
int i;
/* 1. Prepare message schedule W. */
be32dec_vect(W, block, 64);
for (i = 16; i < 64; i++)
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
RNDr(S, W, 0, 0x428a2f98);
RNDr(S, W, 1, 0x71374491);
RNDr(S, W, 2, 0xb5c0fbcf);
RNDr(S, W, 3, 0xe9b5dba5);
RNDr(S, W, 4, 0x3956c25b);
RNDr(S, W, 5, 0x59f111f1);
RNDr(S, W, 6, 0x923f82a4);
RNDr(S, W, 7, 0xab1c5ed5);
RNDr(S, W, 8, 0xd807aa98);
RNDr(S, W, 9, 0x12835b01);
RNDr(S, W, 10, 0x243185be);
RNDr(S, W, 11, 0x550c7dc3);
RNDr(S, W, 12, 0x72be5d74);
RNDr(S, W, 13, 0x80deb1fe);
RNDr(S, W, 14, 0x9bdc06a7);
RNDr(S, W, 15, 0xc19bf174);
RNDr(S, W, 16, 0xe49b69c1);
RNDr(S, W, 17, 0xefbe4786);
RNDr(S, W, 18, 0x0fc19dc6);
RNDr(S, W, 19, 0x240ca1cc);
RNDr(S, W, 20, 0x2de92c6f);
RNDr(S, W, 21, 0x4a7484aa);
RNDr(S, W, 22, 0x5cb0a9dc);
RNDr(S, W, 23, 0x76f988da);
RNDr(S, W, 24, 0x983e5152);
RNDr(S, W, 25, 0xa831c66d);
RNDr(S, W, 26, 0xb00327c8);
RNDr(S, W, 27, 0xbf597fc7);
RNDr(S, W, 28, 0xc6e00bf3);
RNDr(S, W, 29, 0xd5a79147);
RNDr(S, W, 30, 0x06ca6351);
RNDr(S, W, 31, 0x14292967);
RNDr(S, W, 32, 0x27b70a85);
RNDr(S, W, 33, 0x2e1b2138);
RNDr(S, W, 34, 0x4d2c6dfc);
RNDr(S, W, 35, 0x53380d13);
RNDr(S, W, 36, 0x650a7354);
RNDr(S, W, 37, 0x766a0abb);
RNDr(S, W, 38, 0x81c2c92e);
RNDr(S, W, 39, 0x92722c85);
RNDr(S, W, 40, 0xa2bfe8a1);
RNDr(S, W, 41, 0xa81a664b);
RNDr(S, W, 42, 0xc24b8b70);
RNDr(S, W, 43, 0xc76c51a3);
RNDr(S, W, 44, 0xd192e819);
RNDr(S, W, 45, 0xd6990624);
RNDr(S, W, 46, 0xf40e3585);
RNDr(S, W, 47, 0x106aa070);
RNDr(S, W, 48, 0x19a4c116);
RNDr(S, W, 49, 0x1e376c08);
RNDr(S, W, 50, 0x2748774c);
RNDr(S, W, 51, 0x34b0bcb5);
RNDr(S, W, 52, 0x391c0cb3);
RNDr(S, W, 53, 0x4ed8aa4a);
RNDr(S, W, 54, 0x5b9cca4f);
RNDr(S, W, 55, 0x682e6ff3);
RNDr(S, W, 56, 0x748f82ee);
RNDr(S, W, 57, 0x78a5636f);
RNDr(S, W, 58, 0x84c87814);
RNDr(S, W, 59, 0x8cc70208);
RNDr(S, W, 60, 0x90befffa);
RNDr(S, W, 61, 0xa4506ceb);
RNDr(S, W, 62, 0xbef9a3f7);
RNDr(S, W, 63, 0xc67178f2);
/* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++)
state[i] += S[i];
#if 0
/* Clean the stack. */
memset(W, 0, 256);
memset(S, 0, 32);
t0 = t1 = 0;
#endif
}
static unsigned char PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
// only called by SHA256_Final_p
/* Add padding and terminating bit-count. */
static void
SHA256_Pad_p(SHA256_CTX_p * ctx)
{
unsigned char len[8];
uint32_t r, plen;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be32enc_vect(len, ctx->count, 8);
/* Add 1--64 bytes so that the resulting length is 56 mod 64 */
r = (ctx->count[1] >> 3) & 0x3f;
plen = (r < 56) ? (56 - r) : (120 - r);
SHA256_Update_p(ctx, PAD, (size_t)plen);
/* Add the terminating bit-count */
SHA256_Update_p(ctx, len, 8);
}
/* SHA-256 initialization. Begins a SHA-256 operation. */
void
SHA256_Init_p(SHA256_CTX_p * ctx)
{
/* Zero bits processed so far */
ctx->count[0] = ctx->count[1] = 0;
/* Magic initialization constants */
ctx->state[0] = 0x6A09E667;
ctx->state[1] = 0xBB67AE85;
ctx->state[2] = 0x3C6EF372;
ctx->state[3] = 0xA54FF53A;
ctx->state[4] = 0x510E527F;
ctx->state[5] = 0x9B05688C;
ctx->state[6] = 0x1F83D9AB;
ctx->state[7] = 0x5BE0CD19;
}
/* Add bytes into the hash */
void
SHA256_Update_p(SHA256_CTX_p * ctx, const void *in, size_t len)
{
uint32_t bitlen[2];
uint32_t r;
const unsigned char *src = in;
/* Number of bytes left in the buffer from previous updates */
r = (ctx->count[1] >> 3) & 0x3f;
/* Convert the length into a number of bits */
bitlen[1] = ((uint32_t)len) << 3;
bitlen[0] = (uint32_t)(len >> 29);
/* Update number of bits */
if ((ctx->count[1] += bitlen[1]) < bitlen[1])
ctx->count[0]++;
ctx->count[0] += bitlen[0];
/* Handle the case where we don't need to perform any transforms */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform_p(ctx->state, ctx->buf);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks */
while (len >= 64) {
SHA256_Transform_p(ctx->state, src);
src += 64;
len -= 64;
}
/* Copy left over data into buffer */
memcpy(ctx->buf, src, len);
}
/*
* SHA-256 finalization. Pads the input data, exports the hash value,
* and clears the context state.
*/
void
SHA256_Final_p(unsigned char digest[32], SHA256_CTX_p * ctx)
{
/* Add padding */
SHA256_Pad_p(ctx);
/* Write the hash */
be32enc_vect(digest, ctx->state, 32);
/* Clear the context state */
memset((void *)ctx, 0, sizeof(*ctx));
}
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void
SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32])
{
// SHA256_CTX_p ctx;
// uint32_t tmp32[72];
#if defined(__SHA__)
SHA256_CTX ctx;
SHA256_Init(&ctx);
SHA256_Update(&ctx, in, len);
SHA256_Final(digest, &ctx);
#else
SHA256_CTX_p ctx;
SHA256_Init_p(&ctx);
SHA256_Update_p(&ctx, in, len);
SHA256_Final_p(digest, &ctx);
#endif
/* Clean the stack. */
// insecure_memzero(&ctx, sizeof(SHA256_CTX));
// insecure_memzero(tmp32, 288);
}
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void
HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in, size_t len,
uint8_t digest[32])
{
HMAC_SHA256_CTX_p ctx;
// uint32_t tmp32[72];
// uint8_t tmp8[96];
HMAC_SHA256_Init_p(&ctx, K, Klen);
HMAC_SHA256_Update_p(&ctx, in, len);
HMAC_SHA256_Final_p(digest, &ctx);
/* Clean the stack. */
// insecure_memzero(&ctx, sizeof(HMAC_SHA256_CTX));
// insecure_memzero(tmp32, 288);
// insecure_memzero(tmp8, 96);
}
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p * ctx, const void * _K, size_t Klen)
{
unsigned char pad[64];
unsigned char khash[32];
const unsigned char * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
#if defined(__SHA__)
SHA256_Init(&ctx->ictx);
SHA256_Update(&ctx->ictx, K, Klen);
SHA256_Final(khash, &ctx->ictx);
#else
SHA256_Init_p(&ctx->ictx);
SHA256_Update_p(&ctx->ictx, K, Klen);
SHA256_Final_p(khash, &ctx->ictx);
#endif
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
#if defined(__SHA__)
SHA256_Init(&ctx->ictx);
#else
SHA256_Init_p(&ctx->ictx);
#endif
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
#if defined(__SHA__)
SHA256_Update(&ctx->ictx, pad, 64);
#else
SHA256_Update_p(&ctx->ictx, pad, 64);
#endif
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
#if defined(__SHA__)
SHA256_Init(&ctx->octx);
#else
SHA256_Init_p(&ctx->octx);
#endif
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
pad[i] ^= K[i];
#if defined(__SHA__)
SHA256_Update(&ctx->octx, pad, 64);
#else
SHA256_Update_p(&ctx->octx, pad, 64);
#endif
/* Clean the stack. */
//memset(khash, 0, 32);
}
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
#if defined(__SHA__)
SHA256_Update(&ctx->ictx, in, len);
#else
SHA256_Update_p(&ctx->ictx, in, len);
#endif
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final_p(unsigned char digest[32], HMAC_SHA256_CTX_p * ctx)
{
unsigned char ihash[32];
#if defined(__SHA__)
/* Finish the inner SHA256 operation. */
SHA256_Final(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final(digest, &ctx->octx);
#else
/* Finish the inner SHA256 operation. */
SHA256_Final_p(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update_p(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final_p(digest, &ctx->octx);
#endif
/* Clean the stack. */
//memset(ihash, 0, 32);
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256_p(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX_p PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint8_t ivec[4];
size_t i, clen;
uint64_t j;
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init_p(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update_p(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_p));
HMAC_SHA256_Update_p(&hctx, ivec, 4);
HMAC_SHA256_Final_p(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init_p(&hctx, passwd, passwdlen);
HMAC_SHA256_Update_p(&hctx, U, 32);
HMAC_SHA256_Final_p(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
//memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
}

View File

@@ -33,45 +33,24 @@
#include <stdint.h>
#include <openssl/sha.h>
typedef struct SHA256Context {
uint32_t state[8];
uint32_t count[2];
unsigned char buf[64];
} SHA256_CTX_p;
/*
typedef struct HMAC_SHA256Context {
SHA256_CTX_Y ictx;
SHA256_CTX_Y octx;
} HMAC_SHA256_CTX_Y;
*/
typedef struct HMAC_SHA256Context {
#if defined(__SHA__)
SHA256_CTX ictx;
SHA256_CTX octx;
#else
SHA256_CTX_p ictx;
SHA256_CTX_p octx;
#endif
} HMAC_SHA256_CTX_p;
} HMAC_SHA256_CTX;
void SHA256_Init_p(SHA256_CTX_p *);
void SHA256_Update_p(SHA256_CTX_p *, const void *, size_t);
void SHA256_Final_p(unsigned char [32], SHA256_CTX_p *);
void SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32]);
void HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p *, const void *, size_t);
void HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p *, const void *, size_t);
void HMAC_SHA256_Final_p(unsigned char [32], HMAC_SHA256_CTX_p *);
void HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in,
size_t len, uint8_t digest[32]);
void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] );
void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
void HMAC_SHA256_Buf( const void * K, size_t Klen, const void * in,
size_t len, uint8_t digest[32] );
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256_p(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
void PBKDF2_SHA256( const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#endif /* !_SHA256_H_ */

View File

@@ -62,6 +62,7 @@
#warning "Note: building generic code for non-x86. That's OK."
#endif
*/
/*
* The SSE4 code version has fewer instructions than the generic SSE2 version,
* but all of the instructions are SIMD, thereby wasting the scalar execution
@@ -96,7 +97,7 @@
#include <string.h>
#include "insecure_memzero.h"
#include "sha256.h"
#include "sha256_p.h"
#include "sysendian.h"
#include "yespower.h"

File diff suppressed because it is too large Load Diff

View File

@@ -51,7 +51,7 @@
#include <stdlib.h>
#include <string.h>
#include "sha256.h"
#include "sha256_p.h"
#include "sysendian.h"
#include "yespower.h"
@@ -534,11 +534,12 @@ int yespower(yespower_local_t *local,
if (pers) {
HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen,
return true;
(uint8_t *)sha256);
SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst);
}
} else {
HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64,
HMAC_SHA256_Buf_P((uint8_t *)B + B_size - 64, 64,
sha256, sizeof(sha256), (uint8_t *)dst);
}

View File

@@ -38,7 +38,7 @@ void yespower_hash( const char *input, char *output, uint32_t len )
}
int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -48,6 +48,7 @@ int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);

2004
avxdefs.h

File diff suppressed because it is too large Load Diff

View File

@@ -100,9 +100,9 @@ rm -f config.status
CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make -j 16
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-native.exe
#mv cpuminer.exe cpuminer-native.exe
strip -s cpuminer
mv cpuminer cpuminer-native
#mv cpuminer cpuminer-native
make clean || echo done
#make clean || echo done

22
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.3.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.1'
PACKAGE_STRING='cpuminer-opt 3.9.1'
PACKAGE_VERSION='3.9.3.1'
PACKAGE_STRING='cpuminer-opt 3.9.3.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.9.3.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.9.3.1:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.9.1
cpuminer-opt configure 3.9.3.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.1, which was
It was created by cpuminer-opt $as_me 3.9.3.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.9.1'
VERSION='3.9.3.1'
cat >>confdefs.h <<_ACEOF
@@ -5884,7 +5884,7 @@ fi
# GC2 for GNU static
if test "x$OS" = "xWindows_NT" ; then
if test "x$have_win32" = "xtrue" ; then
# MinGW
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
$as_echo_n "checking for pthread_create in -lpthread... " >&6; }
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.9.1, which was
This file was extended by cpuminer-opt $as_me 3.9.3.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.9.1
cpuminer-opt config.status 3.9.3.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.1])
AC_INIT([cpuminer-opt], [3.9.3.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -106,7 +106,7 @@ fi
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
# GC2 for GNU static
if test "x$OS" = "xWindows_NT" ; then
if test "x$have_win32" = "xtrue" ; then
# MinGW
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[])
else

View File

@@ -105,10 +105,12 @@ enum algos opt_algo = ALGO_NULL;
int opt_scrypt_n = 0;
int opt_pluck_n = 128;
int opt_n_threads = 0;
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
__int128_t opt_affinity = -1LL;
// Windows doesn't support 128 bit affinity mask.
#if defined(__linux) && defined(GCC_INT128)
#define AFFINITY_USES_UINT128 1
uint128_t opt_affinity = -1LL;
#else
int64_t opt_affinity = -1LL;
uint64_t opt_affinity = -1LL;
#endif
int opt_priority = 0;
int num_cpus = 1;
@@ -203,7 +205,8 @@ static inline void drop_policy(void)
#define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
#endif
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
// Linux affinity can use int128.
#if AFFINITY_USES_UINT128
static void affine_to_cpu_mask( int id, unsigned __int128 mask )
#else
static void affine_to_cpu_mask( int id, unsigned long long mask )
@@ -216,7 +219,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
for ( uint8_t i = 0; i < ncpus; i++ )
{
// cpu mask
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#if AFFINITY_USES_UINT128
if( ( mask & ( (unsigned __int128)1ULL << i ) ) ) CPU_SET( i, &set );
#else
if( (ncpus > 64) || ( mask & (1ULL << i) ) ) CPU_SET( i, &set );
@@ -237,6 +240,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
#elif defined(WIN32) /* Windows */
static inline void drop_policy(void) { }
// Windows CPU groups to manage more than 64 CPUs.
static void affine_to_cpu_mask( int id, unsigned long mask )
{
bool success;
@@ -263,7 +267,7 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
break;
cpu -= cpus;
}
}
if (opt_debug)
applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", id, cpu, group, (1ULL << cpu));
@@ -847,7 +851,8 @@ static int share_result( int result, struct work *work, const char *reason )
float rate;
char rate_s[8] = {0};
double sharediff = work ? work->sharediff : stratum.sharediff;
bool solved = result && (net_diff > 0.0 ) && ( sharediff >= net_diff );
bool solved = result && accepted_share_count && (net_diff > 0.0 )
&& ( sharediff >= net_diff );
char sol[32] = {0};
int i;
@@ -857,15 +862,17 @@ static int share_result( int result, struct work *work, const char *reason )
hashcount += thr_hashcount[i];
hashrate += thr_hashrates[i];
}
solved = result && ( (uint64_t)hashcount > 0 ) && (net_diff > 0.0 )
&& ( sharediff >= net_diff );
result ? accepted_share_count++ : rejected_share_count++;
if ( solved )
{
solved_block_count++;
if ( use_colors )
sprintf( sol, CL_GRN " Solved" CL_WHT " %d", solved_block_count );
sprintf( sol, CL_GRN " Solved: %d" CL_WHT, solved_block_count );
else
sprintf( sol, " Solved %d", solved_block_count );
sprintf( sol, ", Solved: %d", solved_block_count );
}
pthread_mutex_unlock(&stats_lock);
@@ -1839,26 +1846,42 @@ static void *miner_thread( void *userdata )
}
else
*/
if ( num_cpus > 1 )
{
if ( (opt_affinity == -1LL) && (opt_n_threads) > 1 )
{
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) );
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
affine_to_cpu_mask( thr_id,
(unsigned __int128)1LL << (thr_id % num_cpus) );
#if AFFINITY_USES_UINT128
// Default affinity
if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
{
if ( opt_debug )
applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
thr_id, thr_id % num_cpus,
u128_hi64( (uint128_t)1ULL << (thr_id % num_cpus) ),
u128_lo64( (uint128_t)1ULL << (thr_id % num_cpus) ) );
affine_to_cpu_mask( thr_id, (uint128_t)1ULL << (thr_id % num_cpus) );
}
#else
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
#endif
}
else if (opt_affinity != -1)
{
if ( (opt_affinity == -1LL) && opt_n_threads > 1 )
{
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
thr_id, opt_affinity);
affine_to_cpu_mask( thr_id, opt_affinity );
applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
thr_id, thr_id % num_cpus, 1LL << (thr_id % num_cpus)) ;
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
}
#endif
else // Custom affinity
{
#if AFFINITY_USES_UINT128
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to mask %016llx %016llx",
thr_id, u128_hi64( opt_affinity ),
u128_lo64( opt_affinity ) );
#else
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to mask %016llx",
thr_id, opt_affinity );
#endif
affine_to_cpu_mask( thr_id, opt_affinity );
}
}
@@ -2894,13 +2917,21 @@ void parse_arg(int key, char *arg )
break;
case 1020:
p = strstr(arg, "0x");
if (p)
ul = strtoul(p, NULL, 16);
if ( p )
ul = strtoull( p, NULL, 16 );
else
ul = atol(arg);
if (ul > (1UL<<num_cpus)-1)
ul = -1;
opt_affinity = ul;
ul = atoll( arg );
// if ( ul > ( 1ULL << num_cpus ) - 1ULL )
// ul = -1LL;
#if AFFINITY_USES_UINT128
// replicate the low 64 bits to make a full 128 bit mask if there are more
// than 64 CPUs, otherwise zero extend the upper half.
opt_affinity = (uint128_t)ul;
if ( num_cpus > 64 )
opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul;
#else
opt_affinity = ul;
#endif
break;
case 1021:
v = atoi(arg);
@@ -3299,20 +3330,18 @@ int main(int argc, char *argv[])
}
if (!rpc_userpass)
{
{
rpc_userpass = (char*) malloc(strlen(rpc_user) + strlen(rpc_pass) + 2);
if (rpc_userpass)
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
else
return 1;
if (rpc_userpass)
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
else
return 1;
}
// All options must be set before starting the gate
if ( !register_algo_gate( opt_algo, &algo_gate ) )
exit(1);
// All options must be set before starting the gate
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
if ( !check_cpu_capability() )
exit(1);
if ( !check_cpu_capability() ) exit(1);
pthread_mutex_init(&stats_lock, NULL);
pthread_mutex_init(&g_work_lock, NULL);
@@ -3325,7 +3354,7 @@ int main(int argc, char *argv[])
? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
: CURL_GLOBAL_ALL;
if (curl_global_init(flags))
{
{
applog(LOG_ERR, "CURL initialization failed");
return 1;
}
@@ -3384,6 +3413,8 @@ int main(int argc, char *argv[])
if ( num_cpus != opt_n_threads )
applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
num_cpus, opt_n_threads );
// To be reviewed
if ( opt_affinity != -1 )
{
if ( num_cpus > 64 )

File diff suppressed because it is too large Load Diff

View File

@@ -363,7 +363,7 @@ struct work {
char *job_id;
size_t xnonce2_len;
unsigned char *xnonce2;
uint32_t nonces[8];
uint32_t nonces[8]; // deprecated
} __attribute__ ((aligned (64)));
struct stratum_job {
@@ -538,6 +538,7 @@ enum algos {
ALGO_SCRYPTJANE,
ALGO_SHA256D,
ALGO_SHA256T,
ALGO_SHA256Q,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
@@ -625,6 +626,7 @@ static const char* const algo_names[] = {
"scryptjane",
"sha256d",
"sha256t",
"sha256q",
"shavite3",
"skein",
"skein2",
@@ -774,7 +776,8 @@ Options:\n\
scryptjane:nf\n\
sha256d Double SHA-256\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
shavite3 Shavite3\n\
sha256q Quad SHA-256, Pyrite (PYE)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\

183
simd-utils.h Normal file
View File

@@ -0,0 +1,183 @@
#if !defined(SIMD_UTILS_H__)
#define SIMD_UTILS_H__ 1
//////////////////////////////////////////////////////////////////////
//
// SIMD utilities
//
// Not to be confused with the hashing function of the same name. This
// is about Single Instruction Multiple Data programming using CPU
// features such as SSE and AVX.
//
// This header is the entry point to a suite of macros and functions
// to perform basic operations on vectors that are useful in crypto
// mining. Some of these functions have native CPU support for scalar
// data but not for vectors. The main categories are bit rotation
// and endian byte swapping
//
// An attempt was made to make the names as similar as possible to
// Intel's intrinsic function format. Most variations are to avoid
// confusion with actual Intel intrinsics, brevity, and clarity.
//
// This suite supports some operations on regular 64 bit integers
// as well as 128 bit integers available on recent versions of Linux
// and GCC.
//
// It also supports various vector sizes on CPUs that meet the minimum
// requirements.
//
// The minimum for any real work is a 64 bit CPU with SSE2,
// ie an the Intel Core 2.
//
// Following are the minimum requirements for each vector size. There
// is no significant 64 bit vectorization therefore SSE2 is the practical
// minimum for using this code.
//
// MMX: 64 bit vectors
// SSE2: 128 bit vectors (64 bit CPUs only, such as Intel Core2.
// AVX2: 256 bit vectors (Starting with Intel Haswell and AMD Ryzen)
// AVX512: 512 bit vectors (still under development)
//
// Most functions are avalaible at the stated levels but in rare cases
// a higher level feature may be required with no compatible alternative.
// Some SSE2 functions have versions optimized for higher feature levels
// such as SSSE3 or SSE4.1 that will be used automatically on capable
// CPUs.
//
// The vector size boundaries are respected to maintain compatibility.
// For example, an instruction introduced with AVX2 may improve 128 bit
// vector performance but will not be implemented. A CPU with AVX2 will
// tend to use 256 bit vectors. On a practical level AVX512 does introduce
// bit rotation instructions for 128 and 256 bit vectors in addition to
// its own 5a12 bit vectors. These will not be back ported to replace the
// SW implementations for the smaller vectors. This policy may be reviewed
// in the future once AVX512 is established.
//
// Strict alignment of data is required: 16 bytes for 128 bit vectors,
// 32 bytes for 256 bit vectors and 64 bytes for 512 bit vectors. 64 byte
// alignment is recommended in all cases for best cache alignment.
//
// Windows has problems with function vector arguments larger than
// 128 bits. Stack alignment is only guaranteed to 16 bytes. Always use
// pointers for larger vectors in function arguments. Macros can be
// used for larger value arguments.
//
// An attempt was made to make the names as similar as possible to
// Intel's intrinsic function format. Most variations are to avoid
// confusion with actual Intel intrinsics, brevity, and clarity
//
// The main differences are:
//
// - the leading underscore(s) "_" and the "i" are dropped from the
// prefix of vector instructions.
// - "mm64" and "mm128" used for 64 and 128 bit prefix respectively
// to avoid the ambiguity of "mm".
// - the element size does not include additional type specifiers
// like "epi".
// - some macros contain value args that are updated.
// - specialized shift and rotate functions that move elements around
// use the notation "1x32" to indicate the distance moved as units of
// the element size.
// - there is a subset of some functions for scalar data. They may have
// no prefix nor vec-size, just one size, the size of the data.
//
// Function names follow this pattern:
//
// prefix_op[esize]_[vsize]
//
// Prefix: usually the size of the largest vectors used. Following
// are some examples:
//
// u64: unsigned 64 bit integer function
// i128: signed 128 bit integer function
// m128: 128 bit vector identifier
// mm128: 128 bit vector function
//
// op: describes the operation of the function or names the data
// identifier.
//
// esize: optional, element size of operation
//
// vsize: optional, lane size used when a function operates on elements
// of vectors within lanes of a vector.
//
// Ex: mm256_ror1x64_128 rotates each 128 bit lane of a 256 bit vector
// right by 64 bits.
//
// Some random thoughts about macros and inline functions, the pros and
// cons, when to use them, etc:
//
// Macros are very convenient and efficient for statement functions.
// Macro args are passed by value and modifications are seen by the caller.
// Macros should not generally call regular functions unless it is for a
// special purpose such overloading a function name.
// Statement function macros that return a value should not end in ";"
// Statement function macros that return a value and don't modify input args
// may be used in function arguments and expressions.
// Macro args used in expressions should be protected ex: (x)+1
// Macros force inlining, function inlining can be overridden by the compiler.
// Inline functions are preferred when multiple statements or local variables
// are needed.
// The compiler can't do any syntax checking or type checking of args making
// macros difficult to debug.
// Although it is technically posssible to access the callers data without
// they being passed as arguments it is good practice to always define
// arguments even if they have the same name.
//
// General guidelines for inline functions:
//
// Inline functions should not have loops, it defeats the purpose of inlining.
// Inline functions should be short, the benefit is lost and the memory cost
// increases if the function is referenced often.
// Inline functions may call other functions, inlined or not. It is convenient
// for wrapper functions whether or not the wrapped function is itself inlined.
// Care should be taken when unrolling loops that contain calls to inlined
// functions that may be large.
// Large code blocks used only once may use function inlining to
// improve high level code readability without the penalty of function
// overhead.
//
///////////////////////////////////////////////////////
#include <inttypes.h>
#include <x86intrin.h>
#include <memory.h>
#include <stdbool.h>
// byteswap.h doesn't exist on Windows, find alternative
//#include <byteswap.h>
// Various types and overlays
#include "simd-utils/simd-types.h"
// 64 and 128 bit integers.
#include "simd-utils/simd-int.h"
#if defined(__MMX__)
// 64 bit vectors
#include "simd-utils/simd-mmx.h"
#include "simd-utils/intrlv-mmx.h"
#if defined(__SSE2__)
// 128 bit vectors
#include "simd-utils/simd-sse2.h"
#include "simd-utils/intrlv-sse2.h"
#if defined(__AVX2__)
// 256 bit vectors
#include "simd-utils/simd-avx2.h"
#include "simd-utils/intrlv-avx2.h"
// Skylake-X has all these
#if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// 512 bit vectors
#include "simd-utils/simd-avx512.h"
#include "simd-utils/intrlv-avx512.h"
#endif // MMX
#endif // SSE2
#endif // AVX2
#endif // AVX512
#endif // SIMD_UTILS_H__

Some files were not shown because too many files have changed in this diff Show More