Compare commits

...

6 Commits

Author SHA1 Message Date
Jay D Dee
7fec680835 v3.9.2.4 2019-06-07 23:30:38 -04:00
Jay D Dee
1b0a5aadf6 v3.9.2.3 2019-06-05 12:20:04 -04:00
Jay D Dee
0a3c52810e v3.9.2.2 2019-06-04 17:14:03 -04:00
Jay D Dee
4d4386a374 v3.9.2.1 2019-06-04 16:56:44 -04:00
Jay D Dee
ce259b915a v3.9.2 2019-06-03 21:36:33 -04:00
Jay D Dee
02202ab803 v3.9.1.1 2019-05-31 13:20:12 -04:00
69 changed files with 4150 additions and 5086 deletions

View File

@@ -68,7 +68,8 @@ cpuminer_SOURCES = \
algo/blake/pentablake-4way.c \
algo/blake/pentablake.c \
algo/bmw/sph_bmw.c \
algo/bmw/bmw-hash-4way.c \
algo/bmw/bmw256-hash-4way.c \
algo/bmw/bmw512-hash-4way.c \
algo/bmw/bmw256.c \
algo/cryptonight/cryptolight.c \
algo/cryptonight/cryptonight-common.c\
@@ -162,10 +163,13 @@ cpuminer_SOURCES = \
algo/sha/sph_sha2.c \
algo/sha/sph_sha2big.c \
algo/sha/sha2-hash-4way.c \
algo/sha/sha256_hash_11way.c \
algo/sha/sha2.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \
algo/sha/sha256t.c \
algo/sha/sha256q-4way.c \
algo/sha/sha256q.c \
algo/shabal/sph_shabal.c \
algo/shabal/shabal-hash-4way.c \
algo/shavite/sph_shavite.c \
@@ -262,7 +266,7 @@ cpuminer_SOURCES = \
algo/yescrypt/sha256_Y.c \
algo/yescrypt/yescrypt-best.c \
algo/yespower/yespower.c \
algo/yespower/sha256.c \
algo/yespower/sha256_p.c \
algo/yespower/yespower-opt.c
disable_flags =

View File

@@ -12,7 +12,7 @@ the software, don't use it.
Choose the exe that best matches your CPU's features or use trial and
error to find the fastest one that doesn't crash. Pay attention to
the features listed at cpuminer startup to ensure you are mining at
optimum speed using all the available features.
optimum speed using the best available features.
Architecture names and compile options used are only provided for Intel
Core series. Even the newest Pentium and Celeron CPUs are often missing
@@ -22,8 +22,6 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
Changes in v3.8.4 may have improved compatibility with some of these CPUs.
Exe name Compile flags Arch name

View File

@@ -33,11 +33,44 @@ Requirements
Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
supported.
64 bit Linux or Windows operating system. Apple is not supported.
64 bit Linux or Windows operating system. Apple and Android are not supported.
Change Log
----------
v3.9.2.4
Yet another affinity fix. Hopefully the last one.
v3.9.2.3
Another cpu-affinity fix.
Disabled test code that fails to compile on some CPUs with limited
AVX512 capabilities.
v3.9.2.2
Fixed some day one cpu-affinity issues.
v3.9.2
Added sha256q algo.
Yespower now uses openssl SHA256, but no observable hash rate increase
on Ryzen.
Ongoing rearchitecting.
Lyra2z now hashes 8-way on CPUs with AVX2.
Lyra2 (all including phi2) now runs optimized code with SSE2.
v3.9.1.1
Fixed lyra2v3 AVX and below.
Compiling on Windows using Cygwin now works. Simply use "./build.sh"
just like on Linux. The result isn't portable, therefore the binaries
package will continue to use the existing procedure.
The Cygwin procedure will be documented in more detail later and will
include a list of packages that need to be installed.
v3.9.1
Fixed AVX2 version of anime algo.

View File

@@ -210,6 +210,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
case ALGO_SCRYPTJANE: register_scryptjane_algo ( gate ); break;
case ALGO_SHA256D: register_sha256d_algo ( gate ); break;
case ALGO_SHA256T: register_sha256t_algo ( gate ); break;
case ALGO_SHA256Q: register_sha256q_algo ( gate ); break;
case ALGO_SHAVITE3: register_shavite_algo ( gate ); break;
case ALGO_SKEIN: register_skein_algo ( gate ); break;
case ALGO_SKEIN2: register_skein2_algo ( gate ); break;
@@ -344,9 +345,9 @@ const char* const algo_alias_map[][2] =
{ NULL, NULL }
};
// If arg is a valid alias for a known algo it is updated with the proper name.
// No validation of the algo or alias is done, it is the responsibility of the
// calling function to validate the algo after return.
// If arg is a valid alias for a known algo it is updated with the proper
// name. No validation of the algo or alias is done, it is the responsibility
// of the calling function to validate the algo after return.
void get_algo_alias( char** algo_or_alias )
{
int i;
@@ -361,3 +362,22 @@ void get_algo_alias( char** algo_or_alias )
#undef ALIAS
#undef PROPER
// Share submission helper for the parallel scanhash paths, where each
// candidate comes from a numbered lane.
//
// work : work item being submitted; its target ratio is updated from hash
//        via work_set_target_ratio() before submission.
// hash : hash of the winning lane.
// thr  : submitting thread's info, passed on to submit_work(); thr->id is
//        used in the log line.
// lane : lane index, used only in the log line.
//
// Returns true when submit_work() succeeds, false otherwise. The outcome
// is logged in both cases.
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr, int lane )
{
work_set_target_ratio( work, hash );

// Failure path first: warn and bail out.
if ( !submit_work( thr, work ) )
{
applog( LOG_WARNING, "Failed to submit share." );
return false;
}

// The reported share number is the running total of accepted and
// rejected shares plus one.
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr->id, lane );
return true;
}

View File

@@ -196,8 +196,9 @@ void four_way_not_tested();
int null_scanhash();
// The one and only, a callback for scanhash.
bool submit_solution( struct work *work, void *hash,
struct thr_info *thr, int lane );
bool submit_work( struct thr_info *thr, const struct work *work_in );
// displays warning

View File

@@ -41,7 +41,6 @@ extern "C"{
#endif
#include <stddef.h>
#ifdef __AVX2__
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
@@ -50,6 +49,10 @@ extern "C"{
#define SPH_SIZE_bmw512 512
#if defined(__SSE2__)
// BMW-256 4 way 32
typedef struct {
__m128i buf[64];
__m128i H[16];
@@ -59,6 +62,60 @@ typedef struct {
typedef bmw_4way_small_context bmw256_4way_context;
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif // __SSE2__
#if defined(__AVX2__)
// BMW-256 8 way 32
// Context type taken by bmw256_8way_init(), bmw256_8way() and
// bmw256_8way_close(). Declared with 64 byte alignment.
// NOTE(review): each __m256i presumably carries one 32 bit word from each
// of the 8 lanes -- confirm against the implementation.
typedef struct {
__m256i buf[64];   // presumably buffered input data -- TODO confirm
__m256i H[16];     // presumably the 16 word chaining state -- TODO confirm
size_t ptr;        // presumably the current fill position in buf -- TODO confirm
uint32_t bit_count; // assume bit_count fits in 32 bits
} bmw_8way_small_context __attribute__ ((aligned (64)));
// Algorithm specific alias for the context above.
typedef bmw_8way_small_context bmw256_8way_context;
void bmw256_8way_init( bmw256_8way_context *ctx );
void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len );
void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );
#endif
#if defined(__SSE2__)
// BMW-512 2 way 64
// Context type taken by bmw512_2way_init(), bmw512_2way() and
// bmw512_2way_close(). Declared with 64 byte alignment.
// NOTE(review): each __m128i presumably carries one 64 bit word from each
// of the 2 lanes -- confirm against the implementation.
typedef struct {
__m128i buf[16];   // presumably buffered input data -- TODO confirm
__m128i H[16];     // presumably the 16 word chaining state -- TODO confirm
size_t ptr;        // presumably the current fill position in buf -- TODO confirm
uint64_t bit_count; // 64 bit counter, unlike the 32 bit one in the BMW-256 contexts
} bmw_2way_big_context __attribute__ ((aligned (64)));
// Algorithm specific alias for the context above.
typedef bmw_2way_big_context bmw512_2way_context;
void bmw512_2way_init( bmw512_2way_context *ctx );
void bmw512_2way( bmw512_2way_context *ctx, const void *data, size_t len );
void bmw512_2way_close( bmw512_2way_context *ctx, void *dst );
#endif // __SSE2__
#if defined(__AVX2__)
// BMW-512 4 way 64
typedef struct {
__m256i buf[16];
__m256i H[16];
@@ -68,14 +125,6 @@ typedef struct {
typedef bmw_4way_big_context bmw512_4way_context;
void bmw256_4way_init(void *cc);
void bmw256_4way(void *cc, const void *data, size_t len);
void bmw256_4way_close(void *cc, void *dst);
void bmw256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
void bmw512_4way_init(void *cc);
@@ -86,10 +135,10 @@ void bmw512_4way_close(void *cc, void *dst);
void bmw512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#endif // __AVX2__
#ifdef __cplusplus
}
#endif
#endif
#endif // BMW_HASH_H__

File diff suppressed because it is too large Load Diff

1109
algo/bmw/bmw512-hash-4way.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -11,6 +11,8 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
#define SPH_FUGUE_NOCOPY 1
static const sph_u32 IV224[] = {
SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),

View File

@@ -11,6 +11,10 @@
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"

View File

@@ -8,6 +8,10 @@
#include <sys/endian.h>
#endif
#if defined(__CYGWIN__)
#include <endian.h>
#endif
#include "tmmintrin.h"
#include "smmintrin.h"
#include "immintrin.h"

View File

@@ -91,7 +91,7 @@ extern "C"{
#pragma warning (disable: 4146)
#endif
/*
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
@@ -106,7 +106,7 @@ static const sph_u64 RC[] = {
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
*/
#define kekDECL_STATE \
sph_u64 keca00, keca01, keca02, keca03, keca04; \
sph_u64 keca10, keca11, keca12, keca13, keca14; \
@@ -756,6 +756,20 @@ static const sph_u64 RC[] = {
* tested faster saving space
*/
#define KECCAK_F_1600_ do { \
static const sph_u64 RC[] = { \
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), \
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), \
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), \
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), \
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), \
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), \
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), \
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), \
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), \
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), \
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), \
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) \
}; \
int j; \
for (j = 0; j < 24; j += 4) { \
KF_ELT( 0, 1, RC[j + 0]); \
@@ -791,7 +805,7 @@ static const sph_u64 RC[] = {
/* load initial constants */
#define KEC_I
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
//static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 };
/*
unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
*/
@@ -799,6 +813,7 @@ static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0
/* load hash for loop */
#define KEC_U \
do { \
static unsigned char keczword[8] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80 }; \
/*memcpy(hashbuf, hash, 64); */ \
memcpy(hash + 64, keczword, 8); \
} while (0);

View File

@@ -90,7 +90,7 @@ void allium_4way_hash( void *state, const void *input )
}
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -100,40 +100,41 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;
swab32_array( edata, pdata, 20 );
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
blake256_4way_init( &allium_4way_ctx.blake );
blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
allium_4way_hash( hash, vdata );
pdata[19] = n;
for ( int i = 0; i < 4; i++ )
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
for ( int lane = 0; lane < 4; lane++ ) if ( (hash+(lane<<3))[7] <= Htarg )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
if ( fulltest( hash+(lane<<3), ptarget ) )
{
pdata[19] = n + lane;
submit_solution( work, hash+(lane<<3), mythr, lane );
}
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -70,7 +70,7 @@ void allium_hash(void *state, const void *input)
}
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
@@ -80,6 +80,7 @@ int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x3ffff;

View File

@@ -1,6 +1,43 @@
#include "lyra2-gate.h"
// huge pages
//
// Use MAP_PRIVATE instead
// In register algo:
// replace thread safe whole matrix with a char**
// alloc huge pages matrixsize * threads
// make pointers to each thread to each thread, creating an
// array[thread][matrix].
// Each thread can create its own matrix pointer:
// my_matrix = the matrix + ( thread_id * matrix_size )
//
// Compiler version check?
// Fallback?
//
// create a generic utility to map & unmap huge pages.
// ptr = malloc_huge( size );
// Yespower wrapper checks for 64 byte alignment, which seems unnecessary as
// it should be aligned to the page boundary. It may be desirable to
// have the matrix size rounded up if necessary to something bigger
// than 64 bytes, say 4 kbytes, a small page size.
// Define some constants for individual parameters and matrix size for
// each algo. Use the parameter constants where appropriate.
// Convert algos that don't yet do so to use dynamic allocation.
// Alloc huge pages globally. If ok, each thread will create a pointer to
// its chunk. If it fails, each thread will use _mm_alloc for itself.
// BLOCK_LEN_BYTES is 768.
#define LYRA2REV3_NROWS 4
#define LYRA2REV3_NCOLS 4
/*
#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)*(LYRA2REV3_NCOLS)* \
(LYRA2REV3_NROWS)*8)
*/
#define LYRA2REV3_MATRIX_SIZE ((BLOCK_LEN_BYTES)<<4)
__thread uint64_t* l2v3_wholeMatrix;
bool lyra2rev3_thread_init()

View File

@@ -43,25 +43,25 @@ bool register_lyra2rev2_algo( algo_gate_t* gate );
void lyra2rev2_4way_hash( void *state, const void *input );
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_4way_ctx();
#else
void lyra2rev2_hash( void *state, const void *input );
int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_lyra2rev2_ctx();
#endif
/////////////////////////
#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define LYRA2Z_4WAY
#endif
#if defined(__AVX2__)
// #define LYRA2Z_8WAY
#define LYRA2Z_8WAY
#endif
@@ -71,21 +71,21 @@ bool init_lyra2rev2_ctx();
void lyra2z_8way_hash( void *state, const void *input );
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_8way_thread_init();
#elif defined(LYRA2Z_4WAY)
void lyra2z_4way_hash( void *state, const void *input );
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_4way_thread_init();
#else
void lyra2z_hash( void *state, const void *input );
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2z_thread_init();
#endif
@@ -102,14 +102,14 @@ bool lyra2z_thread_init();
void lyra2h_4way_hash( void *state, const void *input );
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2h_4way_thread_init();
#else
void lyra2h_hash( void *state, const void *input );
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool lyra2h_thread_init();
#endif
@@ -126,14 +126,14 @@ bool register_allium_algo( algo_gate_t* gate );
void allium_4way_hash( void *state, const void *input );
int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_4way_ctx();
#else
void allium_hash( void *state, const void *input );
int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
bool init_allium_ctx();
#endif
@@ -146,7 +146,7 @@ bool register_phi2_algo( algo_gate_t* gate );
void phi2_hash( void *state, const void *input );
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_phi2_ctx();
#endif // LYRA2_GATE_H__

View File

@@ -236,7 +236,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
/*
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;

View File

@@ -50,7 +50,7 @@ void lyra2h_4way_hash( void *state, const void *input )
}
int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -63,6 +63,7 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep= vdata + 76; // 19*4
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;

View File

@@ -36,7 +36,7 @@ void lyra2h_hash( void *state, const void *input )
}
int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -45,6 +45,7 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
ptarget[7] = 0x0000ff;

View File

@@ -81,8 +81,8 @@ void lyra2re_hash(void *state, const void *input)
memcpy(state, hashA, 32);
}
int scanhash_lyra2re(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_lyra2re( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -91,6 +91,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
swab32_array( endiandata, pdata, 20 );

View File

@@ -82,7 +82,7 @@ void lyra2rev2_4way_hash( void *state, const void *input )
}
int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -95,6 +95,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
( (uint32_t*)ptarget )[7] = 0x0000ff;

View File

@@ -73,7 +73,7 @@ void lyra2rev2_hash( void *state, const void *input )
}
int scanhash_lyra2rev2(int thr_id, struct work *work,
uint32_t max_nonce, uint64_t *hashes_done)
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
@@ -82,6 +82,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;

View File

@@ -74,7 +74,6 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
const uint32_t Htarg = ptarget[7];
int num_found = 0;
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
@@ -104,13 +103,7 @@ int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;

View File

@@ -50,7 +50,7 @@ void lyra2z_4way_hash( void *state, const void *input )
}
int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -60,25 +60,23 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 76; // 19*4
__m128i *noncev = (__m128i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
lyra2z_4way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
*noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
lyra2z_4way_hash( hash, vdata );
pdata[19] = n;
@@ -87,15 +85,19 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, i );
else
applog( LOG_WARNING, "Failed to submit share." );
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif
@@ -150,14 +152,14 @@ void lyra2z_8way_hash( void *state, const void *input )
memcpy( state+ 32, hash1, 32 );
memcpy( state+ 64, hash2, 32 );
memcpy( state+ 96, hash3, 32 );
memcpy( state+128, hash1, 32 );
memcpy( state+160, hash2, 32 );
memcpy( state+192, hash3, 32 );
memcpy( state+224, hash1, 32 );
memcpy( state+128, hash4, 32 );
memcpy( state+160, hash5, 32 );
memcpy( state+192, hash6, 32 );
memcpy( state+224, hash7, 32 );
}
int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
@@ -167,15 +169,15 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 152; // 19*8
__m256i *noncev = (__m256i*)vdata + 19; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
be32enc( &edata[i], pdata[i] );
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,
edata, edata, edata, edata, 640 );
@@ -183,15 +185,8 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
lyra2z_8way_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+1, n+1 );
be32enc( noncep+2, n+2 );
be32enc( noncep+3, n+3 );
be32enc( noncep+4, n+4 );
be32enc( noncep+5, n+5 );
be32enc( noncep+6, n+6 );
be32enc( noncep+7, n+7 );
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
lyra2z_8way_hash( hash, vdata );
pdata[19] = n;
@@ -199,15 +194,13 @@ int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
{
pdata[19] = n+i;
nonces[ num_found++ ] = n+i;
work_set_target_ratio( work, hash+(i<<3) );
submit_solution( work, hash+(i<<3), mythr, i );
}
n += 8;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
} while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}

View File

@@ -44,7 +44,7 @@ void lyra2z_hash( void *state, const void *input )
}
int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -53,6 +53,7 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
if (opt_benchmark)
ptarget[7] = 0x0000ff;

View File

@@ -16,39 +16,43 @@ void lyra2z330_hash(void *state, const void *input, uint32_t height)
}
int scanhash_lyra2z330( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
if (opt_benchmark)
ptarget[7] = 0x0000ff;
uint32_t hash[8] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
if (opt_benchmark)
ptarget[7] = 0x0000ff;
do {
be32enc(&endiandata[19], nonce);
lyra2z330_hash( hash, endiandata, work->height );
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
for (int i=0; i < 19; i++)
be32enc(&endiandata[i], pdata[i]);
do
{
be32enc(&endiandata[19], nonce);
lyra2z330_hash( hash, endiandata, work->height );
if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
{
work_set_target_ratio(work, hash);
pdata[19] = nonce;
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d",
accepted_share_count + rejected_share_count + 1,
mythr->id );
else
applog( LOG_WARNING, "Failed to submit share." );
}
nonce++;
} while (nonce < max_nonce && !work_restart[thr_id].restart);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
void lyra2z330_set_target( struct work* work, double job_diff )

View File

@@ -92,42 +92,50 @@ void phi2_hash(void *state, const void *input)
memcpy(state, hash, 32);
}
int scanhash_phi2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[36];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[36];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
if(opt_benchmark){
ptarget[7] = 0x00ff;
}
phi2_has_roots = false;
for ( int i=0; i < 36; i++ )
{
be32enc(&endiandata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
phi2_has_roots = false;
for (int i=0; i < 36; i++) {
be32enc(&endiandata[i], pdata[i]);
if (i >= 20 && pdata[i]) phi2_has_roots = true;
}
do {
be32enc( &endiandata[19], n );
phi2_hash( hash, endiandata );
do {
be32enc(&endiandata[19], n);
phi2_hash(hash, endiandata);
if (hash[7] < Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
{
pdata[19] = n;
work_set_target_ratio( work, hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
accepted_share_count + rejected_share_count + 1,
thr_id );
else
applog( LOG_WARNING, "Failed to submit share." );
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 1;
}
n++;
}
n++;
} while (n < max_nonce && !work_restart[thr_id].restart);
} while ( n < max_nonce && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}

View File

@@ -108,7 +108,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
#define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
G_2X64( s0, s2, s4, s6 ); \
G_2X64( s1, s3, s5, s7 ); \
mm128_rol1x64_256( s2, s3 ); \
mm128_ror1x64_256( s2, s3 ); \
mm128_swap128_256( s4, s5 ); \
mm128_rol1x64_256( s6, s7 ); \
G_2X64( s0, s2, s4, s6 ); \
@@ -132,7 +132,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
#endif // AVX2
#endif // AVX2 else SSE2
// Scalar
//Blake2b's G function

View File

@@ -30,7 +30,7 @@
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined(__SSE4_2__)
#if defined(__SSE2__)
#include <stddef.h>
#include <string.h>
@@ -716,4 +716,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
}
#endif // __AVX2__
#endif // __SSE4_2__
#endif // __SSE2__

View File

@@ -44,7 +44,8 @@
#include "sph_types.h"
#include "avxdefs.h"
#if defined(__SSE4_2__)
#if defined(__SSE2__)
//#if defined(__SSE4_2__)
//#define SPH_SIZE_sha256 256
@@ -60,6 +61,26 @@ void sha256_4way_init( sha256_4way_context *sc );
void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
void sha256_4way_close( sha256_4way_context *sc, void *dst );
/*
// SHA-256 7 way hybrid
// Combines SSE, MMX and scalar data to do 4 + 2 + 1 parallel.
typedef struct {
__m128i bufx[64>>2];
__m128i valx[8];
__m64 bufy[64>>2];
__m64 valy[8];
uint32_t bufz[64>>2];
uint32_t valz[8];
uint32_t count_high, count_low;
} sha256_7way_context;
void sha256_7way_init( sha256_7way_context *ctx );
void sha256_7way( sha256_7way_context *ctx, const void *datax,
void *datay, void *dataz, size_t len );
void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
void *dstz );
*/
#if defined (__AVX2__)
// SHA-256 8 way
@@ -88,6 +109,24 @@ void sha512_4way_init( sha512_4way_context *sc);
void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
void sha512_4way_close( sha512_4way_context *sc, void *dst );
#endif
#endif
#endif
// SHA-256 11 way hybrid
// Combines AVX2, MMX and scalar data to do 8 + 2 + 1 parallel.
// Hashing state for 11 SHA-256 streams that are advanced together:
// 8 in an AVX2 vector (x), 2 in an MMX vector (y) and 1 scalar (z).
typedef struct {
// x lanes: pending input block, 64>>2 = 16 vectors of 32-bit words.
__m256i bufx[64>>2];
// x lanes: the 8 chaining/hash state words.
__m256i valx[8];
// y lanes: pending input block, 16 vectors of 32-bit words.
__m64 bufy[64>>2];
// y lanes: the 8 chaining/hash state words.
__m64 valy[8];
// z lane: pending input block, 16 32-bit words.
uint32_t bufz[64>>2];
// z lane: the 8 chaining/hash state words.
uint32_t valz[8];
// Running input length counter; a single counter is kept for all lanes.
uint32_t count_high, count_low;
} sha256_11way_context;
void sha256_11way_init( sha256_11way_context *ctx );
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
const void *datay, const void *dataz, size_t len );
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dstyx,
void *dstz );
#endif // __AVX2__
#endif // __SSE2__
#endif // SHA256_4WAY_H__

View File

@@ -0,0 +1,536 @@
#include <stddef.h>
#include <string.h>
#include "sha2-hash-4way.h"
#if defined(__AVX2__)
// naming convention for variables and macros
// VARx: AVX2 8 way 32 bit
// VARy: MMX 2 way 32 bit
// VARz: scalar integer 32 bit
// SHA-256 initial chaining values, H(0)[0..7].
static const uint32_t H256[8] =
{
   /* 0..3 */ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
   /* 4..7 */ 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

// SHA-256 round constants, one per round, indexed by round number.
static const uint32_t K256[64] =
{
   /*  0.. 3 */ 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
   /*  4.. 7 */ 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
   /*  8..11 */ 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
   /* 12..15 */ 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
   /* 16..19 */ 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
   /* 20..23 */ 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
   /* 24..27 */ 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
   /* 28..31 */ 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
   /* 32..35 */ 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
   /* 36..39 */ 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
   /* 40..43 */ 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
   /* 44..47 */ 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
   /* 48..51 */ 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
   /* 52..55 */ 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
   /* 56..59 */ 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
   /* 60..63 */ 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
// SHA-256 logical functions, one variant per lane type
// (x: AVX2 8x32, y: MMX 2x32, z: scalar uint32_t).

// Ch(X,Y,Z) = (X & Y) ^ (~X & Z), written as ((Y ^ Z) & X) ^ Z.
#define CHx(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )

#define CHy(X, Y, Z) \
   _mm_xor_si64( _mm_and_si64( _mm_xor_si64( Y, Z ), X ), Z )

#define CHz(X, Y, Z)   ((( (Y) ^ (Z) ) & (X) ) ^ (Z) )

// Maj(X,Y,Z): bitwise majority, written as (X & Y) | ((X | Y) & Z).
#define MAJx(X, Y, Z) \
   _mm256_or_si256( _mm256_and_si256( X, Y ), \
                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )

#define MAJy(X, Y, Z) \
   _mm_or_si64( _mm_and_si64( X, Y ), \
                _mm_and_si64( _mm_or_si64( X, Y ), Z ) )

#define MAJz(X, Y, Z)  ( ( (X) & (Y) ) | ( ( (X) | (Y) ) & (Z) ) )

// Big sigma 0: ROTR 2 ^ ROTR 13 ^ ROTR 22. All three terms are rotations;
// the third term must not be a plain shift.
#define BSG2_0x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
        mm256_ror_32(x,2), mm256_ror_32(x,13) ), mm256_ror_32(x,22) )

#define BSG2_0y(x) \
   _mm_xor_si64( _mm_xor_si64( \
        mm64_ror_32(x,2), mm64_ror_32(x,13) ), mm64_ror_32(x,22) )

#define BSG2_0z(x)  ( ror_32(x,2) ^ ror_32(x,13) ^ ror_32(x,22) )

// Big sigma 1: ROTR 6 ^ ROTR 11 ^ ROTR 25. All three terms are rotations.
#define BSG2_1x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
        mm256_ror_32(x,6), mm256_ror_32(x,11) ), mm256_ror_32(x,25) )

#define BSG2_1y(x) \
   _mm_xor_si64( _mm_xor_si64( \
        mm64_ror_32(x,6), mm64_ror_32(x,11) ), mm64_ror_32(x,25) )

#define BSG2_1z(x)  ( ror_32(x,6) ^ ror_32(x,11) ^ ror_32(x,25) )

// Small sigma 0: ROTR 7 ^ ROTR 18 ^ SHR 3 (the last term is a shift).
#define SSG2_0x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
        mm256_ror_32(x,7), mm256_ror_32(x,18) ), _mm256_srli_epi32(x,3) )

#define SSG2_0y(x) \
   _mm_xor_si64( _mm_xor_si64( \
        mm64_ror_32(x,7), mm64_ror_32(x,18) ), _mm_srli_pi32(x,3) )

#define SSG2_0z(x)  (( ror_32(x,7) ^ ror_32(x,18) ) ^ ((x)>>3) )

// Small sigma 1: ROTR 17 ^ ROTR 19 ^ SHR 10 (the last term is a shift).
#define SSG2_1x(x) \
   _mm256_xor_si256( _mm256_xor_si256( \
        mm256_ror_32(x,17), mm256_ror_32(x,19) ), _mm256_srli_epi32(x,10) )

#define SSG2_1y(x) \
   _mm_xor_si64( _mm_xor_si64( \
        mm64_ror_32(x,17), mm64_ror_32(x,19) ), _mm_srli_pi32(x,10) )

#define SSG2_1z(x)  ( ror_32(x,17) ^ ror_32(x,19) ^ ((x)>>10) )

// Message schedule expansion on the 16-entry rolling window Wx/Wy/Wz of the
// enclosing function: ssg1( W[a] ) + W[b] + ssg0( W[c] ) + W[d].
#define SHA2x_MEXP( a, b, c, d ) \
   _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
         SSG2_1x( Wx[a] ), Wx[b] ), SSG2_0x( Wx[c] ) ), Wx[d] )

#define SHA2y_MEXP( a, b, c, d ) \
   _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
         SSG2_1y( Wy[a] ), Wy[b] ), SSG2_0y( Wy[c] ) ), Wy[d] )

#define SHA2z_MEXP( a, b, c, d ) \
   ( SSG2_1z( Wz[a] ) + Wz[b] + SSG2_0z( Wz[c] ) + Wz[d] )

// One SHA-256 round applied to all 11 lanes. i is the index into the rolling
// W window, j the round offset of the current group of 16, so the round
// constant used is K256[j+i]. Only D and H are written; the caller rotates
// the role of the working variables by permuting the arguments.
#define SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
                          Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
                          Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz, i, j) \
do { \
   __m256i T1x, T2x; \
   __m64 T1y, T2y; \
   uint32_t T1z, T2z; \
   T1x = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
            _mm256_add_epi32( Hx, BSG2_1x(Ex) ), CHx(Ex, Fx, Gx) ), \
               _mm256_set1_epi32( K256[( (j)+(i) )] ) ), Wx[i] ); \
   T1y = _mm_add_pi32( _mm_add_pi32( _mm_add_pi32( \
            _mm_add_pi32( Hy, BSG2_1y(Ey) ), CHy(Ey, Fy, Gy) ), \
               _mm_set1_pi32( K256[( (j)+(i) )] ) ), Wy[i] ); \
   T1z = Hz + BSG2_1z( Ez ) + CHz( Ez, Fz, Gz ) + K256[ ((j)+(i)) ] + Wz[i]; \
   T2x = _mm256_add_epi32( BSG2_0x(Ax), MAJx(Ax, Bx, Cx) ); \
   T2y = _mm_add_pi32( BSG2_0y(Ay), MAJy(Ay, By, Cy) ); \
   T2z = BSG2_0z( Az ) + MAJz( Az, Bz, Cz ); \
   Dx = _mm256_add_epi32( Dx, T1x ); \
   Dy = _mm_add_pi32( Dy, T1y ); \
   Dz = Dz + T1z; \
   Hx = _mm256_add_epi32( T1x, T2x ); \
   Hy = _mm_add_pi32( T1y, T2y ); \
   Hz = T1z + T2z; \
} while (0)
// Sixteen consecutive rounds, j .. j+15, on all 11 lanes. The working
// variables are rotated one position per round through the argument order
// rather than by moving values. Operates on the locals of
// sha256_11way_round (A..H and W for the x, y and z lanes).
#define SHA2s_11WAY_16ROUNDS( j ) \
do { \
   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  0, j ); \
   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, \
                     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, \
                     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  1, j ); \
   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, \
                     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, \
                     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz,  2, j ); \
   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, \
                     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, \
                     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez,  3, j ); \
   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, \
                     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, \
                     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz,  4, j ); \
   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, \
                     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, \
                     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz,  5, j ); \
   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, \
                     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, \
                     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz,  6, j ); \
   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, \
                     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, \
                     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az,  7, j ); \
   SHA2s_11WAY_STEP( Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx, \
                     Ay, By, Cy, Dy, Ey, Fy, Gy, Hy, \
                     Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz,  8, j ); \
   SHA2s_11WAY_STEP( Hx, Ax, Bx, Cx, Dx, Ex, Fx, Gx, \
                     Hy, Ay, By, Cy, Dy, Ey, Fy, Gy, \
                     Hz, Az, Bz, Cz, Dz, Ez, Fz, Gz,  9, j ); \
   SHA2s_11WAY_STEP( Gx, Hx, Ax, Bx, Cx, Dx, Ex, Fx, \
                     Gy, Hy, Ay, By, Cy, Dy, Ey, Fy, \
                     Gz, Hz, Az, Bz, Cz, Dz, Ez, Fz, 10, j ); \
   SHA2s_11WAY_STEP( Fx, Gx, Hx, Ax, Bx, Cx, Dx, Ex, \
                     Fy, Gy, Hy, Ay, By, Cy, Dy, Ey, \
                     Fz, Gz, Hz, Az, Bz, Cz, Dz, Ez, 11, j ); \
   SHA2s_11WAY_STEP( Ex, Fx, Gx, Hx, Ax, Bx, Cx, Dx, \
                     Ey, Fy, Gy, Hy, Ay, By, Cy, Dy, \
                     Ez, Fz, Gz, Hz, Az, Bz, Cz, Dz, 12, j ); \
   SHA2s_11WAY_STEP( Dx, Ex, Fx, Gx, Hx, Ax, Bx, Cx, \
                     Dy, Ey, Fy, Gy, Hy, Ay, By, Cy, \
                     Dz, Ez, Fz, Gz, Hz, Az, Bz, Cz, 13, j ); \
   SHA2s_11WAY_STEP( Cx, Dx, Ex, Fx, Gx, Hx, Ax, Bx, \
                     Cy, Dy, Ey, Fy, Gy, Hy, Ay, By, \
                     Cz, Dz, Ez, Fz, Gz, Hz, Az, Bz, 14, j ); \
   SHA2s_11WAY_STEP( Bx, Cx, Dx, Ex, Fx, Gx, Hx, Ax, \
                     By, Cy, Dy, Ey, Fy, Gy, Hy, Ay, \
                     Bz, Cz, Dz, Ez, Fz, Gz, Hz, Az, 15, j ); \
} while (0)

// Compress one 64-byte block into the chaining state of all 11 lanes.
//
//   inx, iny, inz: the 16 input words of the block for the AVX2, MMX and
//                  scalar lanes; each word is byte swapped before use.
//   rx, ry, rz:    the 8 state words per lane type, updated in place.
void sha256_11way_round( __m256i *inx, __m256i rx[8], __m64 *iny, __m64 ry[8],
                         uint32_t *inz, uint32_t rz[8] )
{
   __m256i Ax, Bx, Cx, Dx, Ex, Fx, Gx, Hx;
   __m256i Wx[16];
   __m64 Ay, By, Cy, Dy, Ey, Fy, Gy, Hy;
   __m64 Wy[16];
   uint32_t Az, Bz, Cz, Dz, Ez, Fz, Gz, Hz;
   uint32_t Wz[16];

   // Load the block into the rolling message window.
   for ( int i = 0; i < 16; i++ )
   {
      Wx[i] = mm256_bswap_32( inx[i] );
      Wy[i] = mm64_bswap_32( iny[i] );
      Wz[i] = bswap_32( inz[i] );
   }

   Ax = rx[0];   Ay = ry[0];   Az = rz[0];
   Bx = rx[1];   By = ry[1];   Bz = rz[1];
   Cx = rx[2];   Cy = ry[2];   Cz = rz[2];
   Dx = rx[3];   Dy = ry[3];   Dz = rz[3];
   Ex = rx[4];   Ey = ry[4];   Ez = rz[4];
   Fx = rx[5];   Fy = ry[5];   Fz = rz[5];
   Gx = rx[6];   Gy = ry[6];   Gz = rz[6];
   Hx = rx[7];   Hy = ry[7];   Hz = rz[7];

   // Rounds 0..15 consume the input words directly.
   SHA2s_11WAY_16ROUNDS( 0 );

   // Rounds 16..63: expand the window in place, in ascending order so each
   // entry sees the already updated predecessors, then run 16 more rounds.
   for ( int j = 16; j < 64; j += 16 )
   {
      for ( int i = 0; i < 16; i++ )
      {
         Wx[i] = SHA2x_MEXP( (i+14) & 15, (i+9) & 15, (i+1) & 15, i );
         Wy[i] = SHA2y_MEXP( (i+14) & 15, (i+9) & 15, (i+1) & 15, i );
         Wz[i] = SHA2z_MEXP( (i+14) & 15, (i+9) & 15, (i+1) & 15, i );
      }
      SHA2s_11WAY_16ROUNDS( j );
   }

   // Feed forward: every one of the 8 state words, in every lane type, must
   // absorb its working variable.
   rx[0] = _mm256_add_epi32( rx[0], Ax );
   ry[0] = _mm_add_pi32( ry[0], Ay );
   rz[0] = rz[0] + Az;
   rx[1] = _mm256_add_epi32( rx[1], Bx );
   ry[1] = _mm_add_pi32( ry[1], By );
   rz[1] = rz[1] + Bz;
   rx[2] = _mm256_add_epi32( rx[2], Cx );
   ry[2] = _mm_add_pi32( ry[2], Cy );
   rz[2] = rz[2] + Cz;
   rx[3] = _mm256_add_epi32( rx[3], Dx );
   ry[3] = _mm_add_pi32( ry[3], Dy );
   rz[3] = rz[3] + Dz;
   rx[4] = _mm256_add_epi32( rx[4], Ex );
   ry[4] = _mm_add_pi32( ry[4], Ey );
   rz[4] = rz[4] + Ez;
   rx[5] = _mm256_add_epi32( rx[5], Fx );
   ry[5] = _mm_add_pi32( ry[5], Fy );
   rz[5] = rz[5] + Fz;
   rx[6] = _mm256_add_epi32( rx[6], Gx );
   ry[6] = _mm_add_pi32( ry[6], Gy );
   rz[6] = rz[6] + Gz;
   rx[7] = _mm256_add_epi32( rx[7], Hx );
   ry[7] = _mm_add_pi32( ry[7], Hy );
   rz[7] = rz[7] + Hz;
}
// Reset an 11 way context: clear the length counter and load the SHA-256
// initial hash values into the state of every lane. State word i of each
// lane gets H256[i], broadcast across the vector for the x and y lanes.
void sha256_11way_init( sha256_11way_context *ctx )
{
   ctx->count_high = ctx->count_low = 0;
   for ( int i = 0; i < 8; i++ )
   {
      ctx->valx[i] = _mm256_set1_epi32( H256[i] );
      ctx->valy[i] = _mm_set1_pi32( H256[i] );
   }
   memcpy( ctx->valz, H256, sizeof( ctx->valz ) );
}
// Absorb len bytes per lane into the context.
//
//   datax, datay, dataz: input for the AVX2, MMX and scalar lanes.
//   len:                 number of bytes per lane.
//
// Byte offsets and lengths are converted to 32-bit word counts with >>2, so
// this assumes len and the current buffer fill are multiples of 4.
void sha256_11way_update( sha256_11way_context *ctx, const void *datax,
                          const void *datay, const void *dataz, size_t len )
{
   __m256i  *vdatax = (__m256i*) datax;
   __m64    *vdatay = (__m64*)   datay;
   uint32_t *idataz = (uint32_t*)dataz;
   const int buf_size = 64;
   size_t ptr = (unsigned)ctx->count_low & (buf_size - 1U);

   while ( len > 0 )
   {
      uint32_t clow, clow2;
      size_t clen = buf_size - ptr;
      if ( clen > len )
         clen = len;

      // ptr is a byte offset into the block; all three buffers hold 32-bit
      // words (or vectors of them), so they are indexed by ptr>>2.
      memcpy_256( ctx->bufx + (ptr>>2), vdatax, clen>>2 );
      memcpy_64 ( ctx->bufy + (ptr>>2), vdatay, clen>>2 );
      memcpy    ( ctx->bufz + (ptr>>2), idataz, clen );

      // Advance the sources by what was consumed so that input spanning
      // more than one block is read from the right place.
      vdatax += clen>>2;
      vdatay += clen>>2;
      idataz += clen>>2;
      ptr += clen;
      len -= clen;

      if ( ptr == buf_size )
      {
         sha256_11way_round( ctx->bufx, ctx->valx,
                             ctx->bufy, ctx->valy,
                             ctx->bufz, ctx->valz );
         ptr = 0;
      }

      // 64-bit byte counter kept as two 32-bit halves; carry on wrap.
      clow = ctx->count_low;
      clow2 = clow + clen;
      ctx->count_low = clow2;
      if ( clow2 < clow )
         ctx->count_high++;
   }
}
// Finish the hash: append the 0x80 padding word, zero fill, append the
// message length in bits, run the final compression and write the 8 byte
// swapped state words of each lane type to dstx, dsty and dstz.
void sha256_11way_close( sha256_11way_context *ctx, void *dstx, void *dsty,
                         void *dstz)
{
   unsigned ptr, u;
   uint32_t low, high;
   const int buf_size = 64;
   const int pad = buf_size - 8;   // the length field occupies the last 8 bytes

   ptr = (unsigned)ctx->count_low & (buf_size - 1U);
   ctx->bufx[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
   ctx->bufy[ ptr>>2 ] = _mm_set1_pi32( 0x80 );
   ctx->bufz[ ptr>>2 ] = 0x80;
   ptr += 4;

   // The vector helpers are given a count of words (bytes >> 2); memset on
   // the scalar buffer takes a count of bytes, so it gets the unshifted value.
   if ( ptr > pad )
   {
      // No room left for the length: pad out this block, compress it and
      // start a fresh block of zeros.
      memset_zero_256( ctx->bufx + (ptr>>2), (buf_size - ptr) >> 2 );
      memset_zero_64( ctx->bufy + (ptr>>2), (buf_size - ptr) >> 2 );
      memset( ctx->bufz + (ptr>>2), 0, buf_size - ptr );
      sha256_11way_round( ctx->bufx, ctx->valx,
                          ctx->bufy, ctx->valy,
                          ctx->bufz, ctx->valz );
      memset_zero_256( ctx->bufx, pad >> 2 );
      memset_zero_64( ctx->bufy, pad >> 2 );
      memset( ctx->bufz, 0, pad );
   }
   else
   {
      memset_zero_256( ctx->bufx + (ptr>>2), (pad - ptr) >> 2 );
      memset_zero_64( ctx->bufy + (ptr>>2), (pad - ptr) >> 2 );
      memset( ctx->bufz + (ptr>>2), 0, pad - ptr );
   }

   // Convert the byte count to a 64-bit bit count split into two words.
   low = ctx->count_low;
   high = (ctx->count_high << 3) | (low >> 29);
   low = low << 3;

   // The round function byte swaps its input, so the length words are
   // stored pre-swapped.
   ctx->bufx[ pad >> 2 ] =
                   mm256_bswap_32( _mm256_set1_epi32( high ) );
   ctx->bufy[ pad >> 2 ] =
                   mm64_bswap_32( _mm_set1_pi32( high ) );
   ctx->bufz[ pad >> 2 ] =
                   bswap_32( high );
   ctx->bufx[ ( pad+4 ) >> 2 ] =
                   mm256_bswap_32( _mm256_set1_epi32( low ) );
   ctx->bufy[ ( pad+4 ) >> 2 ] =
                   mm64_bswap_32( _mm_set1_pi32( low ) );
   ctx->bufz[ ( pad+4 ) >> 2 ] =
                   bswap_32( low );

   sha256_11way_round( ctx->bufx, ctx->valx,
                       ctx->bufy, ctx->valy,
                       ctx->bufz, ctx->valz );

   for ( u = 0; u < 8; u ++ )
   {
      casti_m256i( dstx, u ) = mm256_bswap_32( ctx->valx[u] );
      casti_m64  ( dsty, u ) = mm64_bswap_32( ctx->valy[u] );
      ((uint32_t*)dstz)[u]   = bswap_32( ctx->valz[u] );
   }
}
#endif

219
algo/sha/sha256q-4way.c Normal file
View File

@@ -0,0 +1,219 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha2-hash-4way.h"
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));

// Quadruple SHA-256 over 8 interleaved lanes.
// The first pass resumes from the per-thread midstate in sha256_ctx8 and
// absorbs the 16 bytes per lane that follow the first 64 bytes per lane of
// input. Three further passes each hash the previous 32 byte digest; the
// last one writes to output.
void sha256q_8way_hash( void* output, const void* input )
{
   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
   sha256_8way_context ctx;
   const char *tail = (const char*)input + ( 64 << 3 );

   // Pass 1: finish the header hash from the saved midstate.
   memcpy( &ctx, &sha256_ctx8, sizeof( ctx ) );
   sha256_8way( &ctx, tail, 16 );
   sha256_8way_close( &ctx, vhash );

   // Passes 2 and 3: rehash in place.
   for ( int pass = 0; pass < 2; pass++ )
   {
      sha256_8way_init( &ctx );
      sha256_8way( &ctx, vhash, 32 );
      sha256_8way_close( &ctx, vhash );
   }

   // Pass 4: final digest goes to the caller.
   sha256_8way_init( &ctx );
   sha256_8way( &ctx, vhash, 32 );
   sha256_8way_close( &ctx, output );
}
// Scan nonces 8 at a time for sha256q.
// Starts at work->data[19], submits every lane whose hash passes fulltest
// through submit_work, and stops when the nonce approaches max_nonce or a
// work restart is flagged for this thread. Stores the scanned count in
// *hashes_done and always returns 0. Nothing is scanned when the target's
// top word exceeds the largest htmax entry.
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t hash[8*8] __attribute__ ((aligned (32)));
   uint32_t edata[20] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t *hash7 = &(hash[7<<3]);    // word 7 of every lane
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m256i *noncev = (__m256i*)vdata + 19;   // aligned
   const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
   const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                              0xFFFFF000, 0xFFFF0000, 0 };
   int m = 0;

   thr_id = mythr->id;  // thr_id arg is deprecated

   // Need big endian data
   casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
   casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );

   // The same header goes to all 8 lanes; hash its first 64 bytes once.
   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
                          edata, edata, edata, edata, 640 );
   sha256_8way_init( &sha256_ctx8 );
   sha256_8way( &sha256_ctx8, vdata, 64 );

   // Use the first table entry that covers the target as the cheap
   // pre-filter mask on hash word 7.
   while ( m < 6 && Htarg > htmax[m] )
      m++;

   if ( m < 6 )
   {
      const uint32_t mask = masks[m];
      do
      {
         *noncev = mm256_bswap_32(
                  _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
         pdata[19] = n;
         sha256q_8way_hash( hash, vdata );

         for ( int lane = 0; lane < 8; lane++ )
         {
            uint32_t lane_hash[8];

            if ( hash7[ lane ] & mask )
               continue;

            // deinterleave hash for lane
            mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
            if ( !fulltest( lane_hash, ptarget ) )
               continue;

            pdata[19] = n + lane;
            work_set_target_ratio( work, lane_hash );
            if ( submit_work( mythr, work ) )
               applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
                       accepted_share_count + rejected_share_count + 1,
                       thr_id, lane );
            else
               applog( LOG_WARNING, "Failed to submit share." );
         }
         n += 8;
      } while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
   }

   *hashes_done = n - first_nonce + 1;
   return 0;
}
#endif
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));

// Quadruple SHA-256 over 4 interleaved lanes.
// The first pass resumes from the per-thread midstate in sha256_ctx4 and
// absorbs the 16 bytes per lane that follow the first 64 bytes per lane of
// input. Three further passes each hash the previous 32 byte digest; the
// last one writes to output.
void sha256q_4way_hash( void* output, const void* input )
{
   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
   sha256_4way_context ctx;
   const char *tail = (const char*)input + ( 64 << 2 );

   // Pass 1: finish the header hash from the saved midstate.
   memcpy( &ctx, &sha256_ctx4, sizeof( ctx ) );
   sha256_4way( &ctx, tail, 16 );
   sha256_4way_close( &ctx, vhash );

   // Passes 2 and 3: rehash in place.
   for ( int pass = 0; pass < 2; pass++ )
   {
      sha256_4way_init( &ctx );
      sha256_4way( &ctx, vhash, 32 );
      sha256_4way_close( &ctx, vhash );
   }

   // Pass 4: final digest goes to the caller.
   sha256_4way_init( &ctx );
   sha256_4way( &ctx, vhash, 32 );
   sha256_4way_close( &ctx, output );
}
// Scan nonces 4 at a time for sha256q.
// Starts at work->data[19], submits every lane whose hash passes fulltest
// through submit_work, and stops when the nonce approaches max_nonce or a
// work restart is flagged for this thread. Stores the scanned count in
// *hashes_done and always returns 0. Nothing is scanned when the target's
// top word exceeds the largest htmax entry.
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<2]);    // word 7 of every lane
   uint32_t lane_hash[8];
   uint32_t edata[20] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m128i *noncev = (__m128i*)vdata + 19;   // aligned
   const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
   const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                              0xFFFFF000, 0xFFFF0000, 0 };
   int m = 0;

   thr_id = mythr->id;  // thr_id arg is deprecated

   // Byte swap the 20 word header, 4 words at a time.
   for ( int k = 0; k < 5; k++ )
      casti_m128i( edata, k ) = mm128_bswap_32( casti_m128i( pdata, k ) );

   // The same header goes to all 4 lanes; hash its first 64 bytes once.
   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   sha256_4way_init( &sha256_ctx4 );
   sha256_4way( &sha256_ctx4, vdata, 64 );

   // Use the first table entry that covers the target as the cheap
   // pre-filter mask on hash word 7.
   while ( m < 6 && Htarg > htmax[m] )
      m++;

   if ( m < 6 )
   {
      const uint32_t mask = masks[m];
      do
      {
         *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
         pdata[19] = n;
         sha256q_4way_hash( hash, vdata );

         for ( int lane = 0; lane < 4; lane++ )
         {
            if ( hash7[ lane ] & mask )
               continue;

            mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
            if ( !fulltest( lane_hash, ptarget ) )
               continue;

            pdata[19] = n + lane;
            work_set_target_ratio( work, lane_hash );
            if ( submit_work( mythr, work ) )
               applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
                       accepted_share_count + rejected_share_count + 1,
                       thr_id, lane );
            else
               applog( LOG_WARNING, "Failed to submit share." );
         }
         n += 4;
      } while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
   }

   *hashes_done = n - first_nonce + 1;
   return 0;
}
#endif

113
algo/sha/sha256q.c Normal file
View File

@@ -0,0 +1,113 @@
#include "sha256t-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <openssl/sha.h>
static __thread SHA256_CTX sha256q_ctx __attribute__ ((aligned (64)));

// Hash the first 64 bytes of input into the per-thread context sha256q_ctx
// so later calls to sha256q_hash can resume from it.
void sha256q_midstate( const void* input )
{
   SHA256_CTX *const mid = &sha256q_ctx;

   SHA256_Init( mid );
   SHA256_Update( mid, input, 64 );
}
// Quadruple SHA-256 of an 80 byte input.
// The first pass resumes from the midstate saved in sha256q_ctx (covering
// the first 64 bytes) and absorbs the remaining 16 bytes. The digest is then
// hashed three more times and the final 32 bytes are copied to output.
void sha256q_hash( void* output, const void* input )
{
   uint32_t _ALIGN(64) hash[16];
   const int midlen = 64;            // bytes
   const int tail   = 80 - midlen;   // 16
   const unsigned char *rest = (const unsigned char*)input + midlen;
   SHA256_CTX ctx __attribute__ ((aligned (64)));

   // Pass 1: finish the 80 byte hash from the saved midstate.
   memcpy( &ctx, &sha256q_ctx, sizeof sha256q_ctx );
   SHA256_Update( &ctx, rest, tail );
   SHA256_Final( (unsigned char*)hash, &ctx );

   // Passes 2..4: hash the previous digest, in place.
   for ( int pass = 0; pass < 3; pass++ )
   {
      SHA256_Init( &ctx );
      SHA256_Update( &ctx, hash, 32 );
      SHA256_Final( (unsigned char*)hash, &ctx );
   }

   memcpy( output, hash, 32 );
}
// Scalar nonce scan for sha256q.
// Tries nonces from work->data[19] upward, submitting each hash that passes
// the mask pre-filter and fulltest through submit_work, until max_nonce is
// reached or a work restart is flagged for this thread. Stores the scanned
// count in *hashes_done, leaves the last nonce in work->data[19] and always
// returns 0. Nothing is scanned when the target's top word exceeds the
// largest htmax entry.
int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   const uint32_t Htarg = ptarget[7];
   uint32_t n = first_nonce - 1;    // pre-incremented at the top of the loop
#ifdef _MSC_VER
   uint32_t __declspec(align(32)) hash64[8];
#else
   uint32_t hash64[8] __attribute__((aligned(32)));
#endif
   uint32_t endiandata[32];
   uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
                        0xFFFFF000, 0xFFFF0000, 0 };
   int m = 0;

   thr_id = mythr->id;  // thr_id arg is deprecated

   // we need bigendian data...
   for ( int k = 0; k < 5; k++ )
      casti_m128i( endiandata, k ) = mm128_bswap_32( casti_m128i( pdata, k ) );

   sha256q_midstate( endiandata );

   // Use the first table entry that covers the target as the cheap
   // pre-filter mask on hash word 7.
   while ( m < 6 && Htarg > htmax[m] )
      m++;

   if ( m < 6 )
   {
      const uint32_t mask = masks[m];
      do
      {
         n++;
         pdata[19] = n;
         be32enc(&endiandata[19], n);
         sha256q_hash( hash64, endiandata );

         if ( ( hash64[7] & mask ) == 0 && fulltest( hash64, ptarget ) )
         {
            work_set_target_ratio( work, hash64 );
            if ( submit_work( mythr, work ) )
               applog( LOG_NOTICE, "Share %d submitted by thread %d.",
                       accepted_share_count + rejected_share_count + 1,
                       thr_id );
            else
               applog( LOG_WARNING, "Failed to submit share." );
            *hashes_done = n - first_nonce + 1;
         }
      } while ( n < max_nonce && !work_restart[thr_id].restart );
   }

   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
   return 0;
}

View File

@@ -5,6 +5,137 @@
#include <stdio.h>
#include "sha2-hash-4way.h"
#if defined(SHA256T_11WAY)
static __thread sha256_11way_context sha256_ctx11 __attribute__ ((aligned (64)));

// Triple SHA-256 over 11 lanes (8 AVX2 + 2 MMX + 1 scalar).
// The first pass resumes from the per-thread midstate in sha256_ctx11 and
// absorbs the 16 bytes per lane that follow the first 64 bytes per lane of
// each input. The digest is hashed twice more; the final result is written
// to outx, outy and outz.
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
                         const void *inpy, const void*inpz )
{
   uint32_t hashx[8*8] __attribute__ ((aligned (64)));
   uint32_t hashy[8*2] __attribute__ ((aligned (64)));
   uint32_t hashz[8]   __attribute__ ((aligned (64)));
   sha256_11way_context ctx;
   // Skip the 64 bytes per lane already covered by the midstate.
   const void *tailx = (const char*)inpx + ( 64 << 3 );
   const void *taily = (const char*)inpy + ( 64 << 1 );
   const void *tailz = (const char*)inpz + 64;

   // Pass 1: finish the header hash from the saved midstate.
   memcpy( &ctx, &sha256_ctx11, sizeof( ctx ) );
   sha256_11way_update( &ctx, tailx, taily, tailz, 16 );
   sha256_11way_close( &ctx, hashx, hashy, hashz );

   // Pass 2: rehash in place.
   sha256_11way_init( &ctx );
   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
   sha256_11way_close( &ctx, hashx, hashy, hashz );

   // Pass 3: final digest goes to the caller.
   sha256_11way_init( &ctx );
   sha256_11way_update( &ctx, hashx, hashy, hashz, 32 );
   sha256_11way_close( &ctx, outx, outy, outz );
}
// Scan nonces 11 at a time for sha256t: nonces n..n+7 go to the AVX2 lanes,
// n+8 and n+9 to the MMX lanes and n+10 to the scalar lane.
// Every lane whose hash passes the mask pre-filter and fulltest is handed to
// submit_solution. Scanning stops when the nonce approaches max_nonce or a
// work restart is flagged for this thread. Stores the scanned count in
// *hashes_done and always returns 0. Nothing is scanned when the target's
// top word exceeds the largest htmax entry.
int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
{
   uint32_t datax[20*8] __attribute__ ((aligned (64)));
   uint32_t datay[20*2] __attribute__ ((aligned (32)));
   uint32_t dataz[20]   __attribute__ ((aligned (32)));
   uint32_t hashx[8*8]  __attribute__ ((aligned (32)));
   uint32_t hashy[8*2]  __attribute__ ((aligned (32)));
   uint32_t hashz[8]    __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7;
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m256i  *noncex = (__m256i*) datax + 19;
   __m64    *noncey = (__m64*)   datay + 19;
   uint32_t *noncez = (uint32_t*)dataz + 19;
   /* int */ thr_id = mythr->id;  // thr_id arg is deprecated
   int i;
   const uint64_t htmax[] = {          0,
                                     0xF,
                                    0xFF,
                                   0xFFF,
                                  0xFFFF,
                              0x10000000 };
   const uint32_t masks[] = { 0xFFFFFFFF,
                              0xFFFFFFF0,
                              0xFFFFFF00,
                              0xFFFFF000,
                              0xFFFF0000,
                                       0 };

   // Use dataz (scalar) to stage bswapped data for the vectors.
   casti_m256i( dataz, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
   casti_m256i( dataz, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
   casti_m128i( dataz, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );

   mm256_interleave_8x32( datax, dataz, dataz, dataz, dataz,
                                 dataz, dataz, dataz, dataz, 640 );
   mm64_interleave_2x32( datay, dataz, dataz, 640 );

   // Hash the first 64 bytes once; only the nonce word changes per pass.
   sha256_11way_init( &sha256_ctx11 );
   sha256_11way_update( &sha256_ctx11, datax, datay, dataz, 64 );

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
      uint32_t mask = masks[m];
      do
      {
         *noncex = mm256_bswap_32(
                  _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
         *noncey = mm64_bswap_32( _mm_set_pi32( n+9, n+8 ) );
         *noncez = bswap_32( n+10 );
         pdata[19] = n;

         sha256t_11way_hash( hashx, hashy, hashz, datax, datay, dataz );

         // Benchmark mode only measures hashing, no result checking.
         if ( opt_benchmark ) { n += 11; continue; }

         // AVX2 lanes: word 7 of lane i is hashx[7*8 + i].
         hash7 = &(hashx[7<<3]);
         for ( i = 0; i < 8; i++ ) if ( !( hash7[ i ] & mask ) )
         {
            // deinterleave hash for lane
            mm256_extract_lane_8x32( lane_hash, hashx, i, 256 );
            if ( fulltest( lane_hash, ptarget ) )
            {
               pdata[19] = n + i;
               submit_solution( work, lane_hash, mythr, i );
            }
         }

         // MMX lanes: word 7 of lane i is hashy[7*2 + i]. Each lane must be
         // filtered on its own word, not on lane 0's.
         hash7 = &(hashy[7<<1]);
         for( i = 0; i < 2; i++ ) if ( !( hash7[ i ] & mask ) )
         {
            mm64_extract_lane_2x32( lane_hash, hashy, i, 256 );
            if ( fulltest( lane_hash, ptarget ) )
            {
               pdata[19] = n + 8 + i;
               submit_solution( work, lane_hash, mythr, i+8 );
            }
         }

         // Scalar lane.
         if ( !(hashz[7] & mask ) && fulltest( hashz, ptarget ) )
         {
            pdata[19] = n+10;
            submit_solution( work, hashz, mythr, 10 );
         }
         n += 11;

      } while ( (n < max_nonce-12) && !work_restart[thr_id].restart );
      break;
   }

   *hashes_done = n - first_nonce + 1;
   return 0;
}
#endif
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
@@ -29,7 +160,7 @@ void sha256t_8way_hash( void* output, const void* input )
}
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
@@ -71,44 +202,38 @@ int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
do
{
*noncev = mm256_bswap_32(
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
_mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
pdata[19] = n;
sha256t_8way_hash( hash, vdata );
uint32_t *hash7 = &(hash[7<<3]);
uint32_t *hash7 = &(hash[7<<3]);
for ( int lane = 0; lane < 8; lane++ )
if ( !( hash7[ lane ] & mask ) )
{
{
// deinterleave hash for lane
uint32_t lane_hash[8];
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr, lane );
}
}
}
n += 8;
} while ( (n < max_nonce-10) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}
#elif defined(SHA256T_4WAY)
#endif
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
@@ -136,9 +261,9 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[8*4] __attribute__ ((aligned (32)));
uint32_t *hash7 = &(hash[7<<2]);
uint32_t lane_hash[8];
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t edata[20] __attribute__ ((aligned (32)));;
uint32_t *hash7 = &(hash[7<<2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
@@ -187,22 +312,14 @@ int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
submit_solution( work, lane_hash, mythr, lane );
}
}
n += 4;
} while ( (n < max_nonce - 4) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return 0;
}

View File

@@ -2,16 +2,20 @@
bool register_sha256t_algo( algo_gate_t* gate )
{
#if defined(SHA256T_8WAY)
gate->optimizations = SSE42_OPT | AVX2_OPT;
#if defined(SHA256T_11WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_11way;
gate->hash = (void*)&sha256t_11way_hash;
#elif defined(SHA256T_8WAY)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_8way;
gate->hash = (void*)&sha256t_8way_hash;
#elif defined(SHA256T_4WAY)
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t_4way;
gate->hash = (void*)&sha256t_4way_hash;
#else
gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256t;
gate->hash = (void*)&sha256t_hash;
#endif
@@ -19,3 +23,23 @@ bool register_sha256t_algo( algo_gate_t* gate )
return true;
}
// Populate the algo gate for the sha256q algorithm.
//
// gate: gate structure to fill in; its optimizations, scanhash, hash and
//       get_max64 members are written here.
// Returns true unconditionally.
//
// The scanhash/hash pair is chosen at compile time. Note that the selection
// is keyed on the SHA256T_8WAY / SHA256T_4WAY macros; no sha256q-specific
// macro is tested in this function.
bool register_sha256q_algo( algo_gate_t* gate )
{
#if defined(SHA256T_8WAY)
// 8-way build: use the 8-way sha256q scanhash and hash functions.
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q_8way;
gate->hash = (void*)&sha256q_8way_hash;
#elif defined(SHA256T_4WAY)
// 4-way build: same optimization flags as the 8-way branch, 4-way functions.
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q_4way;
gate->hash = (void*)&sha256q_4way_hash;
#else
// Neither multi-way macro defined: fall back to the plain sha256q functions
// and advertise only SHA_OPT.
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256q;
gate->hash = (void*)&sha256q_hash;
#endif
// Installed for every build variant.
// NOTE(review): get_max64_0x3ffff is defined elsewhere; presumably it bounds
// the amount of work per scan - confirm against its definition.
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
}

View File

@@ -6,34 +6,55 @@
// Override multi way on ryzen, SHA is better.
#if !defined(RYZEN_)
#if defined(__SSE4_2__)
#if defined(__SSE2__)
#define SHA256T_4WAY
#endif
#if defined(__AVX2__)
#define SHA256T_8WAY
// #define SHA256T_11WAY
#endif
#endif
bool register_blake2s_algo( algo_gate_t* gate );
bool register_sha256t_algo( algo_gate_t* gate );
bool register_sha256q_algo( algo_gate_t* gate );
#if defined(SHA256T_11WAY)
void sha256t_11way_hash( void *outx, void *outy, void *outz, const void *inpx,
const void *inpy, const void *inpz );
int scanhash_sha256t_11way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
//void sha256q_8way_hash( void *output, const void *input );
//int scanhash_sha256q_11way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256T_8WAY)
void sha256t_8way_hash( void *output, const void *input );
int scanhash_sha256t_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void sha256q_8way_hash( void *output, const void *input );
int scanhash_sha256q_8way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#elif defined (SHA256T_4WAY)
#if defined(SHA256T_4WAY)
void sha256t_4way_hash( void *output, const void *input );
int scanhash_sha256t_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#else
void sha256q_4way_hash( void *output, const void *input );
int scanhash_sha256q_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha256t_hash( void *output, const void *input );
int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha256q_hash( void *output, const void *input );
int scanhash_sha256q( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif

View File

@@ -5,8 +5,6 @@
#include <stdio.h>
#include <openssl/sha.h>
#if !defined(SHA256T_4WAY)
static __thread SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
void sha256t_midstate( const void* input )
@@ -72,8 +70,11 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
};
// we need bigendian data...
for ( int k = 0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
sha256t_midstate( endiandata );
@@ -89,7 +90,13 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
if ( ( !(hash64[7] & mask) ) && fulltest( hash64, ptarget ) )
{
*hashes_done = n - first_nonce + 1;
return true;
work_set_target_ratio( work, hash64 );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE, "Share %d submitted by thread %d.",
accepted_share_count + rejected_share_count + 1,
thr_id );
else
applog( LOG_WARNING, "Failed to submit share." );
}
} while ( n < max_nonce && !work_restart[thr_id].restart );
break;
@@ -100,4 +107,3 @@ int scanhash_sha256t( int thr_id, struct work *work, uint32_t max_nonce,
pdata[19] = n;
return 0;
}
#endif

View File

@@ -346,7 +346,7 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
memcpy( buf + ptr, data, clen );
data = (const unsigned char *)data + clen;
ptr += clen;
len -= clen >> 1;
len -= (clen >> 1);
if ( ptr == sizeof ctx->buf )
{
if ( ( ctx->count0 = ctx->count0 + 1024 ) == 0 )
@@ -365,16 +365,8 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
}
uint32_t vp = ptr>>5;
// Terminating byte then zero pad
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
// Zero pad full vectors up to count
for ( ; vp < 6; vp++ )
casti_m256i( buf, vp ) = m256_zero;
// Count = { 0, 16, 64, 80 }. Outsize = 16 u32 = 512 bits = 0x0200
// Count is misaligned to 16 bits and straddles a vector.
// Count is misaligned to 16 bits and straddles 2 vectors.
// Use u32 overlay to stage then u16 to load buf.
union
{
@@ -387,6 +379,18 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,
count.u32[2] = ctx->count2;
count.u32[3] = ctx->count3;
if ( vp == 0 ) // empty buf, xevan.
{
casti_m256i( buf, 0 ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
memset_zero_256( (__m256i*)buf + 1, 5 );
ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
}
else // half full buf, everyone else.
{
casti_m256i( buf, vp++ ) = _mm256_set_epi32( 0,0,0,0x80, 0,0,0,0x80 );
memset_zero_256( (__m256i*)buf + vp, 6 - vp );
}
casti_m256i( buf, 6 ) = _mm256_set_epi16( count.u16[0], 0,0,0,0,0,0,0,
count.u16[0], 0,0,0,0,0,0,0 );
casti_m256i( buf, 7 ) = _mm256_set_epi16(

View File

@@ -25,7 +25,8 @@
#include "algo/haval/haval-hash-4way.h"
#include "algo/sha/sha2-hash-4way.h"
typedef struct {
union _sonoa_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
@@ -43,8 +44,10 @@ typedef struct {
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} sonoa_4way_ctx_holder;
};
typedef union _sonoa_4way_context_overlay sonoa_4way_context_overlay;
/*
sonoa_4way_ctx_holder sonoa_4way_ctx __attribute__ ((aligned (64)));
void init_sonoa_4way_ctx()
@@ -67,6 +70,7 @@ void init_sonoa_4way_ctx()
sha512_4way_init( &sonoa_4way_ctx.sha512 );
haval256_5_4way_init( &sonoa_4way_ctx.haval );
};
*/
void sonoa_4way_hash( void *state, const void *input )
{
@@ -77,19 +81,23 @@ void sonoa_4way_hash( void *state, const void *input )
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
sonoa_4way_context_overlay ctx;
// sonoa_4way_ctx_holder ctx __attribute__ ((aligned (64)));
// memcpy( &ctx, &sonoa_4way_ctx, sizeof(sonoa_4way_ctx) );
// 1
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close( &ctx.blake, vhash );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
@@ -100,29 +108,36 @@ void sonoa_4way_hash( void *state, const void *input )
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, 64 );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, 64 );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, 64 );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, 512 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, 64 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, 64 );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, 64 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, 64 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, 512 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 );
@@ -130,6 +145,7 @@ void sonoa_4way_hash( void *state, const void *input )
mm256_deinterleave_2x128( hash0, hash1, vhashA, 512 );
mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, 512 );
init_echo( &ctx.echo, 512 );
@@ -215,10 +231,12 @@ void sonoa_4way_hash( void *state, const void *input )
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, 64 );
hamsi512_4way_close( &ctx.hamsi, vhash );
// 3
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
@@ -294,6 +312,7 @@ void sonoa_4way_hash( void *state, const void *input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, 64 );
sph_fugue512_close( &ctx.fugue, hash0 );
sph_fugue512_init( &ctx.fugue );
@@ -399,10 +418,11 @@ void sonoa_4way_hash( void *state, const void *input )
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, 64 );
shabal512_4way_close( &ctx.shabal, vhash );
mm256_reinterleave_4x64( vhashB, vhash, 512 );
mm256_reinterleave_4x32_4x64( vhashB, vhash, 512 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhashB, 64 );
@@ -438,7 +458,7 @@ void sonoa_4way_hash( void *state, const void *input )
bmw512_4way( &ctx.bmw, vhash, 64 );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_reinterleave_4x64_4x32( vhashB, vhash, 512 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhashB, 64 );
@@ -536,6 +556,7 @@ void sonoa_4way_hash( void *state, const void *input )
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
sph_whirlpool_init( &ctx.whirlpool );
@@ -663,6 +684,7 @@ void sonoa_4way_hash( void *state, const void *input )
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
@@ -800,11 +822,11 @@ void sonoa_4way_hash( void *state, const void *input )
sha512_4way( &ctx.sha512, vhash, 64 );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_reinterleave_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
haval256_5_4way_close( &ctx.haval, state );
}
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -819,10 +841,7 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
// uint32_t *noncep = vdata + 73; // 9*8 + 1
const uint32_t Htarg = ptarget[7];
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
uint64_t htmax[] = { 0, 0xF, 0xFF,
@@ -855,18 +874,23 @@ int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE,
"Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -3,7 +3,7 @@
bool register_sonoa_algo( algo_gate_t* gate )
{
#if defined (SONOA_4WAY)
init_sonoa_4way_ctx();
// init_sonoa_4way_ctx();
gate->scanhash = (void*)&scanhash_sonoa_4way;
gate->hash = (void*)&sonoa_4way_hash;
#else

View File

@@ -17,7 +17,7 @@ void sonoa_4way_hash( void *state, const void *input );
int scanhash_sonoa_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
void init_sonoa_4way_ctx();
//void init_sonoa_4way_ctx();
#endif

View File

@@ -14,7 +14,6 @@
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -222,7 +221,7 @@ void x17_4way_hash( void *state, const void *input )
sha512_4way_close( &ctx.sha512, vhash );
// 17 Haval parallel 32 bit
mm256_reinterleave_4x32( vhashB, vhash, 512 );
mm256_reinterleave_4x64_4x32( vhashB, vhash, 512 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashB, 64 );
@@ -242,8 +241,6 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
uint32_t *nonces = work->nonces;
int num_found = 0;
__m256i *noncev = (__m256i*)vdata + 9; // aligned
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
@@ -260,35 +257,40 @@ int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *edata = (uint64_t*)endiandata;
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{
uint32_t mask = masks[m];
uint32_t mask = masks[ m ];
do
{
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ),
_mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ),
*noncev );
x17_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
if ( ( ( hash7[ lane ] & mask ) == 0 ) )
if ( ( hash7[ lane ] & mask ) == 0 )
{
mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE,
"Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce - 4 ) && !work_restart[thr_id].restart );
break;
}
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -12,8 +12,9 @@
#include "algo/jh/jh-hash-4way.h"
#include "algo/keccak/keccak-hash-4way.h"
#include "algo/skein/skein-hash-4way.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/luffa/luffa-hash-2way.h"
#include "algo/cubehash/cube-hash-2way.h"
#include "algo/shavite/shavite-hash-2way.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/simd/simd-hash-2way.h"
#include "algo/echo/aes_ni/hash_api.h"
@@ -24,16 +25,17 @@
#include "algo/sha/sha2-hash-4way.h"
#include "algo/haval/haval-hash-4way.h"
typedef struct {
blake512_4way_context blake;
union _xevan_4way_context_overlay
{
blake512_4way_context blake;
bmw512_4way_context bmw;
hashState_groestl groestl;
skein512_4way_context skein;
jh512_4way_context jh;
keccak512_4way_context keccak;
luffa_2way_context luffa;
cubehashParam cube;
sph_shavite512_context shavite;
cube_2way_context cube;
shavite512_2way_context shavite;
simd_2way_context simd;
hashState_echo echo;
hamsi512_4way_context hamsi;
@@ -42,39 +44,8 @@ typedef struct {
sph_whirlpool_context whirlpool;
sha512_4way_context sha512;
haval256_5_4way_context haval;
} xevan_4way_ctx_holder;
xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64)));
static __thread blake512_4way_context xevan_blake_4way_mid
__attribute__ ((aligned (64)));
void init_xevan_4way_ctx()
{
blake512_4way_init(&xevan_4way_ctx.blake);
bmw512_4way_init( &xevan_4way_ctx.bmw );
init_groestl( &xevan_4way_ctx.groestl, 64 );
skein512_4way_init(&xevan_4way_ctx.skein);
jh512_4way_init(&xevan_4way_ctx.jh);
keccak512_4way_init(&xevan_4way_ctx.keccak);
luffa_2way_init( &xevan_4way_ctx.luffa, 512 );
cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 );
sph_shavite512_init( &xevan_4way_ctx.shavite );
simd_2way_init( &xevan_4way_ctx.simd, 512 );
init_echo( &xevan_4way_ctx.echo, 512 );
hamsi512_4way_init( &xevan_4way_ctx.hamsi );
sph_fugue512_init( &xevan_4way_ctx.fugue );
shabal512_4way_init( &xevan_4way_ctx.shabal );
sph_whirlpool_init( &xevan_4way_ctx.whirlpool );
sha512_4way_init( &xevan_4way_ctx.sha512 );
haval256_5_4way_init( &xevan_4way_ctx.haval );
};
void xevan_4way_blake512_midstate( const void* input )
{
memcpy( &xevan_blake_4way_mid, &xevan_4way_ctx.blake,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &xevan_blake_4way_mid, input, 64 );
}
typedef union _xevan_4way_context_overlay xevan_4way_context_overlay;
void xevan_4way_hash( void *output, const void *input )
{
@@ -83,293 +54,283 @@ void xevan_4way_hash( void *output, const void *input )
uint64_t hash2[16] __attribute__ ((aligned (64)));
uint64_t hash3[16] __attribute__ ((aligned (64)));
uint64_t vhash[16<<2] __attribute__ ((aligned (64)));
uint64_t vhash32[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashA[16<<2] __attribute__ ((aligned (64)));
uint64_t vhashB[16<<2] __attribute__ ((aligned (64)));
const int dataLen = 128;
const int midlen = 64; // bytes
const int tail = 80 - midlen; // 16
xevan_4way_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
xevan_4way_context_overlay ctx __attribute__ ((aligned (64)));
// parallel way
memcpy( &ctx.blake, &xevan_blake_4way_mid,
sizeof(xevan_blake_4way_mid) );
blake512_4way( &ctx.blake, input + (midlen<<2), tail );
// parallel 4 way
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, input, 80 );
blake512_4way_close(&ctx.blake, vhash);
memset( &vhash[8<<2], 0, 64<<2 );
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
// Serial
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
// Parallel 4way
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
mm256_deinterleave_1x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_deinterleave_1x128( hash2, hash3, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
// Parallel
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
// Parallel 4way 32 bit
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
// Serial
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
haval256_5_4way( &ctx.haval, vhash32, dataLen );
haval256_5_4way_close( &ctx.haval, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_reinterleave_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, vhashA );
mm256_reinterleave_4x32_4x64( vhash, vhashA, dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 );
memcpy( &ctx, &xevan_4way_ctx, sizeof(xevan_4way_ctx) );
blake512_4way_init( &ctx.blake );
blake512_4way( &ctx.blake, vhash, dataLen );
blake512_4way_close(&ctx.blake, vhash);
bmw512_4way_init( &ctx.bmw );
bmw512_4way( &ctx.bmw, vhash, dataLen );
bmw512_4way_close( &ctx.bmw, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2,
dataLen<<3 );
memcpy( &ctx.groestl, &xevan_4way_ctx.groestl, sizeof(hashState_groestl) );
init_groestl( &ctx.groestl, 64 );
update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3,
dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
skein512_4way_init( &ctx.skein );
skein512_4way( &ctx.skein, vhash, dataLen );
skein512_4way_close( &ctx.skein, vhash );
jh512_4way_init( &ctx.jh );
jh512_4way( &ctx.jh, vhash, dataLen );
jh512_4way_close( &ctx.jh, vhash );
keccak512_4way_init( &ctx.keccak );
keccak512_4way( &ctx.keccak, vhash, dataLen );
keccak512_4way_close( &ctx.keccak, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
mm256_reinterleave_4x64_2x128( vhashA, vhashB, vhash, dataLen<<3 );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen );
luffa_2way_init( &ctx.luffa, 512 );
luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, dataLen );
cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2,
dataLen );
memcpy( &ctx.cube, &xevan_4way_ctx.cube, sizeof(cubehashParam) );
cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3,
dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashA, vhashA, dataLen );
cube_2way_init( &ctx.cube, 512, 16, 32 );
cube_2way_update_close( &ctx.cube, vhashB, vhashB, dataLen );
sph_shavite512( &ctx.shavite, hash0, dataLen );
sph_shavite512_close( &ctx.shavite, hash0 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash1, dataLen );
sph_shavite512_close( &ctx.shavite, hash1 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash2, dataLen );
sph_shavite512_close( &ctx.shavite, hash2 );
memcpy( &ctx.shavite, &xevan_4way_ctx.shavite,
sizeof(sph_shavite512_context) );
sph_shavite512( &ctx.shavite, hash3, dataLen );
sph_shavite512_close( &ctx.shavite, hash3 );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashA, vhashA, dataLen );
shavite512_2way_init( &ctx.shavite );
shavite512_2way_update_close( &ctx.shavite, vhashB, vhashB, dataLen );
mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 );
mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 );
mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 );
simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 );
simd_2way_init( &ctx.simd, 512 );
simd_2way_update_close( &ctx.simd, vhashB, vhashB, dataLen<<3 );
mm256_deinterleave_1x128( hash0, hash1, vhashA, dataLen<<3 );
mm256_deinterleave_1x128( hash2, hash3, vhashB, dataLen<<3 );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash0,
(const BitSequence *) hash0, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash1,
(const BitSequence *) hash1, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash2,
(const BitSequence *) hash2, dataLen<<3 );
memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) );
init_echo( &ctx.echo, 512 );
update_final_echo( &ctx.echo, (BitSequence *)hash3,
(const BitSequence *) hash3, dataLen<<3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
hamsi512_4way_init( &ctx.hamsi );
hamsi512_4way( &ctx.hamsi, vhash, dataLen );
hamsi512_4way_close( &ctx.hamsi, vhash );
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash0, dataLen );
sph_fugue512_close( &ctx.fugue, hash0 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash1, dataLen );
sph_fugue512_close( &ctx.fugue, hash1 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash2, dataLen );
sph_fugue512_close( &ctx.fugue, hash2 );
memcpy( &ctx.fugue, &xevan_4way_ctx.fugue, sizeof(sph_fugue512_context) );
sph_fugue512_init( &ctx.fugue );
sph_fugue512( &ctx.fugue, hash3, dataLen );
sph_fugue512_close( &ctx.fugue, hash3 );
mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
shabal512_4way_init( &ctx.shabal );
shabal512_4way( &ctx.shabal, vhash, dataLen );
shabal512_4way_close( &ctx.shabal, vhash );
mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash0, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash0 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash1, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash2, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash2 );
memcpy( &ctx.whirlpool, &xevan_4way_ctx.whirlpool,
sizeof(sph_whirlpool_context) );
sph_whirlpool_init( &ctx.whirlpool );
sph_whirlpool( &ctx.whirlpool, hash3, dataLen );
sph_whirlpool_close( &ctx.whirlpool, hash3 );
mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 );
sha512_4way_init( &ctx.sha512 );
sha512_4way( &ctx.sha512, vhash, dataLen );
sha512_4way_close( &ctx.sha512, vhash );
mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 );
haval256_5_4way( &ctx.haval, vhash32, dataLen );
mm256_reinterleave_4x64_4x32( vhashA, vhash, dataLen<<3 );
haval256_5_4way_init( &ctx.haval );
haval256_5_4way( &ctx.haval, vhashA, dataLen );
haval256_5_4way_close( &ctx.haval, output );
}
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *hash7 = &(hash[7<<2]);
@@ -378,30 +339,26 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
__m256i *noncev = (__m256i*)vdata + 9; // aligned
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
int num_found = 0;
uint32_t *noncep = vdata + 73; // 9*8 + 1
if ( opt_benchmark )
ptarget[7] = 0x0cff;
for ( int k=0; k < 19; k++ )
be32enc( &endiandata[k], pdata[k] );
uint64_t *edata = (uint64_t*)endiandata;
casti_m256i( edata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
casti_m256i( edata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
xevan_4way_blake512_midstate( vdata );
do {
be32enc( noncep, n );
be32enc( noncep+2, n+1 );
be32enc( noncep+4, n+2 );
be32enc( noncep+6, n+3 );
*noncev = mm256_interleave_blend_32( mm256_bswap_32(
_mm256_set_epi32( n+3, 0,n+2, 0,n+1, 0, n, 0 ) ), *noncev );
xevan_4way_hash( hash, vdata );
for ( int lane = 0; lane < 4; lane++ )
@@ -411,15 +368,20 @@ int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( fulltest( lane_hash, ptarget ) )
{
pdata[19] = n + lane;
nonces[ num_found++ ] = n + lane;
work_set_target_ratio( work, lane_hash );
if ( submit_work( mythr, work ) )
applog( LOG_NOTICE,
"Share %d submitted by thread %d, lane %d.",
accepted_share_count + rejected_share_count + 1,
thr_id, lane );
else
applog( LOG_WARNING, "Failed to submit share." );
}
}
n += 4;
} while ( ( num_found == 0 ) && ( n < max_nonce )
&& !work_restart[thr_id].restart );
} while ( ( n < max_nonce-4 ) && !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
return 0;
}
#endif

View File

@@ -8,7 +8,7 @@ void xevan_set_target( struct work* work, double job_diff )
bool register_xevan_algo( algo_gate_t* gate )
{
#if defined (XEVAN_4WAY)
init_xevan_4way_ctx();
// init_xevan_4way_ctx();
gate->scanhash = (void*)&scanhash_xevan_4way;
gate->hash = (void*)&xevan_4way_hash;
#else

View File

@@ -15,16 +15,16 @@ bool register_xevan_algo( algo_gate_t* gate );
void xevan_4way_hash( void *state, const void *input );
int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_xevan_4way_ctx();
//void init_xevan_4way_ctx();
#endif
void xevan_hash( void *state, const void *input );
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
uint64_t *hashes_done, struct thr_info *mythr );
void init_xevan_ctx();

View File

@@ -230,12 +230,14 @@ void xevan_hash(void *output, const void *input)
memcpy(output, hash, 32);
}
int scanhash_xevan(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
int scanhash_xevan( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) hash[8];
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];

View File

@@ -290,7 +290,7 @@ SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
{
unsigned char pad[64];
unsigned char khash[32];
@@ -326,7 +326,7 @@ HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
@@ -335,7 +335,7 @@ HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx)
HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
{
unsigned char ihash[32];
@@ -361,7 +361,7 @@ void
PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX PShctx, hctx;
HMAC_SHA256_CTX_Y PShctx, hctx;
uint8_t _ALIGN(128) T[32];
uint8_t _ALIGN(128) U[32];
uint8_t ivec[4];
@@ -370,8 +370,8 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
int k;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update(&PShctx, salt, saltlen);
HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
@@ -379,18 +379,18 @@ PBKDF2_SHA256_Y(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
HMAC_SHA256_Update(&hctx, ivec, 4);
HMAC_SHA256_Final(U, &hctx);
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
HMAC_SHA256_Update_Y(&hctx, ivec, 4);
HMAC_SHA256_Final_Y(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init(&hctx, passwd, passwdlen);
HMAC_SHA256_Update(&hctx, U, 32);
HMAC_SHA256_Final(U, &hctx);
HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&hctx, U, 32);
HMAC_SHA256_Final_Y(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)

View File

@@ -49,14 +49,14 @@ typedef struct HMAC_SHA256Context {
typedef struct HMAC_SHA256Context {
SHA256_CTX ictx;
SHA256_CTX octx;
} HMAC_SHA256_CTX;
} HMAC_SHA256_CTX_Y;
void SHA256_Init_Y(SHA256_CTX_Y *);
void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
void HMAC_SHA256_Final(unsigned char [32], HMAC_SHA256_CTX *);
void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):

View File

@@ -1354,14 +1354,14 @@ yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local,
if ((t || flags) && buflen == sizeof(sha256)) {
/* Compute ClientKey */
{
HMAC_SHA256_CTX ctx;
HMAC_SHA256_Init(&ctx, buf, buflen);
HMAC_SHA256_CTX_Y ctx;
HMAC_SHA256_Init_Y(&ctx, buf, buflen);
if ( yescrypt_client_key )
HMAC_SHA256_Update( &ctx, (uint8_t*)yescrypt_client_key,
HMAC_SHA256_Update_Y( &ctx, (uint8_t*)yescrypt_client_key,
yescrypt_client_key_len );
else
HMAC_SHA256_Update( &ctx, salt, saltlen );
HMAC_SHA256_Final(sha256, &ctx);
HMAC_SHA256_Update_Y( &ctx, salt, saltlen );
HMAC_SHA256_Final_Y(sha256, &ctx);
}
/* Compute StoredKey */
{

View File

@@ -383,7 +383,7 @@ void yescrypthash(void *output, const void *input)
}
int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -393,6 +393,7 @@ int scanhash_yescrypt( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);

View File

@@ -1,646 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
/*
 * Encode len PAIRS of (uint32_t) words from src -- 2*len words in total --
 * into 8*len bytes at dst, storing every word big-endian via be32enc().
 *
 * The count is tested before the first store, so len == 0 writes nothing.
 * (A do/while that pre-decrements len would wrap a zero count around to
 * SIZE_MAX and run off the end of both buffers.)
 */
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
    /* Encode vector, two words at a time. */
    for (; len != 0; len--) {
        be32enc(&dst[0], src[0]);
        be32enc(&dst[4], src[1]);
        src += 2;
        dst += 8;
    }
}
/*
 * Decode 8*len big-endian bytes at src into len PAIRS of (uint32_t) words
 * at dst -- 2*len words in total -- reading every word via be32dec().
 *
 * The count is tested before the first load, so len == 0 touches nothing.
 * (A do/while that pre-decrements len would wrap a zero count around to
 * SIZE_MAX and run off the end of both buffers.)
 */
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
    /* Decode vector, two words at a time. */
    for (; len != 0; len--) {
        dst[0] = be32dec(&src[0]);
        dst[1] = be32dec(&src[4]);
        src += 8;
        dst += 2;
    }
}
/*
 * SHA256 round constants: one 32-bit word per round.  The round macros
 * index this table as Krnd[i + ii], i.e. with the absolute round number.
 */
static const uint32_t Krnd[64] = {
    /*  0 */ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    /*  4 */ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    /*  8 */ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    /* 12 */ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    /* 16 */ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    /* 20 */ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    /* 24 */ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    /* 28 */ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    /* 32 */ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    /* 36 */ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    /* 40 */ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    /* 44 */ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    /* 48 */ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    /* 52 */ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    /* 56 */ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    /* 60 */ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/*
 * Elementary functions used by SHA256.
 *
 * Every macro argument is parenthesized so that an expression argument
 * (for example SHR(a | b, 4)) is shifted or masked as a whole.  Arguments
 * are evaluated more than once, so they must be free of side effects, and
 * x must be a 32-bit unsigned value for ROTR to rotate correctly.
 */
#define Ch(x, y, z)     (((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z)    (((x) & ((y) | (z))) | ((y) & (z)))
#define SHR(x, n)       ((x) >> (n))
#define ROTR(x, n)      (((x) >> (n)) | ((x) << (32 - (n))))
#define S0(x)           (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)           (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x)           (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x)           (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))

/*
 * SHA256 round function.  Only d and h are written; the caller "rotates"
 * the roles of the eight variables between rounds instead of moving data.
 * Wrapped in do/while(0) so that it behaves as a single statement.
 */
#define RND(a, b, c, d, e, f, g, h, k) \
    do { \
        (h) += S1(e) + Ch(e, f, g) + (k); \
        (d) += (h); \
        (h) += S0(a) + Maj(a, b, c); \
    } while (0)

/*
 * Adjusted round function for rotating state: round i of the 16-round
 * group starting at ii, with the variable roles shifted by i positions.
 */
#define RNDr(S, W, i, ii) \
    RND((S)[(64 - (i)) % 8], (S)[(65 - (i)) % 8], \
        (S)[(66 - (i)) % 8], (S)[(67 - (i)) % 8], \
        (S)[(68 - (i)) % 8], (S)[(69 - (i)) % 8], \
        (S)[(70 - (i)) % 8], (S)[(71 - (i)) % 8], \
        (W)[(i) + (ii)] + Krnd[(i) + (ii)])

/*
 * Message schedule computation: derive word (i + ii + 16) from the four
 * earlier schedule words at offsets +14, +9, +1 and +0.
 */
#define MSCH(W, ii, i) \
    (W)[(i) + (ii) + 16] = s1((W)[(i) + (ii) + 14]) + (W)[(i) + (ii) + 9] + \
        s0((W)[(i) + (ii) + 1]) + (W)[(i) + (ii)]
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*
* state: the eight chaining words; updated in place.
* block: the 64 message bytes to absorb.
* W:     caller-provided scratch holding the 64-word message schedule.
* S:     caller-provided scratch holding the eight working variables.
*
* W and S still contain intermediate values on return.  Callers in this
* file pass &tmp32[0] and &tmp32[64] of a 72-word buffer for them.
*/
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
/* (8 pairs of big-endian words = W[0..15].) */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/*
* 3. Mix.
* Four groups of 16 rounds (i = 0, 16, 32, 48).  The rounds are written
* out with literal indices so that the (64 - i) % 8 subscripts inside
* RNDr are constant expressions.
*/
for (i = 0; i < 64; i += 16) {
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
/*
* After the last group there is nothing left to schedule: MSCH at
* i == 48 would write W[64..79], past the end of the 64-word W.
*/
if (i == 48)
break;
/*
* Extend the schedule by the 16 words the next group consumes.  Order
* matters: each MSCH reads W[i + ii + 14], which for ii >= 2 was
* produced by an earlier MSCH in this same run.
*/
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
/*
 * Padding source block: a single 0x80 marker byte followed by zeros.  The
 * remaining 63 elements are zero-initialized implicitly.
 */
static const uint8_t PAD[64] = { 0x80 };
/*
 * Finish the message held in ${ctx}: append the 0x80 marker and zero fill
 * from PAD, store the 64-bit big-endian bit count in the last 8 bytes of
 * the block, and compress.  tmp32 is scratch for SHA256_Transform.
 */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
    /* Number of bytes currently waiting in the block buffer. */
    const size_t used = (ctx->count >> 3) & 0x3f;

    if (used >= 56) {
        /*
         * No room left for the bit count: pad out and compress this
         * block, then continue with a block whose first 56 bytes are zero.
         */
        memcpy(&ctx->buf[used], PAD, 64 - used);
        SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
        memset(&ctx->buf[0], 0, 56);
    } else {
        /* Marker plus zeros up to offset 56 of the current block. */
        memcpy(&ctx->buf[used], PAD, 56 - used);
    }

    /* The bit count occupies bytes 56..63; then mix in the final block. */
    be64enc(&ctx->buf[56], ctx->count);
    SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
}
/* Initial values of the eight SHA256 state words, copied in by SHA256_Init. */
static const uint32_t initial_state[8] = {
    0x6A09E667, 0xBB67AE85,
    0x3C6EF372, 0xA54FF53A,
    0x510E527F, 0x9B05688C,
    0x1F83D9AB, 0x5BE0CD19
};
/**
 * SHA256_Init(ctx):
 * Reset the SHA256 context ${ctx} so that it is ready to hash a new message.
 */
void
SHA256_Init(SHA256_CTX * ctx)
{
    /* Load the initial state words... */
    memcpy(ctx->state, initial_state, sizeof(initial_state));

    /* ...and note that no message bits have been processed yet. */
    ctx->count = 0;
}
/**
 * _SHA256_Update(ctx, in, len, tmp32):
 * Input ${len} bytes from ${in} into the SHA256 context ${ctx}, using
 * ${tmp32} as scratch space for the compression function.  The caller is
 * responsible for wiping ${tmp32} afterwards.
 */
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
    uint32_t tmp32[static restrict 72])
{
    const uint8_t * p = in;
    uint32_t used;
    uint32_t room;

    /* Nothing to absorb. */
    if (len == 0)
        return;

    /* Bytes buffered by earlier calls, and the space left in that block. */
    used = (ctx->count >> 3) & 0x3f;
    room = 64 - used;

    /* The running length is kept in bits. */
    ctx->count += (uint64_t)(len) << 3;

    /* Input that does not complete the block is only buffered. */
    if (len < room) {
        memcpy(&ctx->buf[used], p, len);
        return;
    }

    /* Top up the partial block and compress it. */
    memcpy(&ctx->buf[used], p, room);
    SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
    p += room;
    len -= room;

    /* Compress whole blocks straight from the input. */
    for (; len >= 64; p += 64, len -= 64)
        SHA256_Transform(ctx->state, p, &tmp32[0], &tmp32[64]);

    /* Keep the tail (0..63 bytes) for the next call. */
    memcpy(ctx->buf, p, len);
}
/**
 * SHA256_Update(ctx, in, len):
 * Input ${len} bytes from ${in} into the SHA256 context ${ctx}.  Public
 * wrapper around _SHA256_Update() that supplies the scratch buffer and
 * wipes it before returning.
 */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
    uint32_t scratch[72];

    /* Do the work with local scratch space... */
    _SHA256_Update(ctx, in, len, scratch);

    /* ...and leave no intermediate values behind on the stack. */
    insecure_memzero(scratch, sizeof(scratch));
}
/**
 * _SHA256_Final(digest, ctx, tmp32):
 * Pad the message held in ${ctx}, then write the state to ${digest} as
 * big-endian bytes.  ${tmp32} is scratch space which the caller must wipe.
 */
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
    uint32_t tmp32[static restrict 72])
{
    /* Absorb the padding and bit count. */
    SHA256_Pad(ctx, tmp32);

    /* Serialize the state: 4 pairs of words. */
    be32enc_vect(digest, ctx->state, 4);
}
/**
 * SHA256_Final(digest, ctx):
 * Output the SHA256 hash of the data input to the context ${ctx} into the
 * buffer ${digest}.  The context is zeroed afterwards and must be passed to
 * SHA256_Init() again before it is reused.
 */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
    uint32_t scratch[72];

    /* Produce the digest with local scratch space. */
    _SHA256_Final(digest, ctx, scratch);

    /* Wipe the caller's context, then our own stack scratch. */
    insecure_memzero(ctx, sizeof(*ctx));
    insecure_memzero(scratch, sizeof(scratch));
}
/**
 * SHA256_Buf(in, len, digest):
 * One-shot helper: compute the SHA256 hash of ${len} bytes from ${in} and
 * write it to ${digest}.
 */
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
    SHA256_CTX sha;
    uint32_t scratch[72];

    /* Init / update / final, all sharing one scratch buffer. */
    SHA256_Init(&sha);
    _SHA256_Update(&sha, in, len, scratch);
    _SHA256_Final(digest, &sha, scratch);

    /* Clean the stack. */
    insecure_memzero(&sha, sizeof(sha));
    insecure_memzero(scratch, sizeof(scratch));
}
/**
 * _HMAC_SHA256_Init(ctx, K, Klen, tmp32, pad, khash):
 * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
 * ${K}.  ${tmp32}, ${pad} and ${khash} are caller-provided scratch buffers;
 * they hold key-derived material on return and must be wiped by the caller.
 */
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
    uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
    uint8_t khash[static restrict 32])
{
    const uint8_t * key = _K;
    size_t n;

    /* A key longer than one block is replaced by its SHA256 hash. */
    if (Klen > 64) {
        SHA256_Init(&ctx->ictx);
        _SHA256_Update(&ctx->ictx, key, Klen, tmp32);
        _SHA256_Final(khash, &ctx->ictx, tmp32);
        key = khash;
        Klen = 32;
    }

    /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
    for (n = 0; n < 64; n++)
        pad[n] = (uint8_t)((n < Klen ? key[n] : 0) ^ 0x36);
    SHA256_Init(&ctx->ictx);
    _SHA256_Update(&ctx->ictx, pad, 64, tmp32);

    /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
    for (n = 0; n < 64; n++)
        pad[n] = (uint8_t)((n < Klen ? key[n] : 0) ^ 0x5c);
    SHA256_Init(&ctx->octx);
    _SHA256_Update(&ctx->octx, pad, 64, tmp32);
}
/**
 * HMAC_SHA256_Init(ctx, K, Klen):
 * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
 * ${K}.  Public wrapper around _HMAC_SHA256_Init() that supplies the scratch
 * buffers and wipes them before returning.
 */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
    uint32_t scratch[72];
    uint8_t padblock[64];
    uint8_t keyhash[32];

    /* Do the work with local scratch space. */
    _HMAC_SHA256_Init(ctx, _K, Klen, scratch, padblock, keyhash);

    /* All three buffers may hold key-derived bytes; clean the stack. */
    insecure_memzero(scratch, sizeof(scratch));
    insecure_memzero(keyhash, sizeof(keyhash));
    insecure_memzero(padblock, sizeof(padblock));
}
/**
 * _HMAC_SHA256_Update(ctx, in, len, tmp32):
 * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}, using
 * ${tmp32} as scratch space which the caller must wipe.
 */
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
    uint32_t tmp32[static restrict 72])
{
    /* Message data only ever goes into the inner hash. */
    _SHA256_Update(&ctx->ictx, in, len, tmp32);
}
/**
 * HMAC_SHA256_Update(ctx, in, len):
 * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.  Public
 * wrapper around _HMAC_SHA256_Update() that supplies the scratch buffer and
 * wipes it before returning.
 */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
    uint32_t scratch[72];

    /* Do the work with local scratch space... */
    _HMAC_SHA256_Update(ctx, in, len, scratch);

    /* ...and clean the stack. */
    insecure_memzero(scratch, sizeof(scratch));
}
/**
 * _HMAC_SHA256_Final(digest, ctx, tmp32, ihash):
 * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
 * buffer ${digest}.  ${tmp32} and ${ihash} are caller-provided scratch;
 * ${ihash} holds the inner hash on return and must be wiped by the caller.
 */
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
    uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
    /* Close the inner hash to obtain the 32-byte inner digest. */
    _SHA256_Final(ihash, &ctx->ictx, tmp32);

    /* The outer hash absorbs that digest and yields the MAC. */
    _SHA256_Update(&ctx->octx, ihash, 32, tmp32);
    _SHA256_Final(digest, &ctx->octx, tmp32);
}
/**
 * HMAC_SHA256_Final(digest, ctx):
 * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
 * buffer ${digest}.  Public wrapper around _HMAC_SHA256_Final() that
 * supplies the scratch buffers and wipes them before returning.
 */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
    uint32_t scratch[72];
    uint8_t inner[32];

    /* Do the work with local scratch space. */
    _HMAC_SHA256_Final(digest, ctx, scratch, inner);

    /* Clean the stack. */
    insecure_memzero(scratch, sizeof(scratch));
    insecure_memzero(inner, sizeof(inner));
}
/**
 * HMAC_SHA256_Buf(K, Klen, in, len, digest):
 * One-shot helper: compute the HMAC-SHA256 of ${len} bytes from ${in} using
 * the key ${K} of length ${Klen}, and write the result to ${digest}.
 */
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
    uint8_t digest[32])
{
    HMAC_SHA256_CTX hmac;
    uint32_t scratch32[72];
    uint8_t scratch8[96];
    /* One 96-byte buffer is carved up: 64 bytes of pad + 32 of key hash. */
    uint8_t * const pad = &scratch8[0];
    uint8_t * const khash = &scratch8[64];

    _HMAC_SHA256_Init(&hmac, K, Klen, scratch32, pad, khash);
    _HMAC_SHA256_Update(&hmac, in, len, scratch32);
    /* The pad area is free again by now and doubles as the inner hash. */
    _HMAC_SHA256_Final(digest, &hmac, scratch32, pad);

    /* Clean the stack. */
    insecure_memzero(&hmac, sizeof(hmac));
    insecure_memzero(scratch32, sizeof(scratch32));
    insecure_memzero(scratch8, sizeof(scratch8));
}
/*
* Add padding and terminating bit-count, but don't invoke Transform yet.
*
* On success (return 0) ctx->buf holds the complete final 64-byte block:
* the buffered data, PAD bytes up to offset 56, then the 8-byte big-endian
* bit count.  The block has not been compressed; the caller does that.
* Returns -1 without modifying ctx when 56 or more bytes are already
* buffered, i.e. when the bit count would not fit in the current block.
*
* len is an 8-byte scratch buffer that receives the encoded bit count.
* Note that ctx->count is advanced by the padding bytes fed through
* _SHA256_Update(), so it no longer equals the message length afterwards.
*/
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
uint32_t tmp32[static restrict 72])
{
uint32_t r;
/* Bytes currently buffered in the partial block. */
r = (ctx->count >> 3) & 0x3f;
if (r >= 56)
return -1;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be64enc(len, ctx->count);
/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
_SHA256_Update(ctx, PAD, 56 - r, tmp32);
/*
* Add the terminating bit-count.  The last byte is stored directly and
* only the first 7 go through _SHA256_Update(): 56 + 7 = 63 buffered
* bytes is still short of a full block, so that call just copies into
* ctx->buf and never reaches SHA256_Transform().
*/
ctx->buf[63] = len[7];
_SHA256_Update(ctx, len, 7, tmp32);
return 0;
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*
* Two code paths produce the output:
* - a fast path for c == 1 with dkLen a multiple of 32 and a salt short
*   enough that salt, block counter and SHA256 padding share one block;
*   the padded inner and outer blocks are built once and every 32-byte
*   output block then costs two SHA256_Transform() calls;
* - the generic path, which handles every other case.
* All intermediate key material on the stack is wiped before returning.
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
/* tmp8 is HMAC scratch; state aliases it as a working copy of a SHA256 state. */
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
/* NOTE(review): where size_t is 32 bits wide the product below wraps. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
/*
* Fast path.  (saltlen & 63) <= 51 leaves room in the salt's last block
* for the 4-byte counter (<= 55 bytes buffered) and the padding;
* (dkLen & 31) == 0 is needed because whole 32-byte blocks are written
* straight into buf.
*/
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
/* oldcount: bit offset within the block at which the counter starts. */
oldcount = hctx.ictx.count & (0x3f << 3);
/* Four placeholder bytes ("\0\0\0" plus its terminator) for the counter. */
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
/* Where the counter bytes live inside the prepared inner block. */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
/* Account for the 32-byte inner hash that is patched in per block below. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
/* Inner hash: compress the prepared block on a copy of the state. */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf,
&tmp32[0], &tmp32[64]);
/* Its digest becomes the first 32 bytes of the prepared outer block. */
be32enc_vect(hctx.octx.buf, u.state, 4);
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf,
&tmp32[0], &tmp32[64]);
/* With c == 1 the outer digest is the output block itself. */
be32enc_vect(&buf[i * 32], u.state, 4);
}
/* Phctx, PShctx, U and T were never written on this path. */
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
/* Restart from the keyed state instead of re-keying every round. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
/* (The last block is truncated when dkLen is not a multiple of 32.) */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
/* Shared tail: state that both paths may have written. */
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}

View File

@@ -1,680 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
#include "avxdefs.h"
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif
/*
 * Encode len PAIRS of (uint32_t) words from src -- 2*len words in total --
 * into 8*len bytes at dst, storing every word big-endian via be32enc().
 *
 * The count is tested before the first store, so len == 0 writes nothing.
 * (A do/while that pre-decrements len would wrap a zero count around to
 * SIZE_MAX and run off the end of both buffers.)
 */
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
    /* Encode vector, two words at a time. */
    for (; len != 0; len--) {
        be32enc(&dst[0], src[0]);
        be32enc(&dst[4], src[1]);
        src += 2;
        dst += 8;
    }
}
/*
 * Decode 8*len big-endian bytes at src into len PAIRS of (uint32_t) words
 * at dst -- 2*len words in total -- reading every word via be32dec().
 *
 * The count is tested before the first load, so len == 0 touches nothing.
 * (A do/while that pre-decrements len would wrap a zero count around to
 * SIZE_MAX and run off the end of both buffers.)
 */
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
    /* Decode vector, two words at a time. */
    for (; len != 0; len--) {
        dst[0] = be32dec(&src[0]);
        dst[1] = be32dec(&src[4]);
        src += 8;
        dst += 2;
    }
}
/*
 * SHA256 round constants: one 32-bit word per round.  The round macros
 * index this table as Krnd[i + ii], i.e. with the absolute round number.
 */
static const uint32_t Krnd[64] = {
    /*  0 */ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    /*  4 */ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    /*  8 */ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    /* 12 */ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    /* 16 */ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    /* 20 */ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    /* 24 */ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    /* 28 */ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    /* 32 */ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    /* 36 */ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    /* 40 */ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    /* 44 */ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    /* 48 */ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    /* 52 */ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    /* 56 */ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    /* 60 */ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/*
 * Elementary functions used by SHA256.
 *
 * Every macro argument is parenthesized so that an expression argument
 * (for example SHR(a | b, 4)) is shifted or masked as a whole.  Arguments
 * are evaluated more than once, so they must be free of side effects, and
 * x must be a 32-bit unsigned value for ROTR to rotate correctly.
 */
#define Ch(x, y, z)     (((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z)    (((x) & ((y) | (z))) | ((y) & (z)))
#define SHR(x, n)       ((x) >> (n))
#define ROTR(x, n)      (((x) >> (n)) | ((x) << (32 - (n))))
#define S0(x)           (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)           (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x)           (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x)           (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))

/*
 * The first branch is a disabled (#if 0) experiment that builds RNDr on
 * the _mm_sha256rnds2_epu32 intrinsic; it is never compiled.
 * NOTE(review): the ABEF/CDGH variable names look swapped relative to the
 * S[] slots they are loaded from -- verify before ever enabling it.
 */
#if 0 //defined(__SHA__)
// ABEF = _mm_sha256rnds2_epu32( CDGH, ABEF, k )
//_mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)
// b = { ABEF } a = { CDGH }
//
//a = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8],
// S[(70 - i) % 8], S[(71 - i) % 8] );
//b = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8],
// S[(68 - i) % 8], S[(69 - i) % 8] );
//k = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] )
// _mm_sha256rnds2_epu32(a,b,k)
#define RNDr( S, W, i, ii ) do \
{ \
    uint32_t abef[4]; \
    __m128i ABEF = _mm_set_epi32( S[(66 - i) % 8], S[(67 - i) % 8], \
    S[(70 - i) % 8], S[(71 - i) % 8] ); \
    __m128i CDGH = _mm_set_epi32( S[(64 - i) % 8], S[(65 - i) % 8], \
    S[(68 - i) % 8], S[(69 - i) % 8] ); \
    __m128i K = _mm_set1_epi32( W[i + ii] + Krnd[i + ii] ); \
    casti_m128i( abef, 0 ) = _mm_sha256rnds2_epu32( CDGH, ABEF, K ); \
    S[(66 - i) % 8] = abef[3]; \
    S[(67 - i) % 8] = abef[2]; \
    S[(64 - i) % 8] = abef[1]; \
    S[(65 - i) % 8] = abef[0]; \
} while(0)
#else
/*
 * SHA256 round function.  Only d and h are written; the caller "rotates"
 * the roles of the eight variables between rounds instead of moving data.
 * Wrapped in do/while(0) so that it behaves as a single statement.
 */
#define RND(a, b, c, d, e, f, g, h, k) \
    do { \
        (h) += S1(e) + Ch(e, f, g) + (k); \
        (d) += (h); \
        (h) += S0(a) + Maj(a, b, c); \
    } while (0)

/*
 * Adjusted round function for rotating state: round i of the 16-round
 * group starting at ii, with the variable roles shifted by i positions.
 */
#define RNDr(S, W, i, ii) \
    RND((S)[(64 - (i)) % 8], (S)[(65 - (i)) % 8], \
        (S)[(66 - (i)) % 8], (S)[(67 - (i)) % 8], \
        (S)[(68 - (i)) % 8], (S)[(69 - (i)) % 8], \
        (S)[(70 - (i)) % 8], (S)[(71 - (i)) % 8], \
        (W)[(i) + (ii)] + Krnd[(i) + (ii)])
#endif

/*
 * Message schedule computation: derive word (i + ii + 16) from the four
 * earlier schedule words at offsets +14, +9, +1 and +0.
 */
#define MSCH(W, ii, i) \
    (W)[(i) + (ii) + 16] = s1((W)[(i) + (ii) + 14]) + (W)[(i) + (ii) + 9] + \
        s0((W)[(i) + (ii) + 1]) + (W)[(i) + (ii)]
/*
 * SHA256 block compression function.  The 256-bit state is transformed via
 * the 512-bit input block to produce a new state.
 *
 * state: eight-word chaining value; read on entry and updated in place.
 * block: 64 input bytes, decoded as sixteen big-endian 32-bit words.
 * W:     caller-supplied scratch for the 64-word message schedule.
 * S:     caller-supplied scratch for the eight working variables.
 *
 * W and S still hold intermediate values on return; the callers in this file
 * pass slices of a 72-word buffer and wipe that buffer afterwards.
 */
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
/* (8 pairs of words, i.e. W[0..15].) */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
/* Four passes of 16 rounds each; i is the index of the pass's first round. */
for (i = 0; i < 64; i += 16) {
/* Rounds i .. i+15; RNDr selects the rotated operands by index. */
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
/* After the last pass there is no further schedule to compute. */
if (i == 48)
break;
/* Extend the schedule: fill W[i+16 .. i+31] for the next pass. */
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
/*
 * Padding source: a single 0x80 byte followed by 63 zero bytes.  Only the
 * first element is spelled out; the remaining elements are zero-initialized.
 */
static const uint8_t PAD[64] = { 0x80 };
/*
 * Append the final padding and the 64-bit big-endian bit count to the
 * buffered data, running the compression function on each block completed
 * along the way.  tmp32 is scratch space handed to SHA256_Transform.
 */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
	/* Bytes currently buffered: bit count / 8, modulo the block size. */
	const size_t used = (ctx->count >> 3) & 0x3f;

	if (used >= 56) {
		/*
		 * No room left for the 8-byte length: pad out this block,
		 * compress it, and start a fresh all-zero block.
		 */
		memcpy(&ctx->buf[used], PAD, 64 - used);
		SHA256_Transform(ctx->state, ctx->buf, tmp32, tmp32 + 64);
		memset(&ctx->buf[0], 0, 56);
	} else {
		/* Pad up to byte 56 of the current block. */
		memcpy(&ctx->buf[used], PAD, 56 - used);
	}

	/* The last eight bytes carry the message length in bits. */
	be64enc(&ctx->buf[56], ctx->count);

	/* Compress the final block. */
	SHA256_Transform(ctx->state, ctx->buf, tmp32, tmp32 + 64);
}
/* Initial chaining value copied into a context by SHA256_Init(). */
static const uint32_t initial_state[8] = {
	0x6a09e667,
	0xbb67ae85,
	0x3c6ef372,
	0xa54ff53a,
	0x510e527f,
	0x9b05688c,
	0x1f83d9ab,
	0x5be0cd19
};
/**
 * SHA256_Init(ctx):
 * Initialize the SHA256 context ${ctx}: load the initial chaining value and
 * reset the count of bits processed.  The data buffer is not touched.
 */
void
SHA256_Init(SHA256_CTX * ctx)
{
	/* Start from the standard initial state ... */
	memcpy(ctx->state, initial_state, sizeof(initial_state));

	/* ... with no input consumed yet. */
	ctx->count = 0;
}
/**
 * SHA256_Update(ctx, in, len):
 * Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
 *
 * This is the internal worker: ${tmp32} is caller-owned scratch passed on to
 * SHA256_Transform and is not wiped here.
 */
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
    uint32_t tmp32[static restrict 72])
{
	const uint8_t * p = in;
	uint32_t used;
	uint32_t room;

	/* Nothing to hash. */
	if (len == 0)
		return;

	/* Bytes already buffered, and the space left in that block. */
	used = (ctx->count >> 3) & 0x3f;
	room = 64 - used;

	/* The running count is kept in bits. */
	ctx->count += (uint64_t)(len) << 3;

	/* Input does not complete a block: just buffer it. */
	if (len < room) {
		memcpy(&ctx->buf[used], p, len);
		return;
	}

	/* Top up the buffered block and compress it. */
	memcpy(&ctx->buf[used], p, room);
	SHA256_Transform(ctx->state, ctx->buf, tmp32, tmp32 + 64);
	p += room;
	len -= room;

	/* Compress whole blocks straight from the input. */
	for (; len >= 64; len -= 64, p += 64)
		SHA256_Transform(ctx->state, p, tmp32, tmp32 + 64);

	/* Keep the tail (possibly empty) for the next call. */
	memcpy(ctx->buf, p, len);
}
/*
 * Public entry point: runs the worker with a local scratch buffer and wipes
 * that buffer before returning.
 */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
	uint32_t scratch[72];

	_SHA256_Update(ctx, in, len, scratch);

	/* Do not leave message-derived words on the stack. */
	insecure_memzero(scratch, sizeof(scratch));
}
/**
 * SHA256_Final(digest, ctx):
 * Output the SHA256 hash of the data input to the context ${ctx} into the
 * buffer ${digest}.
 *
 * Internal worker: pads and compresses the remaining data, then serializes
 * the eight state words big-endian.  ${tmp32} is caller-owned scratch.
 */
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
    uint32_t tmp32[static restrict 72])
{
	/* Finish the message. */
	SHA256_Pad(ctx, tmp32);

	/* be32enc_vect counts in pairs of words: 8 words -> 4 pairs. */
	be32enc_vect(digest, ctx->state,
	    sizeof(ctx->state) / (2 * sizeof(ctx->state[0])));
}
/*
 * Public entry point: produces the digest, then wipes both the context and
 * the local scratch buffer.  The context must be re-initialized before reuse.
 */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
	uint32_t scratch[72];

	_SHA256_Final(digest, ctx, scratch);

	/* Erase the hashing state held by the caller. */
	insecure_memzero(ctx, sizeof(*ctx));

	/* Erase our own intermediate values. */
	insecure_memzero(scratch, sizeof(scratch));
}
/**
 * SHA256_Buf(in, len, digest):
 * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
 * One-shot helper: the context and scratch live on this function's stack and
 * are wiped before returning.
 */
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
	uint32_t scratch[72];
	SHA256_CTX hash;

	SHA256_Init(&hash);
	_SHA256_Update(&hash, in, len, scratch);
	_SHA256_Final(digest, &hash, scratch);

	/* Clean the stack. */
	insecure_memzero(&hash, sizeof(hash));
	insecure_memzero(scratch, sizeof(scratch));
}
/**
 * HMAC_SHA256_Init(ctx, K, Klen):
 * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
 * ${K}.
 *
 * Internal worker.  ${tmp32}, ${pad} and ${khash} are caller-owned scratch;
 * on return they hold key-derived data and should be wiped by the caller.
 */
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
    uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
    uint8_t khash[static restrict 32])
{
	const uint8_t * key = _K;
	size_t keylen = Klen;
	size_t n;

	/* Keys longer than one block are replaced by their SHA256 hash. */
	if (keylen > 64) {
		SHA256_Init(&ctx->ictx);
		_SHA256_Update(&ctx->ictx, key, keylen, tmp32);
		_SHA256_Final(khash, &ctx->ictx, tmp32);
		key = khash;
		keylen = 32;
	}

	/* Inner hash starts with (key, zero-extended to 64 bytes) xor 0x36. */
	SHA256_Init(&ctx->ictx);
	for (n = 0; n < 64; n++)
		pad[n] = (n < keylen) ? (uint8_t)(key[n] ^ 0x36) : 0x36;
	_SHA256_Update(&ctx->ictx, pad, 64, tmp32);

	/* Outer hash starts with the same key block xor 0x5c. */
	SHA256_Init(&ctx->octx);
	for (n = 0; n < 64; n++)
		pad[n] = (n < keylen) ? (uint8_t)(key[n] ^ 0x5c) : 0x5c;
	_SHA256_Update(&ctx->octx, pad, 64, tmp32);
}
/*
 * Public entry point: supplies stack scratch to the worker and wipes all of
 * it (hash scratch, key block and hashed key) before returning.
 */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
	uint32_t scratch[72];
	uint8_t keyblock[64];
	uint8_t hashedkey[32];

	_HMAC_SHA256_Init(ctx, _K, Klen, scratch, keyblock, hashedkey);

	/* Clean the stack. */
	insecure_memzero(scratch, sizeof(scratch));
	insecure_memzero(hashedkey, sizeof(hashedkey));
	insecure_memzero(keyblock, sizeof(keyblock));
}
/**
 * HMAC_SHA256_Update(ctx, in, len):
 * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
 *
 * Internal worker: message data only ever goes to the inner hash; the outer
 * hash is not touched until finalization.  ${tmp32} is caller-owned scratch.
 */
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
    uint32_t tmp32[static restrict 72])
{
	SHA256_CTX * const inner = &ctx->ictx;

	_SHA256_Update(inner, in, len, tmp32);
}
/*
 * Public entry point: runs the worker with a local scratch buffer and wipes
 * that buffer before returning.
 */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
	uint32_t scratch[72];

	_HMAC_SHA256_Update(ctx, in, len, scratch);

	/* Clean the stack. */
	insecure_memzero(scratch, sizeof(scratch));
}
/**
 * HMAC_SHA256_Final(digest, ctx):
 * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
 * buffer ${digest}.
 *
 * Internal worker.  ${ihash} receives the inner digest and ${tmp32} is hash
 * scratch; both are caller-owned and left for the caller to wipe.
 */
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
    uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
	SHA256_CTX * const inner = &ctx->ictx;
	SHA256_CTX * const outer = &ctx->octx;

	/* Close the inner hash over (ipad-key || message). */
	_SHA256_Final(ihash, inner, tmp32);

	/* Append the inner digest to the outer hash and close it. */
	_SHA256_Update(outer, ihash, 32, tmp32);
	_SHA256_Final(digest, outer, tmp32);
}
/*
 * Public entry point: supplies stack scratch to the worker and wipes it
 * (hash scratch and inner digest) before returning.
 */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
	uint32_t scratch[72];
	uint8_t inner_digest[32];

	_HMAC_SHA256_Final(digest, ctx, scratch, inner_digest);

	/* Clean the stack. */
	insecure_memzero(scratch, sizeof(scratch));
	insecure_memzero(inner_digest, sizeof(inner_digest));
}
/**
 * HMAC_SHA256_Buf(K, Klen, in, len, digest):
 * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
 * length ${Klen}, and write the result to ${digest}.
 * One-shot helper: all state lives on this function's stack and is wiped
 * before returning.
 */
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
    uint8_t digest[32])
{
	HMAC_SHA256_CTX mac;
	uint32_t scratch[72];
	uint8_t bytes[96];
	/* One 96-byte buffer serves as key block, hashed key and inner digest. */
	uint8_t * const keyblock = bytes;
	uint8_t * const hashedkey = bytes + 64;
	uint8_t * const inner_digest = bytes;

	_HMAC_SHA256_Init(&mac, K, Klen, scratch, keyblock, hashedkey);
	_HMAC_SHA256_Update(&mac, in, len, scratch);
	_HMAC_SHA256_Final(digest, &mac, scratch, inner_digest);

	/* Clean the stack. */
	insecure_memzero(&mac, sizeof(mac));
	insecure_memzero(scratch, sizeof(scratch));
	insecure_memzero(bytes, sizeof(bytes));
}
/*
 * Add padding and terminating bit-count, but don't invoke Transform yet.
 *
 * On success (return 0) ctx->buf holds the complete final block, ready to be
 * compressed by the caller.  Returns -1 without changing anything when 56 or
 * more bytes are already buffered, i.e. when the padding would not fit in the
 * current block.  ${len} is 8 bytes of caller scratch for the encoded length.
 */
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
    uint32_t tmp32[static restrict 72])
{
	const uint32_t used = (ctx->count >> 3) & 0x3f;

	if (used >= 56)
		return -1;

	/*
	 * Capture the bit count first: the updates below feed the padding
	 * through the normal path and therefore advance ctx->count.
	 */
	be64enc(len, ctx->count);

	/* Add 1--56 bytes so that the buffered length becomes 56. */
	_SHA256_Update(ctx, PAD, 56 - used, tmp32);

	/*
	 * Place the length in bytes 56..63.  Only seven bytes go through
	 * the update so the block is not completed (and not compressed);
	 * the eighth byte is stored directly.
	 */
	ctx->buf[63] = len[7];
	_SHA256_Update(ctx, len, 7, tmp32);

	return 0;
}
/**
 * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
 * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
 * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
 *
 * Two code paths produce the output:
 *  - a fast path for c == 1 with dkLen a multiple of 32 and a salt whose tail
 *    leaves room for the block index plus padding in one SHA256 block; it
 *    pre-builds the final inner and outer blocks once and then needs only
 *    two compression calls per 32 bytes of output;
 *  - a generic path that follows the textbook HMAC iteration.
 * All secret-bearing locals that a path touched are wiped before returning.
 */
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
/* tmp8 is byte scratch for the HMAC helpers; state overlays it as a working
 * chaining value in the fast path. */
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
/*
 * Fast path.  (saltlen & 63) <= 51 guarantees that the salt tail plus the
 * 4-byte block index stays below 56 buffered bytes, which is what
 * SHA256_Pad_Almost requires.
 */
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
/* oldcount: bit offset inside the current block where the index will sit. */
oldcount = hctx.ictx.count & (0x3f << 3);
/* Four zero bytes (three escapes plus the literal's terminator) reserve
 * space for INT(i); the real value is patched in per block below. */
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
/* Location of the reserved index bytes inside the buffered block. */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
/* Account for the 32-byte inner digest that will be written straight into
 * octx.buf, then lay down padding and length behind it. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
/* Inner hash: compress the prepared final block on a copy of the state. */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf,
&tmp32[0], &tmp32[64]);
/* The inner digest becomes the first 32 bytes of the outer final block. */
be32enc_vect(hctx.octx.buf, u.state, 4);
/* Outer hash: same procedure; the result is the output block. */
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf,
&tmp32[0], &tmp32[64]);
be32enc_vect(&buf[i * 32], u.state, 4);
}
/* Phctx, PShctx, U and T were never touched on this path. */
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
/* Restart from the keyed-but-empty HMAC state. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
/* The last block may be partial when dkLen is not a multiple of 32. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
/* Shared tail: wipe what both paths use. */
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}

View File

@@ -1,672 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* Copyright 2016-2018 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include "insecure_memzero.h"
#include "sysendian.h"
#include "sha256.h"
/*
 * Choose a spelling for the C99 "restrict" qualifier.  The rest of this file
 * writes "restrict" unconditionally; the cases below either leave the keyword
 * alone, map it to a compiler-specific spelling, or define it away.
 */
#ifdef __ICC
/* Miscompile with icc 14.0.0 (at least), so don't use restrict there */
#define restrict
#elif __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
/* Pre-C99 mode on a compiler that defines __GNUC__: use __restrict instead. */
#define restrict __restrict
#else
/* Anything else: drop the qualifier. */
#define restrict
#endif
/*
 * Encode a length len*2 vector of (uint32_t) into a length len*8 vector of
 * (uint8_t) in big-endian form.
 *
 * dst must have room for len*8 bytes and src must hold len*2 words.  A len
 * of zero is a no-op.
 */
static void
be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
{
	/*
	 * The loop below decrements before it tests, so entering it with
	 * len == 0 would wrap the counter and run off both buffers.
	 */
	if (len == 0)
		return;

	/* Encode vector, two words at a time. */
	do {
		be32enc(&dst[0], src[0]);
		be32enc(&dst[4], src[1]);
		src += 2;
		dst += 8;
	} while (--len);
}
/*
 * Decode a big-endian length len*8 vector of (uint8_t) into a length
 * len*2 vector of (uint32_t).
 *
 * src must hold len*8 bytes and dst must have room for len*2 words.  A len
 * of zero is a no-op.
 */
static void
be32dec_vect(uint32_t * dst, const uint8_t * src, size_t len)
{
	/*
	 * The loop below decrements before it tests, so entering it with
	 * len == 0 would wrap the counter and run off both buffers.
	 */
	if (len == 0)
		return;

	/* Decode vector, two words at a time. */
	do {
		dst[0] = be32dec(&src[0]);
		dst[1] = be32dec(&src[4]);
		src += 8;
		dst += 2;
	} while (--len);
}
#if 0
/*
 * SHA256 round constants, one per round.  The round macro RNDr reads entry
 * i + ii for the round at that position.
 */
static const uint32_t Krnd[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/*
 * Elementary functions used by SHA256.  Operands are expected to be 32-bit
 * unsigned values.  Every argument is parenthesized so that compound
 * expressions (e.g. "a ^ b") expand with the intended precedence.  Arguments
 * may be evaluated more than once and must therefore be free of side effects.
 */
#define Ch(x, y, z)	(((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z)	(((x) & ((y) | (z))) | ((y) & (z)))
#define SHR(x, n)	((x) >> (n))
#define ROTR(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))

/*
 * SHA256 round function.  Updates d and h in place; the other operands are
 * only read.  Wrapped in do/while(0) so the three statements behave as one
 * statement, e.g. under an unbraced "if".
 */
#define RND(a, b, c, d, e, f, g, h, k) do {		\
	(h) += S1(e) + Ch(e, f, g) + (k);		\
	(d) += (h);					\
	(h) += S0(a) + Maj(a, b, c);			\
} while (0)

/*
 * Adjusted round function for rotating state: instead of shuffling the eight
 * working variables after each round, round i picks its operands from S by
 * index arithmetic.  ii is the offset of the current group of 16 rounds.
 */
#define RNDr(S, W, i, ii)				\
	RND((S)[(64 - (i)) % 8], (S)[(65 - (i)) % 8],	\
	    (S)[(66 - (i)) % 8], (S)[(67 - (i)) % 8],	\
	    (S)[(68 - (i)) % 8], (S)[(69 - (i)) % 8],	\
	    (S)[(70 - (i)) % 8], (S)[(71 - (i)) % 8],	\
	    (W)[(i) + (ii)] + Krnd[(i) + (ii)])

/* Message schedule computation: derive W[i + ii + 16] from earlier words. */
#define MSCH(W, ii, i)					\
	((W)[(i) + (ii) + 16] = s1((W)[(i) + (ii) + 14]) + (W)[(i) + (ii) + 9] + s0((W)[(i) + (ii) + 1]) + (W)[(i) + (ii)])
/*
 * SHA256 block compression function.  The 256-bit state is transformed via
 * the 512-bit input block to produce a new state.
 *
 * state: eight-word chaining value; read on entry and updated in place.
 * block: 64 input bytes, decoded as sixteen big-endian 32-bit words.
 * W:     caller-supplied scratch for the 64-word message schedule.
 * S:     caller-supplied scratch for the eight working variables.
 *
 * W and S still hold intermediate values on return.
 */
static void
SHA256_Transform(uint32_t state[static restrict 8],
const uint8_t block[static restrict 64],
uint32_t W[static restrict 64], uint32_t S[static restrict 8])
{
int i;
/* 1. Prepare the first part of the message schedule W. */
/* (8 pairs of words, i.e. W[0..15].) */
be32dec_vect(W, block, 8);
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
/* Four passes of 16 rounds each; i is the index of the pass's first round. */
for (i = 0; i < 64; i += 16) {
/* Rounds i .. i+15; RNDr selects the rotated operands by index. */
RNDr(S, W, 0, i);
RNDr(S, W, 1, i);
RNDr(S, W, 2, i);
RNDr(S, W, 3, i);
RNDr(S, W, 4, i);
RNDr(S, W, 5, i);
RNDr(S, W, 6, i);
RNDr(S, W, 7, i);
RNDr(S, W, 8, i);
RNDr(S, W, 9, i);
RNDr(S, W, 10, i);
RNDr(S, W, 11, i);
RNDr(S, W, 12, i);
RNDr(S, W, 13, i);
RNDr(S, W, 14, i);
RNDr(S, W, 15, i);
/* After the last pass there is no further schedule to compute. */
if (i == 48)
break;
/* Extend the schedule: fill W[i+16 .. i+31] for the next pass. */
MSCH(W, 0, i);
MSCH(W, 1, i);
MSCH(W, 2, i);
MSCH(W, 3, i);
MSCH(W, 4, i);
MSCH(W, 5, i);
MSCH(W, 6, i);
MSCH(W, 7, i);
MSCH(W, 8, i);
MSCH(W, 9, i);
MSCH(W, 10, i);
MSCH(W, 11, i);
MSCH(W, 12, i);
MSCH(W, 13, i);
MSCH(W, 14, i);
MSCH(W, 15, i);
}
/* 4. Mix local working variables into global state. */
state[0] += S[0];
state[1] += S[1];
state[2] += S[2];
state[3] += S[3];
state[4] += S[4];
state[5] += S[5];
state[6] += S[6];
state[7] += S[7];
}
#endif
/*
 * Padding source: a single 0x80 byte followed by 63 zero bytes.  Only the
 * first element is spelled out; the remaining elements are zero-initialized.
 */
static const uint8_t PAD[64] = { 0x80 };
/*
 * Append the final padding and the 64-bit big-endian bit count to the
 * buffered data, running the compression function on each block completed
 * along the way.  tmp32 is scratch space handed to SHA256_Transform.
 *
 * NOTE(review): the only definition of SHA256_Transform visible in this file
 * sits inside an "#if 0" region -- confirm where the four-argument version
 * called here is meant to come from.
 */
static void
SHA256_Pad(SHA256_CTX * ctx, uint32_t tmp32[static restrict 72])
{
	/* Bytes currently buffered: bit count / 8, modulo the block size. */
	const size_t used = (ctx->count >> 3) & 0x3f;

	if (used >= 56) {
		/*
		 * No room left for the 8-byte length: pad out this block,
		 * compress it, and start a fresh all-zero block.
		 */
		memcpy(&ctx->buf[used], PAD, 64 - used);
		SHA256_Transform(ctx->state, ctx->buf, tmp32, tmp32 + 64);
		memset(&ctx->buf[0], 0, 56);
	} else {
		/* Pad up to byte 56 of the current block. */
		memcpy(&ctx->buf[used], PAD, 56 - used);
	}

	/* The last eight bytes carry the message length in bits. */
	be64enc(&ctx->buf[56], ctx->count);

	/* Compress the final block. */
	SHA256_Transform(ctx->state, ctx->buf, tmp32, tmp32 + 64);
}
#if 0
/* Initial chaining value copied into a context by SHA256_Init(). */
static const uint32_t initial_state[8] = {
	0x6a09e667,
	0xbb67ae85,
	0x3c6ef372,
	0xa54ff53a,
	0x510e527f,
	0x9b05688c,
	0x1f83d9ab,
	0x5be0cd19
};
/**
 * SHA256_Init(ctx):
 * Initialize the SHA256 context ${ctx}: load the initial chaining value and
 * reset the count of bits processed.  The data buffer is not touched.
 */
void
SHA256_Init(SHA256_CTX * ctx)
{
	/* Start from the standard initial state ... */
	memcpy(ctx->state, initial_state, sizeof(initial_state));

	/* ... with no input consumed yet. */
	ctx->count = 0;
}
/**
 * SHA256_Update(ctx, in, len):
 * Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
 *
 * This is the internal worker: ${tmp32} is caller-owned scratch passed on to
 * SHA256_Transform and is not wiped here.
 */
static void
_SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len,
    uint32_t tmp32[static restrict 72])
{
	const uint8_t * p = in;
	uint32_t used;
	uint32_t room;

	/* Nothing to hash. */
	if (len == 0)
		return;

	/* Bytes already buffered, and the space left in that block. */
	used = (ctx->count >> 3) & 0x3f;
	room = 64 - used;

	/* The running count is kept in bits. */
	ctx->count += (uint64_t)(len) << 3;

	/* Input does not complete a block: just buffer it. */
	if (len < room) {
		memcpy(&ctx->buf[used], p, len);
		return;
	}

	/* Top up the buffered block and compress it. */
	memcpy(&ctx->buf[used], p, room);
	SHA256_Transform(ctx->state, ctx->buf, tmp32, tmp32 + 64);
	p += room;
	len -= room;

	/* Compress whole blocks straight from the input. */
	for (; len >= 64; len -= 64, p += 64)
		SHA256_Transform(ctx->state, p, tmp32, tmp32 + 64);

	/* Keep the tail (possibly empty) for the next call. */
	memcpy(ctx->buf, p, len);
}
/*
 * Public entry point: runs the worker with a local scratch buffer and wipes
 * that buffer before returning.
 */
void
SHA256_Update(SHA256_CTX * ctx, const void * in, size_t len)
{
	uint32_t scratch[72];

	_SHA256_Update(ctx, in, len, scratch);

	/* Do not leave message-derived words on the stack. */
	insecure_memzero(scratch, sizeof(scratch));
}
/**
 * SHA256_Final(digest, ctx):
 * Output the SHA256 hash of the data input to the context ${ctx} into the
 * buffer ${digest}.
 *
 * Internal worker: pads and compresses the remaining data, then serializes
 * the eight state words big-endian.  ${tmp32} is caller-owned scratch.
 */
static void
_SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx,
    uint32_t tmp32[static restrict 72])
{
	/* Finish the message. */
	SHA256_Pad(ctx, tmp32);

	/* be32enc_vect counts in pairs of words: 8 words -> 4 pairs. */
	be32enc_vect(digest, ctx->state,
	    sizeof(ctx->state) / (2 * sizeof(ctx->state[0])));
}
/*
 * Public entry point: produces the digest, then wipes both the context and
 * the local scratch buffer.  The context must be re-initialized before reuse.
 */
void
SHA256_Final(uint8_t digest[32], SHA256_CTX * ctx)
{
	uint32_t scratch[72];

	_SHA256_Final(digest, ctx, scratch);

	/* Erase the hashing state held by the caller. */
	insecure_memzero(ctx, sizeof(*ctx));

	/* Erase our own intermediate values. */
	insecure_memzero(scratch, sizeof(scratch));
}
#endif
/**
 * SHA256_Buf(in, len, digest):
 * Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
 *
 * Goes through the public SHA256_Init/Update/Final entry points, so no local
 * scratch buffer is needed; only the context lives on this stack frame and
 * it is wiped before returning.
 */
void
SHA256_Buf(const void * in, size_t len, uint8_t digest[32])
{
	SHA256_CTX ctx;

	SHA256_Init(&ctx);
	SHA256_Update(&ctx, in, len);
	SHA256_Final(digest, &ctx);

	/* Clean the stack. */
	insecure_memzero(&ctx, sizeof(SHA256_CTX));
}
/**
 * HMAC_SHA256_Init(ctx, K, Klen):
 * Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
 * ${K}.
 *
 * Internal worker.  ${pad} and ${khash} are caller-owned scratch that hold
 * key-derived data on return.  ${tmp32} is not used by this variant, which
 * hashes through the public SHA256_* entry points; it remains in the
 * signature for the existing callers.
 */
static void
_HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen,
    uint32_t tmp32[static restrict 72], uint8_t pad[static restrict 64],
    uint8_t khash[static restrict 32])
{
	const uint8_t * key = _K;
	size_t keylen = Klen;
	size_t n;

	/* Keys longer than one block are replaced by their SHA256 hash. */
	if (keylen > 64) {
		SHA256_Init(&ctx->ictx);
		SHA256_Update(&ctx->ictx, key, keylen);
		SHA256_Final(khash, &ctx->ictx);
		key = khash;
		keylen = 32;
	}

	/* Inner hash starts with (key, zero-extended to 64 bytes) xor 0x36. */
	SHA256_Init(&ctx->ictx);
	for (n = 0; n < 64; n++)
		pad[n] = (n < keylen) ? (uint8_t)(key[n] ^ 0x36) : 0x36;
	SHA256_Update(&ctx->ictx, pad, 64);

	/* Outer hash starts with the same key block xor 0x5c. */
	SHA256_Init(&ctx->octx);
	for (n = 0; n < 64; n++)
		pad[n] = (n < keylen) ? (uint8_t)(key[n] ^ 0x5c) : 0x5c;
	SHA256_Update(&ctx->octx, pad, 64);
}
/*
 * Public entry point: supplies stack scratch to the worker and wipes all of
 * it (hash scratch, key block and hashed key) before returning.
 */
void
HMAC_SHA256_Init(HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen)
{
	uint32_t scratch[72];
	uint8_t keyblock[64];
	uint8_t hashedkey[32];

	_HMAC_SHA256_Init(ctx, _K, Klen, scratch, keyblock, hashedkey);

	/* Clean the stack. */
	insecure_memzero(scratch, sizeof(scratch));
	insecure_memzero(hashedkey, sizeof(hashedkey));
	insecure_memzero(keyblock, sizeof(keyblock));
}
/**
 * HMAC_SHA256_Update(ctx, in, len):
 * Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
 *
 * Internal worker: message data only ever goes to the inner hash.  ${tmp32}
 * is not used by this variant, which calls the public SHA256_Update(); it
 * remains in the signature for the existing callers.
 */
static void
_HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len,
    uint32_t tmp32[static restrict 72])
{
	SHA256_CTX * const inner = &ctx->ictx;

	SHA256_Update(inner, in, len);
}
/*
 * Public entry point: runs the worker with a local scratch buffer and wipes
 * that buffer before returning.
 */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void * in, size_t len)
{
	uint32_t scratch[72];

	_HMAC_SHA256_Update(ctx, in, len, scratch);

	/* Clean the stack. */
	insecure_memzero(scratch, sizeof(scratch));
}
/**
 * HMAC_SHA256_Final(digest, ctx):
 * Output the HMAC-SHA256 of the data input to the context ${ctx} into the
 * buffer ${digest}.
 *
 * Internal worker.  ${ihash} receives the inner digest and is left for the
 * caller to wipe.  Hashing goes through the public SHA256_Update/SHA256_Final
 * entry points, as in the other helpers of this file, because the
 * underscore-prefixed workers are compiled out here; ${tmp32} is therefore
 * unused and kept only for signature compatibility.
 *
 * NOTE(review): if SHA256_Final clears its context (the implementation that
 * is compiled out in this file does), ${ctx} is not reusable afterwards.
 */
static void
_HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx,
    uint32_t tmp32[static restrict 72], uint8_t ihash[static restrict 32])
{
	(void)tmp32;

	/* Finish the inner SHA256 operation. */
	SHA256_Final(ihash, &ctx->ictx);

	/* Feed the inner hash to the outer SHA256 operation. */
	SHA256_Update(&ctx->octx, ihash, 32);

	/* Finish the outer SHA256 operation. */
	SHA256_Final(digest, &ctx->octx);
}
/*
 * Public entry point: supplies stack scratch to the worker and wipes it
 * (hash scratch and inner digest) before returning.
 */
void
HMAC_SHA256_Final(uint8_t digest[32], HMAC_SHA256_CTX * ctx)
{
	uint32_t scratch[72];
	uint8_t inner_digest[32];

	_HMAC_SHA256_Final(digest, ctx, scratch, inner_digest);

	/* Clean the stack. */
	insecure_memzero(scratch, sizeof(scratch));
	insecure_memzero(inner_digest, sizeof(inner_digest));
}
/**
 * HMAC_SHA256_Buf(K, Klen, in, len, digest):
 * Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
 * length ${Klen}, and write the result to ${digest}.
 * One-shot helper: all state lives on this function's stack and is wiped
 * before returning.
 */
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
    uint8_t digest[32])
{
	HMAC_SHA256_CTX mac;
	uint32_t scratch[72];
	uint8_t bytes[96];
	/* One 96-byte buffer serves as key block, hashed key and inner digest. */
	uint8_t * const keyblock = bytes;
	uint8_t * const hashedkey = bytes + 64;
	uint8_t * const inner_digest = bytes;

	_HMAC_SHA256_Init(&mac, K, Klen, scratch, keyblock, hashedkey);
	_HMAC_SHA256_Update(&mac, in, len, scratch);
	_HMAC_SHA256_Final(digest, &mac, scratch, inner_digest);

	/* Clean the stack. */
	insecure_memzero(&mac, sizeof(mac));
	insecure_memzero(scratch, sizeof(scratch));
	insecure_memzero(bytes, sizeof(bytes));
}
/*
 * Add padding and terminating bit-count, but don't invoke Transform yet.
 *
 * On success (return 0) ctx->buf holds the complete final block, ready to be
 * compressed by the caller.  Returns -1 without changing anything when 56 or
 * more bytes are already buffered.  ${len} is 8 bytes of caller scratch for
 * the encoded length.  ${tmp32} is unused now that the data is fed through
 * the three-argument public SHA256_Update(); it stays in the signature for
 * the existing callers.
 */
static int
SHA256_Pad_Almost(SHA256_CTX * ctx, uint8_t len[static restrict 8],
    uint32_t tmp32[static restrict 72])
{
	uint32_t r;

	(void)tmp32;

	r = (ctx->count >> 3) & 0x3f;
	if (r >= 56)
		return -1;

	/*
	 * Convert length to a vector of bytes -- we do this now rather
	 * than later because the length will change after we pad.
	 */
	be64enc(len, ctx->count);

	/* Add 1--56 bytes so that the resulting length is 56 mod 64. */
	SHA256_Update(ctx, PAD, 56 - r);

	/*
	 * Add the terminating bit-count.  Only seven bytes go through the
	 * update so that the block is not completed; the eighth byte is
	 * stored directly.
	 */
	ctx->buf[63] = len[7];
	SHA256_Update(ctx, len, 7);

	return 0;
}
/**
 * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
 * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
 * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
 *
 * Two code paths produce the output: a fast path for c == 1 with dkLen a
 * multiple of 32 and a suitably short salt tail, which pre-builds the final
 * inner and outer blocks once, and a generic path that follows the textbook
 * HMAC iteration.
 *
 * NOTE(review): the fast path below calls SHA256_Transform with two
 * arguments while SHA256_Pad in this file calls it with four, and the only
 * definition visible in this file is inside an "#if 0" region.  Confirm
 * which SHA256_Transform is intended before relying on this code.
 */
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX Phctx, PShctx, hctx;
uint32_t tmp32[72];
/* tmp8 is byte scratch for the HMAC helpers; state overlays it as a working
 * chaining value in the fast path. */
union {
uint8_t tmp8[96];
uint32_t state[8];
} u;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Sanity-check. */
assert(dkLen <= 32 * (size_t)(UINT32_MAX));
/*
 * Fast path.  (saltlen & 63) <= 51 keeps the salt tail plus the 4-byte
 * block index below 56 buffered bytes, which SHA256_Pad_Almost requires.
 */
if (c == 1 && (dkLen & 31) == 0 && (saltlen & 63) <= 51) {
uint32_t oldcount;
uint8_t * ivecp;
/* Compute HMAC state after processing P and S. */
_HMAC_SHA256_Init(&hctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
_HMAC_SHA256_Update(&hctx, salt, saltlen, tmp32);
/* Prepare ictx padding. */
/* oldcount: bit offset inside the current block where the index will sit. */
oldcount = hctx.ictx.count & (0x3f << 3);
/* Four zero bytes (three escapes plus the literal's terminator) reserve
 * space for INT(i); the real value is patched in per block below. */
_HMAC_SHA256_Update(&hctx, "\0\0\0", 4, tmp32);
if ((hctx.ictx.count & (0x3f << 3)) < oldcount ||
SHA256_Pad_Almost(&hctx.ictx, u.tmp8, tmp32))
goto generic; /* Can't happen due to saltlen check */
/* Location of the reserved index bytes inside the buffered block. */
ivecp = hctx.ictx.buf + (oldcount >> 3);
/* Prepare octx padding. */
/* Account for the 32-byte inner digest that will be written straight into
 * octx.buf, then lay down padding and length behind it. */
hctx.octx.count += 32 << 3;
SHA256_Pad_Almost(&hctx.octx, u.tmp8, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivecp, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
/* Inner hash on a copy of the state, then the same for the outer hash. */
memcpy(u.state, hctx.ictx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.ictx.buf );
be32enc_vect(hctx.octx.buf, u.state, 4);
memcpy(u.state, hctx.octx.state, sizeof(u.state));
SHA256_Transform(u.state, hctx.octx.buf );
// SHA256_Transform(u.state, hctx.ictx.buf,
// &tmp32[0], &tmp32[64]);
// be32enc_vect(hctx.octx.buf, u.state, 4);
// memcpy(u.state, hctx.octx.state, sizeof(u.state));
// SHA256_Transform(u.state, hctx.octx.buf,
// &tmp32[0], &tmp32[64]);
be32enc_vect(&buf[i * 32], u.state, 4);
}
/* Phctx, PShctx, U and T were never touched on this path. */
goto cleanup;
}
generic:
/* Compute HMAC state after processing P. */
_HMAC_SHA256_Init(&Phctx, passwd, passwdlen,
tmp32, &u.tmp8[0], &u.tmp8[64]);
/* Compute HMAC state after processing P and S. */
memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&PShctx, salt, saltlen, tmp32);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, ivec, 4, tmp32);
_HMAC_SHA256_Final(T, &hctx, tmp32, u.tmp8);
if (c > 1) {
/* T_i = U_1 ... */
memcpy(U, T, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
/* Restart from the keyed-but-empty HMAC state. */
memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
_HMAC_SHA256_Update(&hctx, U, 32, tmp32);
_HMAC_SHA256_Final(U, &hctx, tmp32, u.tmp8);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
}
/* Copy as many bytes as necessary into buf. */
/* The last block may be partial when dkLen is not a multiple of 32. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean the stack. */
insecure_memzero(&Phctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(&PShctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(U, 32);
insecure_memzero(T, 32);
/* Shared tail: wipe what both paths use. */
cleanup:
insecure_memzero(&hctx, sizeof(HMAC_SHA256_CTX));
insecure_memzero(tmp32, 288);
insecure_memzero(&u, sizeof(u));
}

View File

@@ -1,129 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _SHA256_H_
#define _SHA256_H_
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Use #defines in order to avoid namespace collisions with anyone else's
* SHA256 code (e.g., the code in OpenSSL).
*/
#define SHA256_Init libcperciva_SHA256_Init
#define SHA256_Update libcperciva_SHA256_Update
#define SHA256_Final libcperciva_SHA256_Final
#define SHA256_Buf libcperciva_SHA256_Buf
#define SHA256_CTX libcperciva_SHA256_CTX
#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init
#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update
#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final
#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf
#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX
/*
 * Context structure for SHA256 operations.
 * The #define block earlier in this header renames this type to
 * libcperciva_SHA256_CTX at preprocessing time.
 */
typedef struct {
uint32_t state[8]; /* eight 32-bit words of hash state */
uint64_t count; /* running input-length counter; units (bits or bytes) are not visible in this header -- TODO confirm */
uint8_t buf[64]; /* 64-byte buffer; presumably holds a partially filled input block -- confirm against the implementation */
} SHA256_CTX;
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void SHA256_Init(SHA256_CTX *);
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
void SHA256_Update(SHA256_CTX *, const void *, size_t);
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void SHA256_Final(uint8_t[32], SHA256_CTX *);
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void SHA256_Buf(const void *, size_t, uint8_t[32]);
/*
 * Context structure for HMAC-SHA256 operations: a pair of SHA256 contexts.
 * The #define block earlier in this header renames this type to
 * libcperciva_HMAC_SHA256_CTX at preprocessing time.
 */
typedef struct {
SHA256_CTX ictx; /* presumably the inner hash context -- confirm against the implementation */
SHA256_CTX octx; /* presumably the outer hash context -- confirm against the implementation */
} HMAC_SHA256_CTX;
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *);
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#ifdef __cplusplus
}
#endif
#endif /* !_SHA256_H_ */

View File

@@ -1,134 +0,0 @@
/*-
* Copyright 2005-2016 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _SHA256_H_
#define _SHA256_H_
#include <stddef.h>
#include <stdint.h>
/*
 * OpenSSL's SHA-256 header.  The local SHA256_CTX definition and the
 * SHA256_Init/Update/Final prototypes further down are compiled out with
 * #if 0, so those names have to come from here.
 */
#include <openssl/sha.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Use #defines in order to avoid namespace collisions with anyone else's
* SHA256 code (e.g., the code in OpenSSL).
*/
/*
#define SHA256_Init libcperciva_SHA256_Init
#define SHA256_Update libcperciva_SHA256_Update
#define SHA256_Final libcperciva_SHA256_Final
#define SHA256_CTX libcperciva_SHA256_CTX
*/
#define SHA256_Buf libcperciva_SHA256_Buf
#define HMAC_SHA256_Init libcperciva_HMAC_SHA256_Init
#define HMAC_SHA256_Update libcperciva_HMAC_SHA256_Update
#define HMAC_SHA256_Final libcperciva_HMAC_SHA256_Final
#define HMAC_SHA256_Buf libcperciva_HMAC_SHA256_Buf
#define HMAC_SHA256_CTX libcperciva_HMAC_SHA256_CTX
#if 0
/* Context structure for SHA256 operations. */
typedef struct {
uint32_t state[8];
uint64_t count;
uint8_t buf[64];
} SHA256_CTX;
/**
* SHA256_Init(ctx):
* Initialize the SHA256 context ${ctx}.
*/
void SHA256_Init(SHA256_CTX *);
/**
* SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the SHA256 context ${ctx}.
*/
void SHA256_Update(SHA256_CTX *, const void *, size_t);
/**
* SHA256_Final(digest, ctx):
* Output the SHA256 hash of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void SHA256_Final(uint8_t[32], SHA256_CTX *);
#endif
/**
* SHA256_Buf(in, len, digest):
* Compute the SHA256 hash of ${len} bytes from ${in} and write it to ${digest}.
*/
void SHA256_Buf(const void *, size_t, uint8_t[32]);
/*
 * Context structure for HMAC-SHA256 operations: a pair of SHA256 contexts.
 * SHA256_CTX is not defined by this header (the local definition above is
 * compiled out with #if 0); it is presumably supplied by the OpenSSL header
 * included at the top -- confirm.
 */
typedef struct {
SHA256_CTX ictx; /* context fed by HMAC_SHA256_Update (presumably the inner hash) -- confirm */
SHA256_CTX octx; /* second context (presumably the outer hash) -- confirm */
} HMAC_SHA256_CTX;
/**
* HMAC_SHA256_Init(ctx, K, Klen):
* Initialize the HMAC-SHA256 context ${ctx} with ${Klen} bytes of key from
* ${K}.
*/
void HMAC_SHA256_Init(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Update(ctx, in, len):
* Input ${len} bytes from ${in} into the HMAC-SHA256 context ${ctx}.
*/
void HMAC_SHA256_Update(HMAC_SHA256_CTX *, const void *, size_t);
/**
* HMAC_SHA256_Final(digest, ctx):
* Output the HMAC-SHA256 of the data input to the context ${ctx} into the
* buffer ${digest}.
*/
void HMAC_SHA256_Final(uint8_t[32], HMAC_SHA256_CTX *);
/**
* HMAC_SHA256_Buf(K, Klen, in, len, digest):
* Compute the HMAC-SHA256 of ${len} bytes from ${in} using the key ${K} of
* length ${Klen}, and write the result to ${digest}.
*/
void HMAC_SHA256_Buf(const void *, size_t, const void *, size_t, uint8_t[32]);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#ifdef __cplusplus
}
#endif
#endif /* !_SHA256_H_ */

218
algo/yespower/sha256_p.c Normal file
View File

@@ -0,0 +1,218 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "sysendian.h"
#include "sha256_p.h"
#include "compat.h"
/* Elementary functions used by SHA256 */
/*
 * Elementary functions used by SHA256.
 *
 * Every macro argument is parenthesized so that compound expressions such as
 * ROTR(a ^ b, n) expand with the intended precedence.  Arguments may be
 * evaluated more than once, so they must be free of side effects.  ROTR
 * assumes a 32-bit unsigned operand and a rotate count in 1..31.
 */
#define Ch(x, y, z)	(((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z)	(((x) & ((y) | (z))) | ((y) & (z)))
#define SHR(x, n)	((x) >> (n))
#define ROTR(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/*
 * SHA256 round function.  Uses the temporaries t0 and t1, which must be
 * declared in the calling scope.  Wrapped in do/while(0) so that it behaves
 * as a single statement (safe under an unbraced if/else); invoke it with a
 * trailing semicolon.
 */
#define RND(a, b, c, d, e, f, g, h, k)			\
	do {						\
		t0 = (h) + S1(e) + Ch(e, f, g) + (k);	\
		t1 = S0(a) + Maj(a, b, c);		\
		(d) += t0;				\
		(h) = t0 + t1;				\
	} while (0)
/*
 * Adjusted round function for rotating state: instead of shuffling the
 * eight state words after every round, the indices into S are rotated by i.
 */
#define RNDr(S, W, i, k)				\
	RND((S)[(64 - (i)) % 8], (S)[(65 - (i)) % 8],	\
	    (S)[(66 - (i)) % 8], (S)[(67 - (i)) % 8],	\
	    (S)[(68 - (i)) % 8], (S)[(69 - (i)) % 8],	\
	    (S)[(70 - (i)) % 8], (S)[(71 - (i)) % 8],	\
	    (W)[i] + (k))
/*
static unsigned char PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
*/
/**
 * SHA256_Buf(in, len, digest):
 * One-shot SHA256: hash the ${len} bytes at ${in} and store the 32-byte
 * result in ${digest}.
 */
void
SHA256_Buf( const void * in, size_t len, uint8_t digest[32] )
{
	SHA256_CTX sha;

	SHA256_Init( &sha );
	SHA256_Update( &sha, in, len );
	SHA256_Final( digest, &sha );
}
/**
 * HMAC_SHA256_Buf(K, Klen, in, len, digest):
 * One-shot HMAC-SHA256: authenticate the ${len} bytes at ${in} under the
 * ${Klen}-byte key ${K} and store the 32-byte tag in ${digest}.
 */
void
HMAC_SHA256_Buf(const void * K, size_t Klen, const void * in, size_t len,
    uint8_t digest[32])
{
	HMAC_SHA256_CTX mac;

	HMAC_SHA256_Init( &mac, K, Klen );
	HMAC_SHA256_Update( &mac, in, len );
	HMAC_SHA256_Final( digest, &mac );
}
/*
 * Key an HMAC-SHA256 context.
 *
 * On return ctx->ictx has absorbed (key xor 0x36-block) and ctx->octx has
 * absorbed (key xor 0x5c-block).  Keys longer than 64 bytes are first
 * replaced by their 32-byte SHA256 digest.
 */
void
HMAC_SHA256_Init( HMAC_SHA256_CTX * ctx, const void * _K, size_t Klen )
{
	unsigned char ipad[64];
	unsigned char opad[64];
	unsigned char keydigest[32];
	const unsigned char *key = _K;
	size_t n;

	/* Over-long key: hash it down, borrowing ictx as scratch. */
	if ( Klen > 64 ) {
		SHA256_Init( &ctx->ictx );
		SHA256_Update( &ctx->ictx, key, Klen );
		SHA256_Final( keydigest, &ctx->ictx );
		key = keydigest;
		Klen = 32;
	}

	/* Build both padded key blocks in a single pass over the key. */
	memset( ipad, 0x36, sizeof(ipad) );
	memset( opad, 0x5c, sizeof(opad) );
	for ( n = 0; n < Klen; n++ ) {
		ipad[n] ^= key[n];
		opad[n] ^= key[n];
	}

	/* Inner hash starts with the 0x36 block, outer with the 0x5c block. */
	SHA256_Init( &ctx->ictx );
	SHA256_Update( &ctx->ictx, ipad, sizeof(ipad) );
	SHA256_Init( &ctx->octx );
	SHA256_Update( &ctx->octx, opad, sizeof(opad) );
}
/*
 * Absorb ${len} more message bytes from ${in}.  Message data only ever goes
 * into the inner SHA256 context; the outer one is untouched until _Final.
 */
void
HMAC_SHA256_Update(HMAC_SHA256_CTX * ctx, const void *in, size_t len)
{
	SHA256_CTX *inner = &ctx->ictx;

	SHA256_Update( inner, in, len );
}
/*
 * Complete an HMAC-SHA256 computation and write the 32-byte tag to ${digest}.
 * The inner digest is produced first and then run through the outer context.
 */
void
HMAC_SHA256_Final(unsigned char digest[32], HMAC_SHA256_CTX * ctx )
{
	unsigned char inner_digest[32];

	/* inner_digest = SHA256 over everything fed to the inner context */
	SHA256_Final( inner_digest, &ctx->ictx );

	/* tag = outer context continued with inner_digest */
	SHA256_Update( &ctx->octx, inner_digest, 32 );
	SHA256_Final( digest, &ctx->octx );
}
/**
 * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
 * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
 * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
 *
 * The HMAC key schedule for passwd is computed exactly once; every PRF
 * invocation starts from a copy of that keyed context instead of re-keying
 * from the password.
 */
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
	HMAC_SHA256_CTX Phctx, PShctx, hctx;
	uint8_t _ALIGN(128) T[32];
	uint8_t _ALIGN(128) U[32];
	uint8_t ivec[4];
	size_t i, clen;
	uint64_t j;
	int k;

	/* Compute HMAC state after processing P. */
	HMAC_SHA256_Init(&Phctx, passwd, passwdlen);

	/* Compute HMAC state after processing P and S. */
	memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX));
	HMAC_SHA256_Update(&PShctx, salt, saltlen);

	/* Iterate through the 32-byte output blocks. */
	for (i = 0; i * 32 < dkLen; i++) {
		/* Generate INT(i + 1). */
		be32enc(ivec, (uint32_t)(i + 1));

		/* Compute U_1 = PRF(P, S || INT(i + 1)). */
		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX));
		HMAC_SHA256_Update(&hctx, ivec, 4);
		HMAC_SHA256_Final(U, &hctx);

		/* T_i = U_1 ... */
		memcpy(T, U, 32);

		for (j = 2; j <= c; j++) {
			/* Compute U_j = PRF(P, U_{j-1}) from the keyed copy. */
			memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX));
			HMAC_SHA256_Update(&hctx, U, 32);
			HMAC_SHA256_Final(U, &hctx);

			/* ... xor U_j ... */
			for (k = 0; k < 32; k++)
				T[k] ^= U[k];
		}

		/* Copy as many bytes as necessary into buf. */
		clen = dkLen - i * 32;
		if (clen > 32)
			clen = 32;
		memcpy(&buf[i * 32], T, clen);
	}
}

View File

@@ -1,496 +0,0 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "sysendian.h"
#include "sha256_p.h"
#include "compat.h"
/*
 * Serialize len/4 32-bit words from src into dst as big-endian bytes.
 * len is a byte count and is assumed to be a multiple of 4.
 */
static void
be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
{
	const uint32_t *word = src;
	const uint32_t *const stop = src + len / 4;

	for (; word != stop; word++, dst += 4)
		be32enc(dst, *word);
}
/*
 * Deserialize len big-endian bytes from src into len/4 32-bit words at dst.
 * len is a byte count and is assumed to be a multiple of 4.
 */
static void
be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
{
	uint32_t *out = dst;
	uint32_t *const stop = dst + len / 4;

	for (; out != stop; out++, src += 4)
		*out = be32dec(src);
}
/* Elementary functions used by SHA256 */
/*
 * Every macro argument is parenthesized so that compound expressions such as
 * ROTR(a ^ b, n) expand with the intended precedence.  Arguments may be
 * evaluated more than once, so they must be free of side effects.  ROTR
 * assumes a 32-bit unsigned operand and a rotate count in 1..31.
 */
#define Ch(x, y, z)	(((x) & ((y) ^ (z))) ^ (z))
#define Maj(x, y, z)	(((x) & ((y) | (z))) | ((y) & (z)))
#define SHR(x, n)	((x) >> (n))
#define ROTR(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/*
 * SHA256 round function.  Uses the temporaries t0 and t1, which must be
 * declared in the calling scope.  Wrapped in do/while(0) so that it behaves
 * as a single statement (safe under an unbraced if/else); invoke it with a
 * trailing semicolon.
 */
#define RND(a, b, c, d, e, f, g, h, k)			\
	do {						\
		t0 = (h) + S1(e) + Ch(e, f, g) + (k);	\
		t1 = S0(a) + Maj(a, b, c);		\
		(d) += t0;				\
		(h) = t0 + t1;				\
	} while (0)
/*
 * Adjusted round function for rotating state: instead of shuffling the
 * eight state words after every round, the indices into S are rotated by i.
 */
#define RNDr(S, W, i, k)				\
	RND((S)[(64 - (i)) % 8], (S)[(65 - (i)) % 8],	\
	    (S)[(66 - (i)) % 8], (S)[(67 - (i)) % 8],	\
	    (S)[(68 - (i)) % 8], (S)[(69 - (i)) % 8],	\
	    (S)[(70 - (i)) % 8], (S)[(71 - (i)) % 8],	\
	    (W)[i] + (k))
/*
 * SHA256 block compression function. The 256-bit state is transformed via
 * the 512-bit input block to produce a new state.
 *
 * state: eight 32-bit words, updated in place.
 * block: 64 input bytes, decoded as sixteen big-endian 32-bit words.
 *
 * The 64 rounds are fully unrolled through the RNDr macro, which reads and
 * writes the local temporaries t0 and t1 declared below.
 */
static void
SHA256_Transform_p(uint32_t * state, const unsigned char block[64])
{
/* _ALIGN is not defined in this file; presumably an alignment attribute macro from compat.h -- confirm. */
uint32_t _ALIGN(128) W[64], S[8];
uint32_t t0, t1;
int i;
/* 1. Prepare message schedule W: 16 words from the block, then extend to 64. */
be32dec_vect(W, block, 64);
for (i = 16; i < 64; i++)
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
/* 2. Initialize working variables with a copy of the current state (8 words = 32 bytes). */
memcpy(S, state, 32);
/* 3. Mix: one RNDr per round, each with its own round constant. */
RNDr(S, W, 0, 0x428a2f98);
RNDr(S, W, 1, 0x71374491);
RNDr(S, W, 2, 0xb5c0fbcf);
RNDr(S, W, 3, 0xe9b5dba5);
RNDr(S, W, 4, 0x3956c25b);
RNDr(S, W, 5, 0x59f111f1);
RNDr(S, W, 6, 0x923f82a4);
RNDr(S, W, 7, 0xab1c5ed5);
RNDr(S, W, 8, 0xd807aa98);
RNDr(S, W, 9, 0x12835b01);
RNDr(S, W, 10, 0x243185be);
RNDr(S, W, 11, 0x550c7dc3);
RNDr(S, W, 12, 0x72be5d74);
RNDr(S, W, 13, 0x80deb1fe);
RNDr(S, W, 14, 0x9bdc06a7);
RNDr(S, W, 15, 0xc19bf174);
RNDr(S, W, 16, 0xe49b69c1);
RNDr(S, W, 17, 0xefbe4786);
RNDr(S, W, 18, 0x0fc19dc6);
RNDr(S, W, 19, 0x240ca1cc);
RNDr(S, W, 20, 0x2de92c6f);
RNDr(S, W, 21, 0x4a7484aa);
RNDr(S, W, 22, 0x5cb0a9dc);
RNDr(S, W, 23, 0x76f988da);
RNDr(S, W, 24, 0x983e5152);
RNDr(S, W, 25, 0xa831c66d);
RNDr(S, W, 26, 0xb00327c8);
RNDr(S, W, 27, 0xbf597fc7);
RNDr(S, W, 28, 0xc6e00bf3);
RNDr(S, W, 29, 0xd5a79147);
RNDr(S, W, 30, 0x06ca6351);
RNDr(S, W, 31, 0x14292967);
RNDr(S, W, 32, 0x27b70a85);
RNDr(S, W, 33, 0x2e1b2138);
RNDr(S, W, 34, 0x4d2c6dfc);
RNDr(S, W, 35, 0x53380d13);
RNDr(S, W, 36, 0x650a7354);
RNDr(S, W, 37, 0x766a0abb);
RNDr(S, W, 38, 0x81c2c92e);
RNDr(S, W, 39, 0x92722c85);
RNDr(S, W, 40, 0xa2bfe8a1);
RNDr(S, W, 41, 0xa81a664b);
RNDr(S, W, 42, 0xc24b8b70);
RNDr(S, W, 43, 0xc76c51a3);
RNDr(S, W, 44, 0xd192e819);
RNDr(S, W, 45, 0xd6990624);
RNDr(S, W, 46, 0xf40e3585);
RNDr(S, W, 47, 0x106aa070);
RNDr(S, W, 48, 0x19a4c116);
RNDr(S, W, 49, 0x1e376c08);
RNDr(S, W, 50, 0x2748774c);
RNDr(S, W, 51, 0x34b0bcb5);
RNDr(S, W, 52, 0x391c0cb3);
RNDr(S, W, 53, 0x4ed8aa4a);
RNDr(S, W, 54, 0x5b9cca4f);
RNDr(S, W, 55, 0x682e6ff3);
RNDr(S, W, 56, 0x748f82ee);
RNDr(S, W, 57, 0x78a5636f);
RNDr(S, W, 58, 0x84c87814);
RNDr(S, W, 59, 0x8cc70208);
RNDr(S, W, 60, 0x90befffa);
RNDr(S, W, 61, 0xa4506ceb);
RNDr(S, W, 62, 0xbef9a3f7);
RNDr(S, W, 63, 0xc67178f2);
/* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++)
state[i] += S[i];
/* Stack scrubbing is compiled out below, so W, S, t0 and t1 are left as-is on return. */
#if 0
/* Clean the stack. */
memset(W, 0, 256);
memset(S, 0, 32);
t0 = t1 = 0;
#endif
}
/*
 * SHA256 padding source: a single 0x80 marker byte followed by 63 zero
 * bytes (the remaining elements are zero-initialized).  The table is only
 * ever read, so it is const-qualified.
 */
static const unsigned char PAD[64] = { 0x80 };
/*
 * Append the SHA256 padding followed by the 8-byte big-endian bit count.
 * Per the original note, the only caller is SHA256_Final_p.
 */
static void
SHA256_Pad_p(SHA256_CTX_p * ctx)
{
	unsigned char bitcount[8];
	uint32_t used, padlen;

	/*
	 * Snapshot the length before padding: feeding the padding through
	 * SHA256_Update_p advances ctx->count.
	 */
	be32enc_vect(bitcount, ctx->count, 8);

	/* Pad with 1..64 bytes so that the buffered length becomes 56 mod 64. */
	used = (ctx->count[1] >> 3) & 0x3f;
	if (used < 56)
		padlen = 56 - used;
	else
		padlen = 120 - used;
	SHA256_Update_p(ctx, PAD, (size_t)padlen);

	/* The saved bit count fills the last 8 bytes of the final block. */
	SHA256_Update_p(ctx, bitcount, 8);
}
/*
 * Begin a SHA-256 operation: reset the bit counter and load the initial
 * hash state into ${ctx}.
 */
void
SHA256_Init_p(SHA256_CTX_p * ctx)
{
	/* Initial hash state words. */
	static const uint32_t initial_state[8] = {
		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
	};
	size_t w;

	/* No bits processed yet. */
	ctx->count[0] = 0;
	ctx->count[1] = 0;

	for (w = 0; w < 8; w++)
		ctx->state[w] = initial_state[w];
}
/*
 * Absorb ${len} bytes from ${in} into the hash.  Complete 64-byte blocks are
 * compressed immediately; any remainder is kept in ctx->buf for later.
 */
void
SHA256_Update_p(SHA256_CTX_p * ctx, const void *in, size_t len)
{
	const unsigned char *p = in;
	/* Input length in bits, split into low and high 32-bit halves. */
	uint32_t lo_bits = ((uint32_t)len) << 3;
	uint32_t hi_bits = (uint32_t)(len >> 29);
	/* Bytes already buffered from earlier calls, and the space left. */
	uint32_t buffered = (ctx->count[1] >> 3) & 0x3f;
	uint32_t room = 64 - buffered;

	/* Advance the 64-bit bit counter, carrying from the low word. */
	ctx->count[1] += lo_bits;
	if (ctx->count[1] < lo_bits)
		ctx->count[0]++;
	ctx->count[0] += hi_bits;

	/* Not enough data to complete a block: just stash it. */
	if (len < room) {
		memcpy(&ctx->buf[buffered], p, len);
		return;
	}

	/* Top up the buffered block and compress it. */
	memcpy(&ctx->buf[buffered], p, room);
	SHA256_Transform_p(ctx->state, ctx->buf);
	p += room;
	len -= room;

	/* Compress whole blocks straight from the caller's data. */
	for (; len >= 64; p += 64, len -= 64)
		SHA256_Transform_p(ctx->state, p);

	/* Keep the tail for the next call. */
	memcpy(ctx->buf, p, len);
}
/*
 * Finish a SHA-256 operation: pad the message, emit the state as 32
 * big-endian bytes into ${digest}, then wipe the whole context.
 */
void
SHA256_Final_p(unsigned char digest[32], SHA256_CTX_p * ctx)
{
	SHA256_Pad_p(ctx);

	/* Eight state words -> 32 output bytes. */
	be32enc_vect(digest, ctx->state, 32);

	/* The context is zeroed and must be re-initialized before reuse. */
	memset(ctx, 0, sizeof *ctx);
}
/**
 * SHA256_Buf_p(in, len, digest):
 * One-shot SHA256 of ${len} bytes from ${in}, written to ${digest}.
 * When __SHA__ is defined the SHA256_Init/Update/Final functions are used;
 * otherwise the _p implementation in this file is used.
 */
void
SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32])
{
#if defined(__SHA__)
	SHA256_CTX sha;

	SHA256_Init(&sha);
	SHA256_Update(&sha, in, len);
	SHA256_Final(digest, &sha);
#else
	SHA256_CTX_p sha;

	SHA256_Init_p(&sha);
	SHA256_Update_p(&sha, in, len);
	SHA256_Final_p(digest, &sha);
#endif
}
/**
 * HMAC_SHA256_Buf_p(K, Klen, in, len, digest):
 * One-shot HMAC-SHA256: authenticate the ${len} bytes at ${in} under the
 * ${Klen}-byte key ${K} and store the 32-byte tag in ${digest}.
 */
void
HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in, size_t len,
    uint8_t digest[32])
{
	HMAC_SHA256_CTX_p mac;

	HMAC_SHA256_Init_p(&mac, K, Klen);
	HMAC_SHA256_Update_p(&mac, in, len);
	HMAC_SHA256_Final_p(digest, &mac);
}
/*
 * Key an HMAC-SHA256 context.
 *
 * On return ctx->ictx has absorbed (key xor 0x36-block) and ctx->octx has
 * absorbed (key xor 0x5c-block).  Keys longer than 64 bytes are first
 * replaced by their 32-byte SHA256 digest.  The SHA256 backend is chosen at
 * compile time by __SHA__.
 */
void
HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p * ctx, const void * _K, size_t Klen)
{
	unsigned char ipad[64];
	unsigned char opad[64];
	unsigned char keydigest[32];
	const unsigned char *key = _K;
	size_t n;

	/* Over-long key: hash it down, borrowing ictx as scratch. */
	if (Klen > 64) {
#if defined(__SHA__)
		SHA256_Init(&ctx->ictx);
		SHA256_Update(&ctx->ictx, key, Klen);
		SHA256_Final(keydigest, &ctx->ictx);
#else
		SHA256_Init_p(&ctx->ictx);
		SHA256_Update_p(&ctx->ictx, key, Klen);
		SHA256_Final_p(keydigest, &ctx->ictx);
#endif
		key = keydigest;
		Klen = 32;
	}

	/* Build both padded key blocks in a single pass over the key. */
	memset(ipad, 0x36, sizeof(ipad));
	memset(opad, 0x5c, sizeof(opad));
	for (n = 0; n < Klen; n++) {
		ipad[n] ^= key[n];
		opad[n] ^= key[n];
	}

	/* Inner hash starts with the 0x36 block, outer with the 0x5c block. */
#if defined(__SHA__)
	SHA256_Init(&ctx->ictx);
	SHA256_Update(&ctx->ictx, ipad, sizeof(ipad));
	SHA256_Init(&ctx->octx);
	SHA256_Update(&ctx->octx, opad, sizeof(opad));
#else
	SHA256_Init_p(&ctx->ictx);
	SHA256_Update_p(&ctx->ictx, ipad, sizeof(ipad));
	SHA256_Init_p(&ctx->octx);
	SHA256_Update_p(&ctx->octx, opad, sizeof(opad));
#endif
}
/*
 * Absorb ${len} more message bytes from ${in}.  Message data only ever goes
 * into the inner SHA256 context; the backend is chosen by __SHA__.
 */
void
HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p * ctx, const void *in, size_t len)
{
#if defined(__SHA__)
	SHA256_Update(&ctx->ictx, in, len);
#else
	SHA256_Update_p(&ctx->ictx, in, len);
#endif
}
/*
 * Complete an HMAC-SHA256 computation and write the 32-byte tag to ${digest}.
 * Steps, in either backend: finish the inner hash, feed its 32-byte digest
 * to the outer context, then finish the outer hash.
 */
void
HMAC_SHA256_Final_p(unsigned char digest[32], HMAC_SHA256_CTX_p * ctx)
{
	unsigned char inner_digest[32];

#if defined(__SHA__)
	SHA256_Final(inner_digest, &ctx->ictx);
	SHA256_Update(&ctx->octx, inner_digest, 32);
	SHA256_Final(digest, &ctx->octx);
#else
	SHA256_Final_p(inner_digest, &ctx->ictx);
	SHA256_Update_p(&ctx->octx, inner_digest, 32);
	SHA256_Final_p(digest, &ctx->octx);
#endif
}
/**
 * PBKDF2_SHA256_p(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
 * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
 * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
 *
 * The HMAC key schedule for passwd is computed exactly once; every PRF
 * invocation starts from a copy of that keyed context instead of re-keying
 * from the password.
 */
void
PBKDF2_SHA256_p(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
	HMAC_SHA256_CTX_p Phctx, PShctx, hctx;
	uint8_t _ALIGN(128) T[32];
	uint8_t _ALIGN(128) U[32];
	uint8_t ivec[4];
	size_t i, clen;
	uint64_t j;
	int k;

	/* Compute HMAC state after processing P. */
	HMAC_SHA256_Init_p(&Phctx, passwd, passwdlen);

	/* Compute HMAC state after processing P and S. */
	memcpy(&PShctx, &Phctx, sizeof(HMAC_SHA256_CTX_p));
	HMAC_SHA256_Update_p(&PShctx, salt, saltlen);

	/* Iterate through the 32-byte output blocks. */
	for (i = 0; i * 32 < dkLen; i++) {
		/* Generate INT(i + 1). */
		be32enc(ivec, (uint32_t)(i + 1));

		/* Compute U_1 = PRF(P, S || INT(i + 1)). */
		memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_p));
		HMAC_SHA256_Update_p(&hctx, ivec, 4);
		HMAC_SHA256_Final_p(U, &hctx);

		/* T_i = U_1 ... */
		memcpy(T, U, 32);

		for (j = 2; j <= c; j++) {
			/* Compute U_j = PRF(P, U_{j-1}) from the keyed copy. */
			memcpy(&hctx, &Phctx, sizeof(HMAC_SHA256_CTX_p));
			HMAC_SHA256_Update_p(&hctx, U, 32);
			HMAC_SHA256_Final_p(U, &hctx);

			/* ... xor U_j ... */
			for (k = 0; k < 32; k++)
				T[k] ^= U[k];
		}

		/* Copy as many bytes as necessary into buf. */
		clen = dkLen - i * 32;
		if (clen > 32)
			clen = 32;
		memcpy(&buf[i * 32], T, clen);
	}
}

View File

@@ -33,45 +33,24 @@
#include <stdint.h>
#include <openssl/sha.h>
typedef struct SHA256Context {
uint32_t state[8];
uint32_t count[2];
unsigned char buf[64];
} SHA256_CTX_p;
/*
typedef struct HMAC_SHA256Context {
SHA256_CTX_Y ictx;
SHA256_CTX_Y octx;
} HMAC_SHA256_CTX_Y;
*/
typedef struct HMAC_SHA256Context {
#if defined(__SHA__)
SHA256_CTX ictx;
SHA256_CTX octx;
#else
SHA256_CTX_p ictx;
SHA256_CTX_p octx;
#endif
} HMAC_SHA256_CTX_p;
} HMAC_SHA256_CTX;
void SHA256_Init_p(SHA256_CTX_p *);
void SHA256_Update_p(SHA256_CTX_p *, const void *, size_t);
void SHA256_Final_p(unsigned char [32], SHA256_CTX_p *);
void SHA256_Buf_p(const void * in, size_t len, uint8_t digest[32]);
void HMAC_SHA256_Init_p(HMAC_SHA256_CTX_p *, const void *, size_t);
void HMAC_SHA256_Update_p(HMAC_SHA256_CTX_p *, const void *, size_t);
void HMAC_SHA256_Final_p(unsigned char [32], HMAC_SHA256_CTX_p *);
void HMAC_SHA256_Buf_p(const void * K, size_t Klen, const void * in,
size_t len, uint8_t digest[32]);
void SHA256_Buf( const void * in, size_t len, uint8_t digest[32] );
void HMAC_SHA256_Init( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Update( HMAC_SHA256_CTX *, const void *, size_t );
void HMAC_SHA256_Final( unsigned char [32], HMAC_SHA256_CTX * );
void HMAC_SHA256_Buf( const void * K, size_t Klen, const void * in,
size_t len, uint8_t digest[32] );
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256_p(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
void PBKDF2_SHA256( const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#endif /* !_SHA256_H_ */

View File

@@ -62,6 +62,7 @@
#warning "Note: building generic code for non-x86. That's OK."
#endif
*/
/*
* The SSE4 code version has fewer instructions than the generic SSE2 version,
* but all of the instructions are SIMD, thereby wasting the scalar execution
@@ -96,7 +97,7 @@
#include <string.h>
#include "insecure_memzero.h"
#include "sha256.h"
#include "sha256_p.h"
#include "sysendian.h"
#include "yespower.h"

File diff suppressed because it is too large Load Diff

View File

@@ -51,7 +51,7 @@
#include <stdlib.h>
#include <string.h>
#include "sha256.h"
#include "sha256_p.h"
#include "sysendian.h"
#include "yespower.h"
@@ -534,11 +534,12 @@ int yespower(yespower_local_t *local,
if (pers) {
HMAC_SHA256_Buf(dst, sizeof(*dst), pers, perslen,
return true;
(uint8_t *)sha256);
SHA256_Buf(sha256, sizeof(sha256), (uint8_t *)dst);
}
} else {
HMAC_SHA256_Buf((uint8_t *)B + B_size - 64, 64,
HMAC_SHA256_Buf_P((uint8_t *)B + B_size - 64, 64,
sha256, sizeof(sha256), (uint8_t *)dst);
}

View File

@@ -38,7 +38,7 @@ void yespower_hash( const char *input, char *output, uint32_t len )
}
int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(64) vhash[8];
uint32_t _ALIGN(64) endiandata[20];
@@ -48,6 +48,7 @@ int scanhash_yespower( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
/* int */ thr_id = mythr->id; // thr_id arg is deprecated
for (int k = 0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);

125
avxdefs.h
View File

@@ -99,20 +99,73 @@
#include <memory.h>
#include <stdbool.h>
// 64 bit seems completely useless
// First some integer stuff that mirrors the SIMD utilities
// Scalar rotates and byte swaps.
// Both shift counts are masked to the operand width, so a count of 0 (or a
// multiple of the width) returns x unchanged instead of shifting by the full
// width, which is undefined behaviour in C. Counts strictly between 0 and the
// width give the same result as an unmasked rotate.
// x and c are each evaluated twice: do not pass expressions with side effects.
#define ror_64( x, c ) \
   ( (uint64_t)( ( (uint64_t)(x) >> ( (c) & 63 ) ) \
               | ( (uint64_t)(x) << ( ( 64 - (c) ) & 63 ) ) ) )
#define rol_64( x, c ) \
   ( (uint64_t)( ( (uint64_t)(x) << ( (c) & 63 ) ) \
               | ( (uint64_t)(x) >> ( ( 64 - (c) ) & 63 ) ) ) )
#define ror_32( x, c ) \
   ( (uint32_t)( ( (uint32_t)(x) >> ( (c) & 31 ) ) \
               | ( (uint32_t)(x) << ( ( 32 - (c) ) & 31 ) ) ) )
#define rol_32( x, c ) \
   ( (uint32_t)( ( (uint32_t)(x) << ( (c) & 31 ) ) \
               | ( (uint32_t)(x) >> ( ( 32 - (c) ) & 31 ) ) ) )
#define ror_16( x, c ) \
   ( (uint16_t)( ( (uint16_t)(x) >> ( (c) & 15 ) ) \
               | ( (uint16_t)(x) << ( ( 16 - (c) ) & 15 ) ) ) )
#define rol_16( x, c ) \
   ( (uint16_t)( ( (uint16_t)(x) << ( (c) & 15 ) ) \
               | ( (uint16_t)(x) >> ( ( 16 - (c) ) & 15 ) ) ) )
#define ror_8( x, c ) \
   ( (uint8_t) ( ( (uint8_t) (x) >> ( (c) & 7 ) ) \
               | ( (uint8_t) (x) << ( (  8 - (c) ) & 7 ) ) ) )
#define rol_8( x, c ) \
   ( (uint8_t) ( ( (uint8_t) (x) << ( (c) & 7 ) ) \
               | ( (uint8_t) (x) >> ( (  8 - (c) ) & 7 ) ) ) )
// Endian byte swaps via the compiler builtins.
#define bswap_64( x ) __builtin_bswap64(x)
#define bswap_32( x ) __builtin_bswap32(x)
// 128 bit integer
//
// Int128 uses two 64 bit GPRs to hold the data. The main benefits are
// for 128 bit arithmetic. Vectors are preferred when 128 bit arith
// is not required. int128 also works better with other integer sizes.
// Vectors benefit from wider registers.
//
// Use typecasting for conversion to/from 128 bit vector:
// __m128i v128 = (__m128i)my_int128;
// __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
// my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );
// __SIZEOF_INT128__ is predefined by the compiler exactly when __int128 is
// available on the target. A compiler version test alone would also enable
// this section on targets that have no __int128 type.
#if defined(__SIZEOF_INT128__)
// Test this before using int128.
#define GCC_INT128 1
// Familiar looking type names
typedef __int128 int128_t;
typedef unsigned __int128 uint128_t;
// All 128 bits set. No real need or use.
#define i128_neg1 ( (uint128_t)(-1LL) )
// Extract selected 64 bit half of 128 bit integer.
// A generic macro with a selector argument can't be encoded as a statement
// function and would require a branch.
#define i128_hi64( x ) ( (uint64_t)( (uint128_t)(x) >> 64 ) )
#define i128_lo64( x ) ( (uint64_t)( (uint128_t)(x) << 64 >> 64 ) )
// Not much need for this but it fills a gap.
// Shift counts are masked to 0..127 so that a count of 0 returns x unchanged
// instead of shifting by the full width, which is undefined behaviour.
#define ror_128( x, c ) \
   ( ( (uint128_t)(x) >> ( (c) & 127 ) ) \
   | ( (uint128_t)(x) << ( ( 128 - (c) ) & 127 ) ) )
#define rol_128( x, c ) \
   ( ( (uint128_t)(x) << ( (c) & 127 ) ) \
   | ( (uint128_t)(x) >> ( ( 128 - (c) ) & 127 ) ) )
#endif // INT128
////////////////////////////////////////////////////////////////
//
// 64 bit MMX vectors.
// 64 bit MMX vectors.
//
// There are rumours MMX will be removed. Although casting with int64
// works, there is likely some overhead to move the data to an MMX register
// and back.
// Byte swap and rotation may be more efficient using an MMX shuffle
// except that it won't compile due to a "target specific option mismatch"
// with "inlining failed in call to always inline". MMX was designed for
// 32 bit CPUs and might not work on 64 bit CPUs where the CPU has full
// support for 64 bit operations without vectoring.
// Byte swap and rotation may be more efficient using an MMX shuffle.
//
// Universal 64 bit overlay
union _m64v
@@ -165,6 +218,7 @@ typedef union _m64_v16 m64_v16;
#define casti_m64(p,i) (((__m64*)(p))[(i)])
// Cast all arguments as they're likely uint64_t
// Bitwise not: ~(a)
@@ -173,6 +227,7 @@ typedef union _m64_v16 m64_v16;
// Unary negate elements: each packed element is subtracted from zero.
// The argument is parenthesized so the cast applies to the whole
// expression passed in, not just its first operand.
#define mm64_negate_32( v ) _mm_sub_pi32( m64_zero, (__m64)(v) )
#define mm64_negate_16( v ) _mm_sub_pi16( m64_zero, (__m64)(v) )
#define mm64_negate_8( v )  _mm_sub_pi8(  m64_zero, (__m64)(v) )
// Rotate bits in packed elements of 64 bit vector
#define mm64_rol_32( a, n ) \
@@ -206,15 +261,32 @@ typedef union _m64_v16 m64_v16;
#if defined(__SSSE3__)
// Endian byte swap packed elements
// A vectorized version of the u64 bswap, use when data already in MMX reg.
#define mm64_bswap_64( v ) \
   _mm_shuffle_pi8( (__m64)(v), _mm_set_pi8( 0,1,2,3,4,5,6,7 ) )
#define mm64_bswap_32( v ) \
   _mm_shuffle_pi8( (__m64)(v), _mm_set_pi8( 4,5,6,7, 0,1,2,3 ) )
// No trailing semicolon, the macro must be usable inside an expression.
#define mm64_bswap_16( v ) \
   _mm_shuffle_pi8( (__m64)(v), _mm_set_pi8( 6,7, 4,5, 2,3, 0,1 ) )
#else
#define mm64_bswap_64( v ) \
   (__m64)__builtin_bswap64( (uint64_t)(v) )
// Without pshufb each 32 bit half is extracted from the 64 bit value,
// byte swapped on its own and put back in the same position. The halves
// are taken by shifting the value, not by dereferencing it as a pointer.
// v is evaluated twice.
#define mm64_bswap_32( v ) \
   _mm_set_pi32( __builtin_bswap32( (uint32_t)( (uint64_t)(v) >> 32 ) ), \
                 __builtin_bswap32( (uint32_t)( (uint64_t)(v) ) ) )
#endif
// Invert vector: {3,2,1,0} -> {0,1,2,3}
// Invert_64 is the same as bswap64
// Invert_32 is the same as swap32
// Control 0x1b selects source words 3,2,1,0 for destination words 0,1,2,3.
#define mm64_invert_16( v ) _mm_shuffle_pi16( (__m64)(v), 0x1b )
#if defined(__SSSE3__)
@@ -237,6 +309,12 @@ static inline void memset_zero_64( __m64 *src, int n )
// Store the 64 bit vector a into n consecutive elements starting at dst.
// Nothing is written when n is zero or negative.
static inline void memset_64( __m64 *dst, const __m64 a, int n )
{
   while ( n-- > 0 )
      *dst++ = a;
}
// Broadcast copy (the b is for broadcast): every one of the n destination
// vectors receives src replicated in both 32 bit elements.
// Don't use in hybrid hash, interleave.
static inline void mem_bcpy_32( __m64 *dst, const uint32_t src, int n )
{
   const __m64 fill = _mm_set1_pi32( src );
   while ( n-- > 0 )
      *dst++ = fill;
}
//////////////////////////////////////////////////////////////////
//
@@ -644,57 +722,57 @@ do { \
#define mm128_ror1x64_256( v1, v2 ) \
do { \
__m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 24 ); \
v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 24 ); \
__m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 8 ); \
v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 8 ); \
v1 = t; \
} while(0)
#define mm128_rol1x64_256( v1, v2 ) \
do { \
__m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 24 ); \
v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 24 ); \
__m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 8 ); \
v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 8 ); \
v1 = t; \
} while(0)
#define mm128_ror1x32_256( v1, v2 ) \
do { \
__m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 28 ); \
v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 28 ); \
__m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 12 ); \
v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_rol1x32_256( v1, v2 ) \
do { \
__m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 28 ); \
v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 28 ); \
__m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 12 ); \
v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 12 ); \
v1 = t; \
} while(0)
#define mm128_ror1x16_256( v1, v2 ) \
do { \
__m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 30 ); \
v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 30 ); \
__m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 14 ); \
v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_rol1x16_256( v1, v2 ) \
do { \
__m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 30 ); \
v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 30 ); \
__m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 14 ); \
v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 14 ); \
v1 = t; \
} while(0)
#define mm128_ror1x8_256( v1, v2 ) \
do { \
__m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 31 ); \
v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 31 ); \
__m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 15 ); \
v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 15 ); \
v1 = t; \
} while(0)
#define mm128_rol1x8_256( v1, v2 ) \
do { \
__m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 31 ); \
v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 31 ); \
__m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 15 ); \
v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 15 ); \
v1 = t; \
} while(0)
@@ -1919,6 +1997,7 @@ static inline __m64 mmx_compile_test( __m64 a )
m = _mm_shuffle_pi8( m, (__m64)0x0102030405060708 );
i = (uint64_t) mm64_ror_32( (__m64)i, 7 );
casti_m64( n, 2 ) = m;
m = (__m64)__builtin_bswap64( (uint64_t)m );
return a;
}

22
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.1.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.2.4.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.9.1'
PACKAGE_STRING='cpuminer-opt 3.9.1'
PACKAGE_VERSION='3.9.2.4'
PACKAGE_STRING='cpuminer-opt 3.9.2.4'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.9.1 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.9.2.4 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.9.1:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.9.2.4:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.9.1
cpuminer-opt configure 3.9.2.4
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.9.1, which was
It was created by cpuminer-opt $as_me 3.9.2.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.9.1'
VERSION='3.9.2.4'
cat >>confdefs.h <<_ACEOF
@@ -5884,7 +5884,7 @@ fi
# GC2 for GNU static
if test "x$OS" = "xWindows_NT" ; then
if test "x$have_win32" = "xtrue" ; then
# MinGW
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_create in -lpthread" >&5
$as_echo_n "checking for pthread_create in -lpthread... " >&6; }
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.9.1, which was
This file was extended by cpuminer-opt $as_me 3.9.2.4, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.9.1
cpuminer-opt config.status 3.9.2.4
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.9.1])
AC_INIT([cpuminer-opt], [3.9.2.4])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
@@ -106,7 +106,7 @@ fi
AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
# GC2 for GNU static
if test "x$OS" = "xWindows_NT" ; then
if test "x$have_win32" = "xtrue" ; then
# MinGW
AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",[])
else

View File

@@ -105,10 +105,12 @@ enum algos opt_algo = ALGO_NULL;
int opt_scrypt_n = 0;
int opt_pluck_n = 128;
int opt_n_threads = 0;
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
__int128_t opt_affinity = -1LL;
// Windows doesn't support 128 bit affinity mask.
#if defined(__linux) && defined(GCC_INT128)
#define AFFINITY_USES_UINT128 1
uint128_t opt_affinity = -1LL;
#else
int64_t opt_affinity = -1LL;
uint64_t opt_affinity = -1LL;
#endif
int opt_priority = 0;
int num_cpus = 1;
@@ -203,7 +205,8 @@ static inline void drop_policy(void)
#define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */
#endif
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
// Linux affinity can use int128.
#if AFFINITY_USES_UINT128
static void affine_to_cpu_mask( int id, unsigned __int128 mask )
#else
static void affine_to_cpu_mask( int id, unsigned long long mask )
@@ -216,7 +219,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
for ( uint8_t i = 0; i < ncpus; i++ )
{
// cpu mask
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
#if AFFINITY_USES_UINT128
if( ( mask & ( (unsigned __int128)1ULL << i ) ) ) CPU_SET( i, &set );
#else
if( (ncpus > 64) || ( mask & (1ULL << i) ) ) CPU_SET( i, &set );
@@ -237,6 +240,7 @@ static void affine_to_cpu_mask( int id, unsigned long long mask )
#elif defined(WIN32) /* Windows */
static inline void drop_policy(void) { }
// Windows CPU groups to manage more than 64 CPUs.
static void affine_to_cpu_mask( int id, unsigned long mask )
{
bool success;
@@ -263,7 +267,7 @@ static void affine_to_cpu_mask( int id, unsigned long mask )
break;
cpu -= cpus;
}
}
if (opt_debug)
applog(LOG_DEBUG, "Binding thread %d to cpu %d on cpu group %d (mask %x)", id, cpu, group, (1ULL << cpu));
@@ -847,7 +851,8 @@ static int share_result( int result, struct work *work, const char *reason )
float rate;
char rate_s[8] = {0};
double sharediff = work ? work->sharediff : stratum.sharediff;
bool solved = result && (net_diff > 0.0 ) && ( sharediff >= net_diff );
bool solved = result && accepted_share_count && (net_diff > 0.0 )
&& ( sharediff >= net_diff );
char sol[32] = {0};
int i;
@@ -857,15 +862,17 @@ static int share_result( int result, struct work *work, const char *reason )
hashcount += thr_hashcount[i];
hashrate += thr_hashrates[i];
}
solved = result && ( (uint64_t)hashcount > 0 ) && (net_diff > 0.0 )
&& ( sharediff >= net_diff );
result ? accepted_share_count++ : rejected_share_count++;
if ( solved )
{
solved_block_count++;
if ( use_colors )
sprintf( sol, CL_GRN " Solved" CL_WHT " %d", solved_block_count );
sprintf( sol, CL_GRN " Solved: %d" CL_WHT, solved_block_count );
else
sprintf( sol, " Solved %d", solved_block_count );
sprintf( sol, ", Solved: %d", solved_block_count );
}
pthread_mutex_unlock(&stats_lock);
@@ -1839,26 +1846,42 @@ static void *miner_thread( void *userdata )
}
else
*/
if ( num_cpus > 1 )
{
if ( (opt_affinity == -1LL) && (opt_n_threads) > 1 )
{
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)",
thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) );
#if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
affine_to_cpu_mask( thr_id,
(unsigned __int128)1LL << (thr_id % num_cpus) );
#if AFFINITY_USES_UINT128
// Default affinity
if ( (opt_affinity == i128_neg1 ) && opt_n_threads > 1 )
{
if ( opt_debug )
applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
thr_id, thr_id % num_cpus,
i128_hi64( (uint128_t)1ULL << (thr_id % num_cpus) ),
i128_lo64( (uint128_t)1ULL << (thr_id % num_cpus) ) );
affine_to_cpu_mask( thr_id, (uint128_t)1ULL << (thr_id % num_cpus) );
}
#else
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
#endif
}
else if (opt_affinity != -1)
{
if ( (opt_affinity == -1LL) && opt_n_threads > 1 )
{
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to cpu mask %x",
thr_id, opt_affinity);
affine_to_cpu_mask( thr_id, opt_affinity );
applog( LOG_DEBUG, "Binding thread %d to cpu %d.",
thr_id, thr_id % num_cpus, 1LL << (thr_id % num_cpus)) ;
affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) );
}
#endif
else // Custom affinity
{
#if AFFINITY_USES_UINT128
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to mask %016llx %016llx",
thr_id, i128_hi64( opt_affinity ),
i128_lo64( opt_affinity ) );
#else
if (opt_debug)
applog( LOG_DEBUG, "Binding thread %d to mask %016llx",
thr_id, opt_affinity );
#endif
affine_to_cpu_mask( thr_id, opt_affinity );
}
}
@@ -2894,13 +2917,21 @@ void parse_arg(int key, char *arg )
break;
case 1020:
p = strstr(arg, "0x");
if (p)
ul = strtoul(p, NULL, 16);
if ( p )
ul = strtoull( p, NULL, 16 );
else
ul = atol(arg);
if (ul > (1UL<<num_cpus)-1)
ul = -1;
opt_affinity = ul;
ul = atoll( arg );
// if ( ul > ( 1ULL << num_cpus ) - 1ULL )
// ul = -1LL;
#if AFFINITY_USES_UINT128
// replicate the low 64 bits to make a full 128 bit mask if there are more
// than 64 CPUs, otherwise zero extend the upper half.
opt_affinity = (uint128_t)ul;
if ( num_cpus > 64 )
opt_affinity = (opt_affinity << 64 ) | (uint128_t)ul;
#else
opt_affinity = ul;
#endif
break;
case 1021:
v = atoi(arg);
@@ -3299,20 +3330,18 @@ int main(int argc, char *argv[])
}
if (!rpc_userpass)
{
{
rpc_userpass = (char*) malloc(strlen(rpc_user) + strlen(rpc_pass) + 2);
if (rpc_userpass)
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
else
return 1;
if (rpc_userpass)
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
else
return 1;
}
// All options must be set before starting the gate
if ( !register_algo_gate( opt_algo, &algo_gate ) )
exit(1);
// All options must be set before starting the gate
if ( !register_algo_gate( opt_algo, &algo_gate ) ) exit(1);
if ( !check_cpu_capability() )
exit(1);
if ( !check_cpu_capability() ) exit(1);
pthread_mutex_init(&stats_lock, NULL);
pthread_mutex_init(&g_work_lock, NULL);
@@ -3325,7 +3354,7 @@ int main(int argc, char *argv[])
? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
: CURL_GLOBAL_ALL;
if (curl_global_init(flags))
{
{
applog(LOG_ERR, "CURL initialization failed");
return 1;
}
@@ -3384,6 +3413,8 @@ int main(int argc, char *argv[])
if ( num_cpus != opt_n_threads )
applog( LOG_INFO,"%u CPU cores available, %u miner threads selected.",
num_cpus, opt_n_threads );
// To be reviewed
if ( opt_affinity != -1 )
{
if ( num_cpus > 64 )

View File

@@ -43,8 +43,127 @@
//
// AVX512: 4x128, 8x64, 16x32
//
// Interleaving and deinterleaving is done in blocks of 16*16, 32*32,
// or 64*64 bytes for SSE2, AVX2 and AVX512 vectors respectively.
// Interleaving and deinterleaving is done in blocks of 8*8, 16*16, 32*32,
// or 64*64 bytes for MMX, SSE2, AVX2 and AVX512 vectors respectively.
//////////////////////////////////////////////////////
//
// MMX 64 bit vectors
// Build a 64 bit vector from two 32 bit words read through the pointers
// s0 and s1. _mm_set_pi32 takes the high element first, so the word at s0
// becomes the low element and the word at s1 the high element.
#define mm64_put_32( s0, s1 ) \
_mm_set_pi32( *((const uint32_t*)(s1)), *((const uint32_t*)(s0)) )
// Build a 64 bit vector from two 32 bit words of the array at s, selected
// by index: word i0 becomes the low element, word i1 the high element.
#define mm64_get_32( s, i0, i1 ) \
_mm_set_pi32( ((const uint32_t*)(s))[i1], ((const uint32_t*)(s))[i0] )
// 2 lanes of 32 bit data interleaved into 64 bit MMX vectors: output vector
// i holds 32 bit word i of s0 in its low element and word i of s1 in its
// high element.
// len selects how many words are taken from each lane: 8 when len <= 256,
// 16 when len <= 512, 20 when len <= 640, otherwise 32.
static inline void mm64_interleave_2x32( void *d, const void *s0,
                                         const void *s1, int len )
{
   const int words = ( len <= 256 ) ?  8
                   : ( len <= 512 ) ? 16
                   : ( len <= 640 ) ? 20 : 32;

   // Sources are advanced in bytes, 4 per 32 bit word.
   for ( int i = 0; i < words; i++ )
      casti_m64( d, i ) = mm64_put_32( s0 + 4*i, s1 + 4*i );
}
// Split 2x32 interleaved data at s back into two separate lanes. Even
// numbered 32 bit words of s go to d00, odd numbered words go to d01, two
// words per 64 bit output vector.
// len selects how many vectors are written to each lane: 4 when len <= 256,
// 8 when len <= 512, 10 when len <= 640, otherwise 16.
// n is not used by this function.
static inline void mm64_deinterleave_2x32( void *d00, void *d01,
                              const int n, const void *s, int len )
{
   const int vecs = ( len <= 256 ) ?  4
                  : ( len <= 512 ) ?  8
                  : ( len <= 640 ) ? 10 : 16;

   for ( int i = 0; i < vecs; i++ )
   {
      const int w = 4 * i;    // first source word consumed by this step
      casti_m64( d00, i ) = mm64_get_32( s, w,     w + 2 );
      casti_m64( d01, i ) = mm64_get_32( s, w + 1, w + 3 );
   }
}
// Extract one lane from 2x32 interleaved data at s into d.
// With two lanes, 32 bit word k of a lane is stored at index 2*k + lane,
// so consecutive words of the same lane are 2 indexes apart (a stride of 4
// belongs to 4 lane data and would skip words and overrun the source).
// bit_len <= 256 extracts 8 words (4 vectors), otherwise 16 words
// (8 vectors) are extracted.
static inline void mm64_extract_lane_2x32( void *d, const void *s,
                                const int lane, const int bit_len )
{
   casti_m64( d, 0 ) = mm64_get_32( s, lane    , lane+ 2 );
   casti_m64( d, 1 ) = mm64_get_32( s, lane+ 4, lane+ 6 );
   casti_m64( d, 2 ) = mm64_get_32( s, lane+ 8, lane+10 );
   casti_m64( d, 3 ) = mm64_get_32( s, lane+12, lane+14 );
   if ( bit_len <= 256 ) return;
   casti_m64( d, 4 ) = mm64_get_32( s, lane+16, lane+18 );
   casti_m64( d, 5 ) = mm64_get_32( s, lane+20, lane+22 );
   casti_m64( d, 6 ) = mm64_get_32( s, lane+24, lane+26 );
   casti_m64( d, 7 ) = mm64_get_32( s, lane+28, lane+30 );
   // bit_len == 512
}
///////////////////////////////////////////////////////////////
@@ -356,6 +475,36 @@ static inline void mm256_interleave_8x32x256( void *d, const void *s00,
s04+28, s05+28, s06+28, s07+28 );
}
// Interleave 256 bits from each of 8 sources, 32 bits at a time, applying
// mm256_bswap_32 to every assembled vector. Writes 8 vectors to d; vector
// i is built from the 32 bit word at byte offset 4*i of each source.
static inline void mm256_be_interleave_8x32x256( void *d, const void *s00,
     const void *s01, const void *s02, const void *s03, const void *s04,
     const void *s05, const void *s06, const void *s07 )
{
   for ( int row = 0; row < 8; row++ )
   {
      const int off = 4 * row;    // byte offset into every source
      casti_m256i( d, row ) = mm256_bswap_32(
             mm256_put_32( s00+off, s01+off, s02+off, s03+off,
                           s04+off, s05+off, s06+off, s07+off ) );
   }
}
static inline void mm256_interleave_8x32x128( void *d, const void *s00,
const void *s01, const void *s02, const void *s03, const void *s04,
const void *s05, const void *s06, const void *s07 )
@@ -370,6 +519,24 @@ static inline void mm256_interleave_8x32x128( void *d, const void *s00,
s04+12, s05+12, s06+12, s07+12 );
}
// Same as the 256 bit variant but only 128 bits (four 32 bit words) are
// taken from each of the 8 sources, producing 4 vectors at d. Each vector
// is passed through mm256_bswap_32 as it is assembled.
static inline void mm256_be_interleave_8x32x128( void *d, const void *s00,
     const void *s01, const void *s02, const void *s03, const void *s04,
     const void *s05, const void *s06, const void *s07 )
{
   for ( int row = 0; row < 4; row++ )
   {
      const int off = 4 * row;    // byte offset into every source
      casti_m256i( d, row ) = mm256_bswap_32(
             mm256_put_32( s00+off, s01+off, s02+off, s03+off,
                           s04+off, s05+off, s06+off, s07+off ) );
   }
}
// can be called directly for 32 byte hash using AVX2
static inline void mm256_deinterleave_8x32x256( void *d00, void *d01,
void *d02, void *d03, void *d04, void *d05, void *d06,
@@ -394,6 +561,21 @@ static inline void mm256_interleave_4x64x256( void *d, const void *s0,
casti_m256i( d,3 ) = mm256_put_64( s0+24, s1+24, s2+24, s3+24 );
}
// Byte swap the data as it's interleaved: 256 bits from each of 4 sources
// are interleaved 64 bits at a time and every assembled vector is passed
// through mm256_bswap_32.
// "be" is a bit of a misnomer, but it is nice and short.
static inline void mm256_be_interleave_4x64x256( void *d, const void *s0,
                     const void *s1, const void *s2, const void *s3 )
{
   for ( int row = 0; row < 4; row++ )
   {
      const int off = 8 * row;    // byte offset into every source
      casti_m256i( d, row ) = mm256_bswap_32(
             mm256_put_64( s0+off, s1+off, s2+off, s3+off ) );
   }
}
static inline void mm256_interleave_4x64x128( void *d, const void *s0,
const void *s1, const void *s2, const void *s3 )
{
@@ -401,6 +583,14 @@ static inline void mm256_interleave_4x64x128( void *d, const void *s0,
casti_m256i( d,1 ) = mm256_put_64( s0+ 8, s1+ 8, s2+ 8, s3+ 8 );
}
// 128 bit variant: two 64 bit words are taken from each of the 4 sources,
// giving 2 vectors at d, each passed through mm256_bswap_32.
static inline void mm256_be_interleave_4x64x128( void *d, const void *s0,
                     const void *s1, const void *s2, const void *s3 )
{
   for ( int row = 0; row < 2; row++ )
   {
      const int off = 8 * row;    // byte offset into every source
      casti_m256i( d, row ) = mm256_bswap_32(
             mm256_put_64( s0+off, s1+off, s2+off, s3+off ) );
   }
}
// 4 lanes of 256 bits using 64 bit interleaving (standard final hash size)
static inline void mm256_deinterleave_4x64x256( void *d0, void *d1, void *d2,
@@ -496,6 +686,28 @@ static inline void mm256_interleave_8x32( void *d, const void *s0,
// bit_len == 1024
}
// Byte swapping 8x32 interleave of bit_len bits per lane, built from the
// fixed size helpers. Each 256 bit step consumes 32 bytes from every source
// and produces 256 bytes at d.
//   bit_len <= 256 : one 256 bit step
//   bit_len <= 512 : two 256 bit steps
//   bit_len <= 640 : two 256 bit steps and one 128 bit step
//   otherwise      : four 256 bit steps (1024 bits)
static inline void mm256_be_interleave_8x32( void *d, const void *s0,
        const void *s1, const void *s2, const void *s3, const void *s4,
        const void *s5, const void *s6, const void *s7, int bit_len )
{
   mm256_be_interleave_8x32x256( d, s0, s1, s2, s3, s4, s5, s6, s7 );

   if ( bit_len > 256 )
   {
      mm256_be_interleave_8x32x256( d+256, s0+32, s1+32, s2+32, s3+32,
                                           s4+32, s5+32, s6+32, s7+32 );
      if ( bit_len > 640 )
      {
         // bit_len == 1024
         mm256_be_interleave_8x32x256( d+512, s0+64, s1+64, s2+64, s3+64,
                                              s4+64, s5+64, s6+64, s7+64 );
         mm256_be_interleave_8x32x256( d+768, s0+96, s1+96, s2+96, s3+96,
                                              s4+96, s5+96, s6+96, s7+96 );
      }
      else if ( bit_len > 512 )
         mm256_be_interleave_8x32x128( d+512, s0+64, s1+64, s2+64, s3+64,
                                              s4+64, s5+64, s6+64, s7+64 );
   }
}
/*
// Slower but it works with 32 bit data
// bit_len must be multiple of 32
@@ -595,6 +807,23 @@ static inline void mm256_interleave_4x64( void *d, const void *s0,
mm256_interleave_4x64x256( d+384, s0+96, s1+96, s2+96, s3+96 );
}
// Byte swapping 4x64 interleave of bit_len bits per lane, built from the
// fixed size helpers. Each 256 bit step consumes 32 bytes from every source
// and produces 128 bytes at d.
//   bit_len <= 256 : one 256 bit step
//   bit_len <= 512 : two 256 bit steps
//   bit_len <= 640 : two 256 bit steps and one 128 bit step
//   otherwise      : four 256 bit steps (1024 bits)
static inline void mm256_be_interleave_4x64( void *d, const void *s0,
          const void *s1, const void *s2, const void *s3, int bit_len )
{
   mm256_be_interleave_4x64x256( d, s0, s1, s2, s3 );

   if ( bit_len > 256 )
   {
      mm256_be_interleave_4x64x256( d+128, s0+32, s1+32, s2+32, s3+32 );
      if ( bit_len > 640 )
      {
         // bit_len == 1024
         mm256_be_interleave_4x64x256( d+256, s0+64, s1+64, s2+64, s3+64 );
         mm256_be_interleave_4x64x256( d+384, s0+96, s1+96, s2+96, s3+96 );
      }
      else if ( bit_len > 512 )
         mm256_be_interleave_4x64x128( d+256, s0+64, s1+64, s2+64, s3+64 );
   }
}
/*
// Slower version
// bit_len must be multiple of 64
@@ -676,7 +905,9 @@ static inline void mm256_extract_lane_4x64( void *d, const void *s,
// Convert from 4x32 SSE2 interleaving to 4x64 AVX2.
// Can't do it in place
static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
#define mm256_reinterleave_4x64 mm256_reinterleave_4x32_4x64
static inline void mm256_reinterleave_4x32_4x64( void *dst, void *src,
int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
@@ -736,7 +967,9 @@ static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
// Convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
#define mm256_reinterleave_4x32 mm256_reinterleave_4x64_4x32
static inline void mm256_reinterleave_4x64_4x32( void *dst, void *src,
int bit_len )
{
__m256i *d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
@@ -862,7 +1095,8 @@ static inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
}
*/
static inline void mm256_interleave_2x128( const void *d, const void *s0,
#define mm256_interleave_2x128 mm256_interleave_1x128
static inline void mm256_interleave_1x128( const void *d, const void *s0,
void *s1, const int bit_len )
{
casti_m256i( d, 0 ) = mm256_put_64( s0 , s0+ 8, s1 , s1+ 8 );
@@ -879,7 +1113,8 @@ static inline void mm256_interleave_2x128( const void *d, const void *s0,
// bit_len == 1024
}
static inline void mm256_deinterleave_2x128( void *d0, void *d1, void *s,
#define mm256_deinterleave_2x128 mm256_deinterleave_1x128
static inline void mm256_deinterleave_1x128( void *d0, void *d1, void *s,
int bit_len )
{
mm256_deinterleave_2x128x256( d0, d1, 0, s );
@@ -1078,38 +1313,38 @@ static inline void mm512_deinterleave_16x32x512( void *d00, void *d01,
void *d12, void *d13, void *d14, void *d15, const int n,
const void *s )
{
casti_m512i(d00,n) = mm512_get_32( s, 0, 16, 32, 48, 64, 80, 96, 112,
128, 144, 160, 176, 192, 208, 224, 240 );
casti_m512i(d01,n) = mm512_get_32( s, 1, 17, 33, 49, 65, 81, 97, 113,
129, 145, 161, 177, 193, 209, 225, 241 );
casti_m512i(d02,n) = mm512_get_32( s, 2, 18, 34, 50, 66, 82, 98, 114,
130, 146, 162, 178, 194, 210, 226, 242 );
casti_m512i(d03,n) = mm512_get_32( s, 3, 19, 35, 51, 67, 83, 99, 115,
131, 147, 163, 179, 195, 211, 227, 243 );
casti_m512i(d04,n) = mm512_get_32( s, 4, 20, 36, 52, 68, 84, 100, 116,
132, 148, 164, 180, 196, 212, 228, 244 );
casti_m512i(d05,n) = mm512_get_32( s, 5, 21, 37, 53, 69, 85, 101, 117,
133, 149, 165, 181, 197, 213, 229, 245 );
casti_m512i(d06,n) = mm512_get_32( s, 6, 22, 38, 54, 70, 86, 102, 118,
134, 150, 166, 182, 198, 214, 230, 246 );
casti_m512i(d07,n) = mm512_get_32( s, 7, 23, 39, 55, 71, 87, 103, 119,
135, 151, 167, 183, 199, 215, 231, 247 );
casti_m512i(d08,n) = mm512_get_32( s, 8, 24, 40, 56, 72, 88, 104, 120,
136, 152, 168, 184, 200, 216, 232, 248 );
casti_m512i(d09,n) = mm512_get_32( s, 9, 25, 41, 57, 73, 89, 105, 121,
137, 153, 169, 185, 201, 217, 233, 249 );
casti_m512i(d10,n) = mm512_get_32( s, 10, 26, 42, 58, 74, 90, 106, 122,
138, 154, 170, 186, 202, 218, 234, 250 );
casti_m512i(d11,n) = mm512_get_32( s, 11, 27, 43, 59, 75, 91, 107, 123,
139, 155, 171, 187, 203, 219, 235, 251 );
casti_m512i(d12,n) = mm512_get_32( s, 12, 28, 44, 60, 76, 92, 108, 124,
140, 156, 172, 188, 204, 220, 236, 252 );
casti_m512i(d13,n) = mm512_get_32( s, 13, 29, 45, 61, 77, 93, 109, 125,
141, 157, 173, 189, 205, 221, 237, 253 );
casti_m512i(d14,n) = mm512_get_32( s, 14, 30, 46, 62, 78, 94, 110, 126,
142, 158, 174, 190, 206, 222, 238, 254 );
casti_m512i(d15,n) = mm512_get_32( s, 15, 31, 47, 63, 79, 95, 111, 127,
143, 159, 175, 191, 207, 223, 239, 255 );
casti_m512i(d00,n) = mm512_get_32( s, 0, 16, 32, 48, 64, 80, 96,112,
128,144,160,176,192,208,224,240 );
casti_m512i(d01,n) = mm512_get_32( s, 1, 17, 33, 49, 65, 81, 97,113,
129,145,161,177,193,209,225,241 );
casti_m512i(d02,n) = mm512_get_32( s, 2, 18, 34, 50, 66, 82, 98,114,
130,146,162,178,194,210,226,242 );
casti_m512i(d03,n) = mm512_get_32( s, 3, 19, 35, 51, 67, 83, 99,115,
131,147,163,179,195,211,227,243 );
casti_m512i(d04,n) = mm512_get_32( s, 4, 20, 36, 52, 68, 84,100,116,
132,148,164,180,196,212,228,244 );
casti_m512i(d05,n) = mm512_get_32( s, 5, 21, 37, 53, 69, 85,101,117,
133,149,165,181,197,213,229,245 );
casti_m512i(d06,n) = mm512_get_32( s, 6, 22, 38, 54, 70, 86,102,118,
134,150,166,182,198,214,230,246 );
casti_m512i(d07,n) = mm512_get_32( s, 7, 23, 39, 55, 71, 87,103,119,
135,151,167,183,199,215,231,247 );
casti_m512i(d08,n) = mm512_get_32( s, 8, 24, 40, 56, 72, 88,104,120,
136,152,168,184,200,216,232,248 );
casti_m512i(d09,n) = mm512_get_32( s, 9, 25, 41, 57, 73, 89,105,121,
137,153,169,185,201,217,233,249 );
casti_m512i(d10,n) = mm512_get_32( s, 10, 26, 42, 58, 74, 90,106,122,
138,154,170,186,202,218,234,250 );
casti_m512i(d11,n) = mm512_get_32( s, 11, 27, 43, 59, 75, 91,107,123,
139,155,171,187,203,219,235,251 );
casti_m512i(d12,n) = mm512_get_32( s, 12, 28, 44, 60, 76, 92,108,124,
140,156,172,188,204,220,236,252 );
casti_m512i(d13,n) = mm512_get_32( s, 13, 29, 45, 61, 77, 93,109,125,
141,157,173,189,205,221,237,253 );
casti_m512i(d14,n) = mm512_get_32( s, 14, 30, 46, 62, 78, 94,110,126,
142,158,174,190,206,222,238,254 );
casti_m512i(d15,n) = mm512_get_32( s, 15, 31, 47, 63, 79, 95,111,127,
143,159,175,191,207,223,239,255 );
}
static inline void mm512_interleave_8x64x512( void *d, const void *s0,
@@ -1363,6 +1598,99 @@ static inline void mm512_deinterleave_4x128( void *d0, void *d1, void *d2,
mm512_deinterleave_4x128x512( d0, d1, d2, d3, 1, s+256 );
}
// Input one 8x64 buffer and return two 4x128 buffers: dst0 receives lanes
// 0-3 and dst1 receives lanes 4-7.
// In the 8x64 source, 64 bit word w of lane l is at index 8*w + l. Output
// vector j of either destination holds words 2*j and 2*j+1 of its four
// lanes, one lane per 128 bit element. This is the layout read back by
// mm512_reinterleave_4x128_8x64.
// bit_len <= 512 writes 4 vectors to each destination, otherwise 8 vectors
// (1024 bits per lane) are written.
static inline void mm512_reinterleave_8x64_4x128( void *dst0, void *dst1,
                                          const void *src, int bit_len )
{
   __m512i *d0 = (__m512i*)dst0;
   __m512i *d1 = (__m512i*)dst1;
   const uint64_t *s = (const uint64_t*)src;
   const int nvec = ( bit_len <= 512 ) ? 4 : 8;

   // Every output vector consumes two source rows of 8 words: s[0..7] is
   // word 2*j of all 8 lanes, s[8..15] is word 2*j+1.
   for ( int j = 0; j < nvec; j++, s += 16 )
   {
      d0[j] = _mm512_set_epi64( s[11], s[ 3], s[10], s[ 2],
                                s[ 9], s[ 1], s[ 8], s[ 0] );
      d1[j] = _mm512_set_epi64( s[15], s[ 7], s[14], s[ 6],
                                s[13], s[ 5], s[12], s[ 4] );
   }
}
// Convert two 4x128 interleaved buffers (4 lanes each, 128 bit interleave)
// into one 8x64 interleaved buffer (8 lanes, 64 bit interleave).
//
// Layout, counted in 64 bit words:
//    4x128: word w of lane l is at src[ 8*(w/2) + 2*l + (w&1) ]  l = 0..3
//    8x64 : word w of lane l is at dst[ 8*w + l ]                l = 0..7
//
// src0 supplies lanes 0..3 and src1 supplies lanes 4..7.
//
// bit_len is the data length per lane: 512 bits (8 words) are converted when
// bit_len <= 512, otherwise 1024 bits (16 words). dst must hold 8 times that
// amount and each source 4 times that amount. Words 8..15 are written after
// words 0..7 in dst, they do not replace them.
//
// The indices are computed from the two layout formulas rather than listed
// by hand so the lane/word mapping cannot drift between table rows.
static inline void mm512_reinterleave_4x128_8x64( void *dst, const void *src0,
                                            const void *src1, int bit_len )
{
   uint64_t *d = (uint64_t*)dst;
   const uint64_t *s0 = (const uint64_t*)src0;
   const uint64_t *s1 = (const uint64_t*)src1;
   const int words = ( bit_len <= 512 ) ? 8 : 16;

   for ( int w = 0; w < words; w++ )
   {
      // One 8x64 row holds word w of all 8 lanes.
      uint64_t *row = d + 8*w;
      // Position of word w of lane 0 inside a 4x128 buffer.
      const int base = 8 * ( w >> 1 ) + ( w & 1 );

      for ( int l = 0; l < 4; l++ )
      {
         row[ l     ] = s0[ base + 2*l ];   // lanes 0..3
         row[ l + 4 ] = s1[ base + 2*l ];   // lanes 4..7
      }
   }
}
static inline void mm512_extract_lane_4x128( void *d, const void *s,
const int lane, const int bit_len )
{

View File

@@ -538,6 +538,7 @@ enum algos {
ALGO_SCRYPTJANE,
ALGO_SHA256D,
ALGO_SHA256T,
ALGO_SHA256Q,
ALGO_SHAVITE3,
ALGO_SKEIN,
ALGO_SKEIN2,
@@ -625,6 +626,7 @@ static const char* const algo_names[] = {
"scryptjane",
"sha256d",
"sha256t",
"sha256q",
"shavite3",
"skein",
"skein2",
@@ -774,7 +776,8 @@ Options:\n\
scryptjane:nf\n\
sha256d Double SHA-256\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
shavite3 Shavite3\n\
sha256q Quad SHA-256, Pyrite (PYE)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\

View File

@@ -19,7 +19,7 @@ export CONFIGURE_ARGS="--with-curl=$LOCAL_LIB/curl --with-crypto=$LOCAL_LIB/open
ln -s $LOCAL_LIB/gmp/gmp.h ./gmp.h
# edit configure to fix pthread lib name for Windows.
sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
#sed -i 's/"-lpthread"/"-lpthreadGC2"/g' configure.ac
# make release directory and copy selected DLLs.
mkdir release