This commit is contained in:
Jay D Dee
2023-09-13 11:48:52 -04:00
parent 4378d2f841
commit d6b5750362
28 changed files with 1626 additions and 1327 deletions

View File

@@ -1,60 +1,15 @@
/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
/**
* BLAKE interface. BLAKE is a family of functions which differ by their
* output size; this implementation defines BLAKE for output sizes 224,
* 256, 384 and 512 bits. This implementation conforms to the "third
* round" specification.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_blake.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__ 1
#ifdef __cplusplus
extern "C"{
#endif
#ifndef BLAKE_HASH_4WAY__
#define BLAKE_HASH_4WAY__ 1
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "simd-utils.h"
#define SPH_SIZE_blake256 256
#define SPH_SIZE_blake512 512
/////////////////////////
//
// Blake-256 1 way SSE2
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 );
const uint32_t T0, const uint32_t T1, int rounds );
/////////////////////////
//
@@ -75,13 +30,13 @@ typedef struct {
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_4way_small_context __attribute__ ((aligned (64)));
// Default, 14 rounds, blake, decred
// Default, 14 rounds
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *ctx);
void blake256_4way_update(void *ctx, const void *data, size_t len);
void blake256_4way_close(void *ctx, void *dst);
// 14 rounds, blake, decred
// 14 rounds
typedef blake_4way_small_context blake256r14_4way_context;
void blake256r14_4way_init(void *cc);
void blake256r14_4way_update(void *cc, const void *data, size_t len);
@@ -103,7 +58,7 @@ typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
__m256i H[8];
size_t ptr;
sph_u32 T0, T1;
uint32_t T0, T1;
int rounds; // 14 for blake, 8 for blakecoin & vanilla
} blake_8way_small_context;
@@ -117,7 +72,7 @@ void blake256_8way_close_le(void *cc, void *dst);
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
typedef blake_8way_small_context blake256r14_8way_context;
@@ -138,7 +93,7 @@ typedef struct {
__m256i H[8];
__m256i S[4];
size_t ptr;
sph_u64 T0, T1;
uint64_t T0, T1;
} blake_4way_big_context __attribute__ ((aligned (128)));
typedef blake_4way_big_context blake512_4way_context;
@@ -180,7 +135,7 @@ void blake256_16way_close_le(void *cc, void *dst);
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
void *data );
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data );
const void *midhash, const void *data, const int rounds );
// 14 rounds, blake, decred
@@ -204,7 +159,7 @@ typedef struct {
__m512i H[8];
__m512i S[4];
size_t ptr;
sph_u64 T0, T1;
uint64_t T0, T1;
} blake_8way_big_context __attribute__ ((aligned (128)));
typedef blake_8way_big_context blake512_8way_context;
@@ -224,8 +179,4 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
#endif // AVX512
#endif // AVX2
#ifdef __cplusplus
}
#endif
#endif // BLAKE_HASH_4WAY_H__

View File

@@ -40,26 +40,6 @@
#include "blake-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
#define SPH_SMALL_FOOTPRINT_BLAKE 1
#endif
#if SPH_SMALL_FOOTPRINT_BLAKE
#define SPH_COMPACT_BLAKE_32 1
#endif
#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
#define SPH_COMPACT_BLAKE_64 1
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
// Blake-256
static const uint32_t IV256[8] =
@@ -68,7 +48,7 @@ static const uint32_t IV256[8] =
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
#if 0
// Blake-256 4 & 8 way, Blake-512 4 way
@@ -273,44 +253,28 @@ static const unsigned sigma[16][16] = {
#define CSx_(n) CSx__(n)
#define CSx__(n) CS ## n
#define CS0 SPH_C32(0x243F6A88)
#define CS1 SPH_C32(0x85A308D3)
#define CS2 SPH_C32(0x13198A2E)
#define CS3 SPH_C32(0x03707344)
#define CS4 SPH_C32(0xA4093822)
#define CS5 SPH_C32(0x299F31D0)
#define CS6 SPH_C32(0x082EFA98)
#define CS7 SPH_C32(0xEC4E6C89)
#define CS8 SPH_C32(0x452821E6)
#define CS9 SPH_C32(0x38D01377)
#define CSA SPH_C32(0xBE5466CF)
#define CSB SPH_C32(0x34E90C6C)
#define CSC SPH_C32(0xC0AC29B7)
#define CSD SPH_C32(0xC97C50DD)
#define CSE SPH_C32(0x3F84D5B5)
#define CSF SPH_C32(0xB5470917)
#if SPH_COMPACT_BLAKE_32
static const sph_u32 CS[16] = {
SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
SPH_C32(0x13198A2E), SPH_C32(0x03707344),
SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
};
#endif
#define CS0 0x243F6A88
#define CS1 0x85A308D3
#define CS2 0x13198A2E
#define CS3 0x03707344
#define CS4 0xA4093822
#define CS5 0x299F31D0
#define CS6 0x082EFA98
#define CS7 0xEC4E6C89
#define CS8 0x452821E6
#define CS9 0x38D01377
#define CSA 0xBE5466CF
#define CSB 0x34E90C6C
#define CSC 0xC0AC29B7
#define CSD 0xC97C50DD
#define CSE 0x3F84D5B5
#define CSF 0xB5470917
/////////////////////////////////////////
//
// Blake-256 1 way SIMD
// Only used for prehash, otherwise 4way is used with SSE2.
// optimize shuffles to reduce latency caused by dependencies on V1.
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
@@ -353,52 +317,9 @@ static const sph_u32 CS[16] = {
V2 = mm128_shufll_32( V2 ); \
}
/*
#define BLAKE256_ROUND( r ) \
{ \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
CSx( r, 5 ) ^ Mx( r, 4 ), \
CSx( r, 3 ) ^ Mx( r, 2 ), \
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
CSx( r, 4 ) ^ Mx( r, 5 ), \
CSx( r, 2 ) ^ Mx( r, 3 ), \
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V3 = mm128_shufll_32( V3 ); \
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shuflr_32( V1 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
CSx( r, D ) ^ Mx( r, C ), \
CSx( r, B ) ^ Mx( r, A ), \
CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
_mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
CSx( r, C ) ^ Mx( r, D ), \
CSx( r, A ) ^ Mx( r, B ), \
CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
V2 = _mm_add_epi32( V2, V3 ); \
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
V3 = mm128_shuflr_32( V3 ); \
V2 = mm128_swap_64( V2 ); \
V1 = mm128_shufll_32( V1 ); \
}
*/
// Default is 14 rounds, blakecoin & vanilla are 8.
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
const uint32_t T0, const uint32_t T1 )
const uint32_t T0, const uint32_t T1, int rounds )
{
__m128i V0, V1, V2, V3;
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
@@ -431,12 +352,15 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
BLAKE256_ROUND( 5 );
BLAKE256_ROUND( 6 );
BLAKE256_ROUND( 7 );
BLAKE256_ROUND( 8 );
BLAKE256_ROUND( 9 );
BLAKE256_ROUND( 0 );
BLAKE256_ROUND( 1 );
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
if ( rounds > 8 ) // 14
{
BLAKE256_ROUND( 8 );
BLAKE256_ROUND( 9 );
BLAKE256_ROUND( 0 );
BLAKE256_ROUND( 1 );
BLAKE256_ROUND( 2 );
BLAKE256_ROUND( 3 );
}
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
}
@@ -459,34 +383,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
}
#if SPH_COMPACT_BLAKE_32
// Not used
#if 0
#define ROUND_S_4WAY(r) do { \
GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
GS_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \
GS_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \
GS_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \
GS_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \
GS_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \
GS_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \
GS_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
} while (0)
#endif
#else
#define ROUND_S_4WAY(r) \
{ \
GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
@@ -499,8 +395,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
}
#endif
#define DECL_STATE32_4WAY \
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
uint32_t T0, T1;
@@ -531,56 +425,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
(state)->T1 = T1; \
} while (0)
#if SPH_COMPACT_BLAKE_32
// not used
#if 0
#define COMPRESS32_4WAY( rounds ) do { \
__m128i M[16]; \
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
unsigned r; \
V0 = H0; \
V1 = H1; \
V2 = H2; \
V3 = H3; \
V4 = H4; \
V5 = H5; \
V6 = H6; \
V7 = H7; \
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
mm128_block_bswap_32( M, buf ); \
mm128_block_bswap_32( M+8, buf+8 ); \
for (r = 0; r < rounds; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S0, V0 ), V8 ), H0 ); \
H1 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S1, V1 ), V9 ), H1 ); \
H2 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S2, V2 ), VA ), H2 ); \
H3 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S3, V3 ), VB ), H3 ); \
H4 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S0, V4 ), VC ), H4 ); \
H5 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S1, V5 ), VD ), H5 ); \
H6 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S2, V6 ), VE ), H6 ); \
H7 = _mm_xor_si128( _mm_xor_si128( \
_mm_xor_si128( S3, V7 ), VF ), H7 ); \
} while (0)
#endif
#else
// current impl
#if defined(__SSSE3__)
@@ -680,8 +524,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
}
#endif
#if defined (__AVX2__)
/////////////////////////////////
@@ -968,7 +810,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
#define DECL_STATE32_8WAY \
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
sph_u32 T0, T1;
uint32_t T0, T1;
#define READ_STATE32_8WAY(state) \
do { \
@@ -1046,7 +888,7 @@ do { \
ROUND_S_8WAY(5); \
ROUND_S_8WAY(6); \
ROUND_S_8WAY(7); \
if (rounds == 14) \
if (rounds > 8) \
{ \
ROUND_S_8WAY(8); \
ROUND_S_8WAY(9); \
@@ -1111,7 +953,7 @@ do { \
ROUND_S_8WAY(5); \
ROUND_S_8WAY(6); \
ROUND_S_8WAY(7); \
if (rounds == 14) \
if (rounds > 8) \
{ \
ROUND_S_8WAY(8); \
ROUND_S_8WAY(9); \
@@ -1156,7 +998,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
// M[ 5:12, 14 ] are always zero and not needed or used.
// M[ 4], M[ 13], M[15] are constant and are initialized here.
// M[ 4], M[13], M[15] are constant and are initialized here.
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
M[ 4] = _mm256_set1_epi32( 0x80000000 );
@@ -1221,7 +1063,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
}
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data )
const void *midhash, const void *data, const int rounds )
{
__m256i *H = (__m256i*)final_hash;
const __m256i *h = (const __m256i*)midhash;
@@ -1315,12 +1157,15 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_8WAY_5;
ROUND256_8WAY_6;
ROUND256_8WAY_7;
ROUND256_8WAY_8;
ROUND256_8WAY_9;
ROUND256_8WAY_0;
ROUND256_8WAY_1;
ROUND256_8WAY_2;
ROUND256_8WAY_3;
if ( rounds > 8 )
{
ROUND256_8WAY_8;
ROUND256_8WAY_9;
ROUND256_8WAY_0;
ROUND256_8WAY_1;
ROUND256_8WAY_2;
ROUND256_8WAY_3;
}
const __m256i shuf_bswap32 =
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -1623,7 +1468,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
#define DECL_STATE32_16WAY \
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
sph_u32 T0, T1;
uint32_t T0, T1;
#define READ_STATE32_16WAY(state) \
do { \
@@ -1882,8 +1727,9 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
}
// Dfault is 14 rounds, blakecoin & vanilla are 8.
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
const void *midhash, const void *data )
const void *midhash, const void *data, const int rounds )
{
__m512i *H = (__m512i*)final_hash;
const __m512i *h = (const __m512i*)midhash;
@@ -1988,12 +1834,15 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
ROUND256_16WAY_5;
ROUND256_16WAY_6;
ROUND256_16WAY_7;
ROUND256_16WAY_8;
ROUND256_16WAY_9;
ROUND256_16WAY_0;
ROUND256_16WAY_1;
ROUND256_16WAY_2;
ROUND256_16WAY_3;
if ( rounds > 8 )
{
ROUND256_16WAY_8;
ROUND256_16WAY_9;
ROUND256_16WAY_0;
ROUND256_16WAY_1;
ROUND256_16WAY_2;
ROUND256_16WAY_3;
}
// Byte swap final hash
const __m512i shuf_bswap32 =
@@ -2057,7 +1906,7 @@ blake32_4way( blake_4way_small_context *ctx, const void *data,
size_t clen = ( sizeof ctx->buf ) - bptr;
if ( clen > blen )
clen = blen;
clen = blen;
memcpy( buf + vptr, data, clen );
bptr += clen;
data = (const unsigned char *)data + clen;
@@ -2130,11 +1979,11 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
// Blake-256 8 way
static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static void
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
@@ -2181,8 +2030,8 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
if ( ( T0 = T0 + 512 ) < 512 )
T1 = T1 + 1;
COMPRESS32_8WAY( sc->rounds );
ptr = 0;
}
@@ -2198,7 +2047,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
__m256i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
uint32_t th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -2208,13 +2057,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
sc->T0 = 0xFFFFFE00UL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 512 - bit_len;
@@ -2233,8 +2082,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = _mm256_set1_epi64x( 0x0100000001000000ULL );
@@ -2277,8 +2126,8 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
T1 = SPH_T32(T1 + 1);
if ( ( T0 = T0 + 512 ) < 512 )
T1 = T1 + 1;
COMPRESS32_8WAY_LE( sc->rounds );
ptr = 0;
}
@@ -2294,7 +2143,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
__m256i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
uint32_t th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -2304,13 +2153,13 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
if ( ptr == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
sc->T1 = SPH_T32(sc->T1 - 1);
sc->T0 = 0xFFFFFE00UL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
sc->T0 -= 512 - bit_len;
@@ -2328,8 +2177,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
{
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00UL);
sc->T1 = SPH_C32(0xFFFFFFFFUL);
sc->T0 = 0xFFFFFE00UL;
sc->T1 = 0xFFFFFFFFUL;
memset_zero_256( buf, 56>>2 );
if ( out_size_w32 == 8 )
buf[52>>2] = m256_one_32;
@@ -2348,8 +2197,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
//Blake-256 16 way AVX512
static void
blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
const sph_u32 *salt, int rounds )
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
const uint32_t *salt, int rounds )
{
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E6676A09E667 );
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
@@ -2411,7 +2260,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
__m512i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
uint32_t th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -2508,7 +2357,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
__m512i buf[16];
size_t ptr;
unsigned bit_len;
sph_u32 th, tl;
uint32_t th, tl;
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
@@ -2618,8 +2467,6 @@ blake256r8_16way_close(void *cc, void *dst)
#endif // AVX512
// Blake-256 4 way
// default 14 rounds, backward copatibility
@@ -2754,9 +2601,3 @@ blake256r8_8way_close(void *cc, void *dst)
}
#endif
#ifdef __cplusplus
}
#endif
//#endif

View File

@@ -1,62 +1,22 @@
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
/*
* BLAKE implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined (__AVX2__)
#include <stddef.h>
#include <string.h>
#include <limits.h>
#include "blake-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
// Blake-512 common
/*
static const sph_u64 IV512[8] = {
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
static const uint64_t IV512[8] =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
static const uint64_t salt_zero_big[4] = { 0, 0, 0, 0 };
static const unsigned sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -77,15 +37,15 @@ static const unsigned sigma[16][16] = {
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
static const sph_u64 CB[16] = {
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
static const uint64_t CB[16] = {
0x243F6A8885A308D3, 0x13198A2E03707344,
0xA4093822299F31D0, 0x082EFA98EC4E6C89,
0x452821E638D01377, 0xBE5466CF34E90C6C,
0xC0AC29B7C97C50DD, 0x3F84D5B5B5470917,
0x9216D5D98979FB1B, 0xD1310BA698DFB5AC,
0x2FFD72DBD01ADFB7, 0xB8E1AFED6A267E96,
0xBA7C9045F12C7F99, 0x24A19947B3916CF7,
0x0801F2E2858EFC16, 0x636920D871574E69
*/
@@ -1486,7 +1446,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
if ( ptr == buf_size )
{
if ( (T0 = T0 + 1024 ) < 1024 )
T1 = SPH_T64(T1 + 1);
T1 = T1 + 1;
COMPRESS64_4WAY;
ptr = 0;
}
@@ -1538,8 +1498,8 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
memset_zero_256( buf, 112>>3 );
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
@@ -1629,8 +1589,4 @@ blake512_4way_close(void *cc, void *dst)
blake64_4way_close( cc, dst );
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -4,7 +4,149 @@
#include <stdint.h>
#include <memory.h>
#if defined (BLAKECOIN_4WAY)
#define rounds 8
#if defined (BLAKECOIN_16WAY)
int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*16] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
__m512i block0_hash[8] __attribute__ ((aligned (64)));
__m512i block_buf[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( ((__m512i*)hash32)[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
uint32_t phash[8] __attribute__ ((aligned (64))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t n = pdata[19];
const uint32_t first_nonce = (const uint32_t) n;
const uint32_t last_nonce = max_nonce - 16;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i sixteen = _mm512_set1_epi32( 16 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, rounds );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
block0_hash[2] = _mm512_set1_epi32( phash[2] );
block0_hash[3] = _mm512_set1_epi32( phash[3] );
block0_hash[4] = _mm512_set1_epi32( phash[4] );
block0_hash[5] = _mm512_set1_epi32( phash[5] );
block0_hash[6] = _mm512_set1_epi32( phash[6] );
block0_hash[7] = _mm512_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm512_set1_epi32( pdata[16] );
block_buf[1] = _mm512_set1_epi32( pdata[17] );
block_buf[2] = _mm512_set1_epi32( pdata[18] );
block_buf[3] =
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
block_buf[3] = _mm512_add_epi32( block_buf[3], sixteen );
n += 16;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (BLAKECOIN_8WAY)
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (32)));
__m256i block0_hash[8] __attribute__ ((aligned (32)));
__m256i block_buf[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t n = pdata[19];
const uint32_t first_nonce = (const uint32_t) n;
const uint32_t last_nonce = max_nonce - 8;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, rounds );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm256_set1_epi32( pdata[16] );
block_buf[1] = _mm256_set1_epi32( pdata[17] );
block_buf[2] = _mm256_set1_epi32( pdata[18] );
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partialy prehash second block without touching nonces in block_buf[3].
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf, rounds );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
n += 8;
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined (BLAKECOIN_4WAY)
blake256r8_4way_context blakecoin_4w_ctx;
@@ -61,7 +203,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
#endif
#if defined(BLAKECOIN_8WAY)
#if 0
//#if defined(BLAKECOIN_8WAY)
blake256r8_8way_context blakecoin_8w_ctx;
@@ -78,11 +221,84 @@ void blakecoin_8way_hash( void *state, const void *input )
state+160, state+192, state+224, vhash, 256 );
}
/*
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
__m256i block0_hash[8] __attribute__ ((aligned (64)));
__m256i block_buf[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t phash[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
uint32_t *ptarget = (uint32_t*)work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0, 8 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );
block0_hash[2] = _mm256_set1_epi32( phash[2] );
block0_hash[3] = _mm256_set1_epi32( phash[3] );
block0_hash[4] = _mm256_set1_epi32( phash[4] );
block0_hash[5] = _mm256_set1_epi32( phash[5] );
block0_hash[6] = _mm256_set1_epi32( phash[6] );
block0_hash[7] = _mm256_set1_epi32( phash[7] );
// Build vectored second block, interleave last 16 bytes of data using
// unique nonces.
block_buf[0] = _mm256_set1_epi32( pdata[16] );
block_buf[1] = _mm256_set1_epi32( pdata[17] );
block_buf[2] = _mm256_set1_epi32( pdata[18] );
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
// Partialy prehash second block without touching nonces
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
do {
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
block_buf );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
n += 8;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
*/
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
uint32_t hash[8*8] __attribute__ ((aligned (32)));
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
blake256r8_8way_context ctx __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -101,15 +317,22 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
n+3, n+2, n+1, n ) );
pdata[19] = n;
blakecoin_8way_hash( hash, vdata );
for ( int i = 0; i < 8; i++ )
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
&& !opt_benchmark )
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
blake256r8_8way_update( &ctx, (const void*)vdata + (64<<3), 16 );
blake256r8_8way_close( &ctx, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= HTarget )
{
pdata[19] = n+i;
submit_solution( work, hash+(i<<3), mythr );
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
n += 8;
} while ( (n < max_nonce) && !work_restart[thr_id].restart );

View File

@@ -4,10 +4,10 @@
// vanilla uses default gen merkle root, otherwise identical to blakecoin
bool register_vanilla_algo( algo_gate_t* gate )
{
#if defined(BLAKECOIN_8WAY)
#if defined(BLAKECOIN_16WAY)
gate->scanhash = (void*)&scanhash_blakecoin_16way;
#elif defined(BLAKECOIN_8WAY)
gate->scanhash = (void*)&scanhash_blakecoin_8way;
gate->hash = (void*)&blakecoin_8way_hash;
#elif defined(BLAKECOIN_4WAY)
gate->scanhash = (void*)&scanhash_blakecoin_4way;
gate->hash = (void*)&blakecoin_4way_hash;
@@ -15,14 +15,14 @@ bool register_vanilla_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_blakecoin;
gate->hash = (void*)&blakecoinhash;
#endif
gate->optimizations = SSE42_OPT | AVX2_OPT;
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
}
bool register_blakecoin_algo( algo_gate_t* gate )
{
register_vanilla_algo( gate );
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
return true;
}

View File

@@ -1,30 +1,36 @@
#ifndef __BLAKECOIN_GATE_H__
#define __BLAKECOIN_GATE_H__ 1
#ifndef BLAKECOIN_GATE_H__
#define BLAKECOIN_GATE_H__ 1
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__SSE4_2__)
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define BLAKECOIN_16WAY
#elif defined(__AVX2__)
#define BLAKECOIN_8WAY
#elif defined(__SSE2__) // always true
#define BLAKECOIN_4WAY
#endif
#if defined(__AVX2__)
#define BLAKECOIN_8WAY
#endif
#if defined (BLAKECOIN_8WAY)
void blakecoin_8way_hash(void *state, const void *input);
#if defined (BLAKECOIN_16WAY)
int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#elif defined (BLAKECOIN_8WAY)
//void blakecoin_8way_hash(void *state, const void *input);
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined (BLAKECOIN_4WAY)
#elif defined (BLAKECOIN_4WAY)
void blakecoin_4way_hash(void *state, const void *input);
int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#else // never used
void blakecoinhash( void *state, const void *input );
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -1,6 +1,6 @@
#include "blakecoin-gate.h"
#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
#if !defined(BLAKECOIN_16WAY) && !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
#define BLAKE32_ROUNDS 8
#include "sph_blake.h"
@@ -12,7 +12,6 @@ void blakecoin_close(void *cc, void *dst);
#include <string.h>
#include <stdint.h>
#include <memory.h>
#include <openssl/sha.h>
// context management is staged for efficiency.
// 1. global initial ctx cached on startup
@@ -35,8 +34,8 @@ void blakecoinhash( void *state, const void *input )
uint8_t hash[64] __attribute__ ((aligned (32)));
uint8_t *ending = (uint8_t*) input + 64;
// copy cached midstate
memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
// copy cached midstate
memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
blakecoin( &ctx, ending, 16 );
blakecoin_close( &ctx, hash );
memcpy( state, hash, 32 );
@@ -45,8 +44,8 @@ void blakecoinhash( void *state, const void *input )
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t HTarget = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
@@ -60,10 +59,10 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
HTarget = 0x7f;
// we need big endian data...
for (int kk=0; kk < 19; kk++)
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
for (int kk=0; kk < 19; kk++)
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
blake_midstate_init( endiandata );
blake_midstate_init( endiandata );
#ifdef DEBUG_ALGO
applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);

View File

@@ -48,7 +48,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
uint32_t hash15[8] __attribute__ ((aligned (32)));
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -217,7 +217,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
// Prehash first block.
blake256_transform_le( phash, pdata, 512, 0 );
blake256_transform_le( phash, pdata, 512, 0, 14 );
// Interleave hash for second block prehash.
block0_hash[0] = _mm512_set1_epi32( phash[0] );
@@ -286,7 +286,7 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
uint64_t *hash7 = (uint64_t*)hash+28;
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
vhashA, 256 );
@@ -401,7 +401,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
blake256_transform_le( phash, pdata, 512, 0, 14 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );

View File

@@ -35,7 +35,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
uint32_t hash14[8] __attribute__ ((aligned (32)));
uint32_t hash15[8] __attribute__ ((aligned (32)));
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -108,7 +108,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
blake256_transform_le( phash, pdata, 512, 0, 14 );
block0_hash[0] = _mm512_set1_epi32( phash[0] );
block0_hash[1] = _mm512_set1_epi32( phash[1] );
@@ -170,7 +170,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
uint32_t hash7[8] __attribute__ ((aligned (32)));
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
dintrlv_8x32( hash0, hash1, hash2, hash3,
hash4, hash5, hash6, hash7, vhash, 256 );
@@ -216,7 +216,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
const __m256i eight = _mm256_set1_epi32( 8 );
// Prehash first block
blake256_transform_le( phash, pdata, 512, 0 );
blake256_transform_le( phash, pdata, 512, 0, 14 );
block0_hash[0] = _mm256_set1_epi32( phash[0] );
block0_hash[1] = _mm256_set1_epi32( phash[1] );

View File

@@ -67,7 +67,7 @@ void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
const __m128i *state_in );
const __m128i *state_in, const uint32_t *target );
#endif // SSE2
@@ -95,7 +95,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in );
const __m256i *state_in, const uint32_t *target );
#endif // AVX2
@@ -123,7 +123,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in );
const __m512i *state_in, const uint32_t *target );
#endif // AVX512

View File

@@ -658,43 +658,14 @@ int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce,
return 0;
}
/*
int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(64) data[20];
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
memcpy( data, pdata, 80 );
do {
data[19] = ++n;
sha256d( (unsigned char*)hash, (const unsigned char*)data, 80 );
if ( unlikely( swab32( hash[7] ) <= Htarg ) )
{
pdata[19] = n;
sha256d_80_swap(hash, pdata);
if ( fulltest( hash, ptarget ) && !opt_benchmark )
submit_solution( work, hash, mythr );
}
} while ( likely( n < max_nonce && !work_restart[thr_id].restart ) );
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
*/
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
//#elif defined(SHA256D_8WAY)
// gate->scanhash = (void*)&scanhash_sha256d_8way;
#else

File diff suppressed because it is too large Load Diff

View File

@@ -50,65 +50,6 @@ void sha256_update( sha256_context *ctx, const void *data, size_t len )
memcpy( ctx->buf, src, len );
}
#if 0
void sha256_final( sha256_context *ctx, uint32_t *hash )
{
size_t r;
/* Figure out how many bytes we have buffered. */
r = ctx->count & 0x3f;
// r = ( ctx->count >> 3 ) & 0x3f;
//printf("final: count= %d, r= %d\n", ctx->count, r );
/* Pad to 56 mod 64, transforming if we finish a block en route. */
if ( r < 56 )
{
/* Pad to 56 mod 64. */
memcpy( &ctx->buf[r], SHA256_PAD, 56 - r );
}
else
{
/* Finish the current block and mix. */
memcpy( &ctx->buf[r], SHA256_PAD, 64 - r );
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
/* The start of the final block is all zeroes. */
memset( &ctx->buf[0], 0, 56 );
}
/* Add the terminating bit-count. */
ctx->buf[56] = bswap_64( ctx->count << 3 );
// ctx->buf[56] = bswap_64( ctx->count );
// be64enc( &ctx->buf[56], ctx->count );
/* Mix in the final block. */
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] );
// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i );
/*
// be32enc_vect(digest, ctx->state, 4);
// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
// Encode vector, two words at a time.
do {
be32enc(&dst[0], src[0]);
be32enc(&dst[4], src[1]);
src += 2;
dst += 8;
} while (--len);
*/
}
#endif
void sha256_final( sha256_context *ctx, void *hash )
{
int ptr = ctx->count & 0x3f;

View File

@@ -3,10 +3,194 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if defined(SHA256D_SHA)
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 bytes of data
sha256_opt_transform_le( mstate, pdata, sha256_iv );
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 80*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
mstate, mstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 32*8; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
sha256_iv, sha256_iv );
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
{
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
}
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
{
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#endif
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
// initialize padding for 2nd sha256
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
{
for ( int lane = 0; lane < 16; lane++ )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, phash, mythr );
}
}
}
buf[3] = _mm512_add_epi32( buf[3], sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
/*
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
@@ -67,20 +251,18 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
mexp_pre );
// 2. 32 byte hash from 1.
if ( sha256_16way_transform_le_short( hash32, block, initstate ) )
{
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
sha256_16way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
for ( int lane = 0; lane < 16; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
@@ -90,6 +272,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
*hashes_done = n - first_nonce;
return 0;
}
*/
#endif
@@ -104,7 +287,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -154,21 +337,18 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
mexp_pre );
// 2. 32 byte hash from 1.
if ( unlikely(
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
{
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
sha256_8way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
@@ -191,8 +371,6 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate1[8] __attribute__ ((aligned (32)));
__m128i midstate2[8] __attribute__ ((aligned (32)));
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -232,31 +410,25 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_4way_transform_le( block, vdata+16, initstate );
// 2. 32 byte hash from 1.
if ( unlikely(
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
{
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
sha256_4way_transform_le( hash32, block, initstate );
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
*noncev = _mm_add_epi32( *noncev, four );
@@ -268,21 +440,3 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
}
#endif
/*
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};
*/

View File

@@ -6,6 +6,8 @@
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
@@ -32,15 +34,12 @@ int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_SHA)
/*
#if defined(__SHA__)
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
*/
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#endif

View File

@@ -3,99 +3,201 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16WAY 1
#elif defined(__SHA__)
#define SHA256DT_SHA 1
#elif defined(__AVX2__)
#define SHA256DT_8WAY 1
#else
#define SHA256DT_4WAY 1
#endif
static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
{
0xdfa9bf2c, 0xb72074d4, 0x6bb01122, 0xd338e869,
0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};
#if defined(SHA256DT_16WAY)
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
// const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm512_set1_epi32( pdata[i] );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256dt_iv );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = _mm512_set1_epi32( 0x480 );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt funky bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd sha256
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
// initialize padding for 2nd sha256
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 0x300 );
initstate[0] = _mm512_set1_epi64( 0xdfa9bf2cdfa9bf2c );
initstate[1] = _mm512_set1_epi64( 0xb72074d4b72074d4 );
initstate[2] = _mm512_set1_epi64( 0x6bb011226bb01122 );
initstate[3] = _mm512_set1_epi64( 0xd338e869d338e869 );
initstate[4] = _mm512_set1_epi64( 0xaa3ff126aa3ff126 );
initstate[5] = _mm512_set1_epi64( 0x475bbf30475bbf30 );
initstate[6] = _mm512_set1_epi64( 0x8fd52e5b8fd52e5b );
initstate[7] = _mm512_set1_epi64( 0x9f75c9ad9f75c9ad );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
do
{
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_16way_transform_le( hash32, block, initstate );
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
// finish second block with nonces
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
for ( int lane = 0; lane < 16; lane++ )
// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, phash, mythr );
}
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
buf[3] = _mm512_add_epi32( buf[3], sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA256DT_SHA)
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block0[16] __attribute__ ((aligned (64)));
uint32_t block1[16] __attribute__ ((aligned (64)));
uint32_t hash0[8] __attribute__ ((aligned (32)));
uint32_t hash1[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i shuf_bswap32 =
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
#endif
// hash first 64 bytes of data
sha256_opt_transform_le( mstate, pdata, sha256dt_iv );
#if defined(SHA256DT_8WAY)
do
{
// 1. final 16 bytes of data, with padding
memcpy( block0, pdata + 16, 16 );
memcpy( block1, pdata + 16, 16 );
block0[ 3] = n;
block1[ 3] = n+1;
block0[ 4] = block1[ 4] = 0x80000000;
memset( block0 + 5, 0, 40 );
memset( block1 + 5, 0, 40 );
block0[15] = block1[15] = 0x480; // funky bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
mstate, mstate );
// 2. 32 byte hash from 1.
memcpy( block0, hash0, 32 );
memcpy( block1, hash1, 32 );
block0[ 8] = block1[ 8] = 0x80000000;
memset( block0 + 9, 0, 24 );
memset( block1 + 9, 0, 24 );
block0[15] = block1[15] = 0x300; // bit count
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
sha256dt_iv, sha256dt_iv );
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
{
casti_m128i( hash0, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
casti_m128i( hash0, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hash0, mythr );
}
}
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
{
casti_m128i( hash1, 0 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
casti_m128i( hash1, 1 ) =
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
{
pdata[19] = n+1;
submit_solution( work, hash1, mythr );
}
}
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#elif defined(SHA256DT_8WAY)
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -103,15 +205,13 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
__m256i istate[8] __attribute__ ((aligned (32)));
__m256i mstate1[8] __attribute__ ((aligned (32)));
__m256i mstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
@@ -120,6 +220,8 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
@@ -135,35 +237,38 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
block[15] = _mm256_set1_epi32( 0x300 );
// initialize state
initstate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
initstate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
initstate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
initstate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
initstate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
initstate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
initstate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
initstate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
sha256_8way_transform_le( midstate1, vdata, initstate );
sha256_8way_transform_le( mstate1, vdata, istate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );
sha256_8way_transform_le( hash32, block, initstate );
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
if ( unlikely( sha256_8way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
for ( int lane = 0; lane < 8; lane++ )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm256_add_epi32( *noncev, eight );
@@ -174,10 +279,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
return 0;
}
#endif
#if defined(SHA256DT_4WAY)
#elif defined(SHA256DT_4WAY)
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
@@ -230,21 +332,25 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
do
{
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
sha256_4way_transform_le( hash32, block, initstate );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
// if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
// {
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
// }
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
@@ -257,11 +363,14 @@ bool register_sha256dt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256DT_16WAY)
gate->scanhash = (void*)&scanhash_sha256dt_16way;
gate->scanhash = (void*)&scanhash_sha256dt_16way;
#elif defined(SHA256DT_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_sha;
#elif defined(SHA256DT_8WAY)
gate->scanhash = (void*)&scanhash_sha256dt_8way;
gate->scanhash = (void*)&scanhash_sha256dt_8way;
#else
gate->scanhash = (void*)&scanhash_sha256dt_4way;
gate->scanhash = (void*)&scanhash_sha256dt_4way;
#endif
return true;
}

View File

@@ -3,6 +3,7 @@
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha-hash-4way.h"
#if defined(SHA256T_16WAY)
@@ -10,83 +11,96 @@
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i vdata[32] __attribute__ ((aligned (128)));
__m512i hash32[8] __attribute__ ((aligned (128)));
__m512i block[16] __attribute__ ((aligned (64)));
__m512i hash32[8] __attribute__ ((aligned (64)));
__m512i initstate[8] __attribute__ ((aligned (64)));
__m512i midstate1[8] __attribute__ ((aligned (64)));
__m512i midstate2[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
__m512i buf[16] __attribute__ ((aligned (64)));
__m512i mstate1[8] __attribute__ ((aligned (64)));
__m512i mstate2[8] __attribute__ ((aligned (64)));
__m512i istate[8] __attribute__ ((aligned (64)));
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
uint32_t phash[8] __attribute__ ((aligned (32)));
static const uint32_t IV[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
uint32_t *ptarget = work->target;
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
uint32_t n = first_nonce;
__m512i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm512_set1_epi32( pdata[i] );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, IV );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
vdata[16+4] = last_byte;
memset_zero_512( vdata+16 + 5, 10 );
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = _mm512_set1_epi32( IV[0] );
istate[1] = _mm512_set1_epi32( IV[1] );
istate[2] = _mm512_set1_epi32( IV[2] );
istate[3] = _mm512_set1_epi32( IV[3] );
istate[4] = _mm512_set1_epi32( IV[4] );
istate[5] = _mm512_set1_epi32( IV[5] );
istate[6] = _mm512_set1_epi32( IV[6] );
istate[7] = _mm512_set1_epi32( IV[7] );
// initialize padding for 2nd & 3rd sha256
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sha256_16way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
do
{
// 1. final 16 bytes of data, pre-padded
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
// 2. 32 byte hash from 1.
sha256_16way_transform_le( block, block, initstate );
sha256_16way_transform_le( block, block, istate );
// 3. 32 byte hash from 2.
if ( unlikely(
sha256_16way_transform_le_short( hash32, block, initstate ) ) )
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
{
// byte swap final hash for testing
mm512_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 16; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
submit_solution( work, phash, mythr );
}
}
}
*noncev = _mm512_add_epi32( *noncev, sixteen );
buf[3] = _mm512_add_epi32( buf[3], sixteen );
n += 16;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
@@ -94,26 +108,23 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
return 0;
}
#endif
#if defined(SHA256T_8WAY)
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i vdata[32] __attribute__ ((aligned (64)));
__m256i block[16] __attribute__ ((aligned (32)));
__m256i hash32[8] __attribute__ ((aligned (32)));
__m256i initstate[8] __attribute__ ((aligned (32)));
__m256i midstate1[8] __attribute__ ((aligned (32)));
__m256i midstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
__m256i istate[8] __attribute__ ((aligned (32)));
__m256i mstate1[8] __attribute__ ((aligned (32)));
__m256i mstate2[8] __attribute__ ((aligned (32)));
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 8;
uint32_t n = first_nonce;
@@ -122,6 +133,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
@@ -135,42 +148,40 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sha256_8way_transform_le( midstate1, vdata, initstate );
// initialize state
istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sha256_8way_transform_le( mstate1, vdata, istate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_8way_transform_le( block, block, initstate );
sha256_8way_transform_le( block, block, istate );
// 3. 32 byte hash from 2.
if ( unlikely(
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
if ( unlikely( sha256_8way_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
// byte swap final hash for testing
mm256_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 8; lane++ )
if ( hash32_d7[ lane ] <= targ32_d7 )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -188,109 +199,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
#endif
#if defined(SHA256T_4WAY)
// Optimizations are slower with AVX/SSE2
// https://github.com/JayDDee/cpuminer-opt/issues/344
/*
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate1[8] __attribute__ ((aligned (32)));
__m128i midstate2[8] __attribute__ ((aligned (32)));
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate1, vdata, initstate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_4way_transform_le( block, block, initstate );
// 3. 32 byte hash from 2.
if ( unlikely(
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
{
// byte swap final hash for testing
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
*/
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m128i vdata[32] __attribute__ ((aligned (64)));
__m128i block[16] __attribute__ ((aligned (32)));
__m128i hash32[8] __attribute__ ((aligned (32)));
__m128i initstate[8] __attribute__ ((aligned (32)));
__m128i midstate[8] __attribute__ ((aligned (32)));
__m128i istate[8] __attribute__ ((aligned (32)));
__m128i mstate[8] __attribute__ ((aligned (32)));
// __m128i mstate2[8] __attribute__ ((aligned (32)));
// __m128i mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -319,35 +239,44 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
block[15] = _mm_set1_epi32( 32*8 ); // bit count
// initialize state
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
sha256_4way_transform_le( mstate, vdata, istate );
// sha256_4way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( block, block, initstate );
sha256_4way_transform_le( hash32, block, initstate );
mm128_block_bswap_32( hash32, hash32 );
// sha256_4way_final_rounds( block, vdata+16, mstate1, mstate2,
// mexp_pre );
sha256_4way_transform_le( block, vdata+16, mstate );
sha256_4way_transform_le( block, block, istate );
sha256_4way_transform_le( hash32, block, istate );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
// if ( unlikely( sha256_4way_transform_le_short(
// hash32, block, initstate, ptarget ) ))
// {
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
}
}
// }
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -356,6 +285,5 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
return 0;
}
#endif

View File

@@ -23,7 +23,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 1;
const uint32_t last_nonce = max_nonce - 2;
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;

View File

@@ -39,9 +39,9 @@
#define SPH_SMALL_FOOTPRINT_SHA2 1
#endif
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
#define CH(X, Y, Z) ( ( ( (Y) ^ (Z) ) & (X)) ^ (Z) )
//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
#define MAJ( X, Y, Z ) ( (Y) ^ ( ( (X_xor_Y) = (X) ^ (Y) ) & (Y_xor_Z) ) )
#define ROTR SPH_ROTR32
#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))