mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.23.1
This commit is contained in:
@@ -1,60 +1,15 @@
|
||||
/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
|
||||
/**
|
||||
* BLAKE interface. BLAKE is a family of functions which differ by their
|
||||
* output size; this implementation defines BLAKE for output sizes 224,
|
||||
* 256, 384 and 512 bits. This implementation conforms to the "third
|
||||
* round" specification.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_blake.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef __BLAKE_HASH_4WAY__
|
||||
#define __BLAKE_HASH_4WAY__ 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
#ifndef BLAKE_HASH_4WAY__
|
||||
#define BLAKE_HASH_4WAY__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_blake256 256
|
||||
|
||||
#define SPH_SIZE_blake512 512
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SSE2
|
||||
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1 );
|
||||
const uint32_t T0, const uint32_t T1, int rounds );
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
@@ -75,13 +30,13 @@ typedef struct {
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
// Default, 14 rounds, blake, decred
|
||||
// Default, 14 rounds
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
void blake256_4way_init(void *ctx);
|
||||
void blake256_4way_update(void *ctx, const void *data, size_t len);
|
||||
void blake256_4way_close(void *ctx, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
// 14 rounds
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
void blake256r14_4way_init(void *cc);
|
||||
void blake256r14_4way_update(void *cc, const void *data, size_t len);
|
||||
@@ -103,7 +58,7 @@ typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i H[8];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_8way_small_context;
|
||||
|
||||
@@ -117,7 +72,7 @@ void blake256_8way_close_le(void *cc, void *dst);
|
||||
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
const void *midhash, const void *data, const int rounds );
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_8way_small_context blake256r14_8way_context;
|
||||
@@ -138,7 +93,7 @@ typedef struct {
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
uint64_t T0, T1;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
@@ -180,7 +135,7 @@ void blake256_16way_close_le(void *cc, void *dst);
|
||||
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
const void *midhash, const void *data, const int rounds );
|
||||
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
@@ -204,7 +159,7 @@ typedef struct {
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
uint64_t T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
@@ -224,8 +179,4 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // BLAKE_HASH_4WAY_H__
|
||||
|
@@ -40,26 +40,6 @@
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
|
||||
#define SPH_SMALL_FOOTPRINT_BLAKE 1
|
||||
#endif
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_BLAKE
|
||||
#define SPH_COMPACT_BLAKE_32 1
|
||||
#endif
|
||||
|
||||
#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
|
||||
#define SPH_COMPACT_BLAKE_64 1
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
// Blake-256
|
||||
|
||||
static const uint32_t IV256[8] =
|
||||
@@ -68,7 +48,7 @@ static const uint32_t IV256[8] =
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
|
||||
#if 0
|
||||
|
||||
// Blake-256 4 & 8 way, Blake-512 4 way
|
||||
|
||||
@@ -273,44 +253,28 @@ static const unsigned sigma[16][16] = {
|
||||
#define CSx_(n) CSx__(n)
|
||||
#define CSx__(n) CS ## n
|
||||
|
||||
#define CS0 SPH_C32(0x243F6A88)
|
||||
#define CS1 SPH_C32(0x85A308D3)
|
||||
#define CS2 SPH_C32(0x13198A2E)
|
||||
#define CS3 SPH_C32(0x03707344)
|
||||
#define CS4 SPH_C32(0xA4093822)
|
||||
#define CS5 SPH_C32(0x299F31D0)
|
||||
#define CS6 SPH_C32(0x082EFA98)
|
||||
#define CS7 SPH_C32(0xEC4E6C89)
|
||||
#define CS8 SPH_C32(0x452821E6)
|
||||
#define CS9 SPH_C32(0x38D01377)
|
||||
#define CSA SPH_C32(0xBE5466CF)
|
||||
#define CSB SPH_C32(0x34E90C6C)
|
||||
#define CSC SPH_C32(0xC0AC29B7)
|
||||
#define CSD SPH_C32(0xC97C50DD)
|
||||
#define CSE SPH_C32(0x3F84D5B5)
|
||||
#define CSF SPH_C32(0xB5470917)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
|
||||
static const sph_u32 CS[16] = {
|
||||
SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
|
||||
SPH_C32(0x13198A2E), SPH_C32(0x03707344),
|
||||
SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
|
||||
SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
|
||||
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
|
||||
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
|
||||
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
|
||||
SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
|
||||
};
|
||||
|
||||
#endif
|
||||
#define CS0 0x243F6A88
|
||||
#define CS1 0x85A308D3
|
||||
#define CS2 0x13198A2E
|
||||
#define CS3 0x03707344
|
||||
#define CS4 0xA4093822
|
||||
#define CS5 0x299F31D0
|
||||
#define CS6 0x082EFA98
|
||||
#define CS7 0xEC4E6C89
|
||||
#define CS8 0x452821E6
|
||||
#define CS9 0x38D01377
|
||||
#define CSA 0xBE5466CF
|
||||
#define CSB 0x34E90C6C
|
||||
#define CSC 0xC0AC29B7
|
||||
#define CSD 0xC97C50DD
|
||||
#define CSE 0x3F84D5B5
|
||||
#define CSF 0xB5470917
|
||||
|
||||
/////////////////////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SIMD
|
||||
// Only used for prehash, otherwise 4way is used with SSE2.
|
||||
|
||||
// optimize shuffles to reduce latency caused by dependencies on V1.
|
||||
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
@@ -353,52 +317,9 @@ static const sph_u32 CS[16] = {
|
||||
V2 = mm128_shufll_32( V2 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
CSx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CSx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
CSx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CSx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V3 = mm128_shufll_32( V3 ); \
|
||||
V2 = mm128_swap_64( V2 ); \
|
||||
V1 = mm128_shuflr_32( V1 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
|
||||
CSx( r, D ) ^ Mx( r, C ), \
|
||||
CSx( r, B ) ^ Mx( r, A ), \
|
||||
CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
|
||||
CSx( r, C ) ^ Mx( r, D ), \
|
||||
CSx( r, A ) ^ Mx( r, B ), \
|
||||
CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V3 = mm128_shuflr_32( V3 ); \
|
||||
V2 = mm128_swap_64( V2 ); \
|
||||
V1 = mm128_shufll_32( V1 ); \
|
||||
}
|
||||
*/
|
||||
|
||||
// Default is 14 rounds, blakecoin & vanilla are 8.
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1 )
|
||||
const uint32_t T0, const uint32_t T1, int rounds )
|
||||
{
|
||||
__m128i V0, V1, V2, V3;
|
||||
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
@@ -431,12 +352,15 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
BLAKE256_ROUND( 5 );
|
||||
BLAKE256_ROUND( 6 );
|
||||
BLAKE256_ROUND( 7 );
|
||||
BLAKE256_ROUND( 8 );
|
||||
BLAKE256_ROUND( 9 );
|
||||
BLAKE256_ROUND( 0 );
|
||||
BLAKE256_ROUND( 1 );
|
||||
BLAKE256_ROUND( 2 );
|
||||
BLAKE256_ROUND( 3 );
|
||||
if ( rounds > 8 ) // 14
|
||||
{
|
||||
BLAKE256_ROUND( 8 );
|
||||
BLAKE256_ROUND( 9 );
|
||||
BLAKE256_ROUND( 0 );
|
||||
BLAKE256_ROUND( 1 );
|
||||
BLAKE256_ROUND( 2 );
|
||||
BLAKE256_ROUND( 3 );
|
||||
}
|
||||
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
|
||||
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
|
||||
}
|
||||
@@ -459,34 +383,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
}
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
|
||||
// Not used
|
||||
#if 0
|
||||
|
||||
#define ROUND_S_4WAY(r) do { \
|
||||
GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
|
||||
GS_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
|
||||
CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \
|
||||
GS_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
|
||||
CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \
|
||||
GS_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
|
||||
CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \
|
||||
GS_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
|
||||
CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \
|
||||
GS_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
|
||||
CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \
|
||||
GS_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
|
||||
CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \
|
||||
GS_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
|
||||
CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define ROUND_S_4WAY(r) \
|
||||
{ \
|
||||
GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
@@ -499,8 +395,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
uint32_t T0, T1;
|
||||
@@ -531,56 +425,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
// not used
|
||||
#if 0
|
||||
#define COMPRESS32_4WAY( rounds ) do { \
|
||||
__m128i M[16]; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
unsigned r; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
|
||||
mm128_block_bswap_32( M, buf ); \
|
||||
mm128_block_bswap_32( M+8, buf+8 ); \
|
||||
for (r = 0; r < rounds; r ++) \
|
||||
ROUND_S_4WAY(r); \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S0, V0 ), V8 ), H0 ); \
|
||||
H1 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S1, V1 ), V9 ), H1 ); \
|
||||
H2 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S2, V2 ), VA ), H2 ); \
|
||||
H3 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S3, V3 ), VB ), H3 ); \
|
||||
H4 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S0, V4 ), VC ), H4 ); \
|
||||
H5 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S1, V5 ), VD ), H5 ); \
|
||||
H6 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S2, V6 ), VE ), H6 ); \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
// current impl
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
@@ -680,8 +524,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
/////////////////////////////////
|
||||
@@ -968,7 +810,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#define DECL_STATE32_8WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_8WAY(state) \
|
||||
do { \
|
||||
@@ -1046,7 +888,7 @@ do { \
|
||||
ROUND_S_8WAY(5); \
|
||||
ROUND_S_8WAY(6); \
|
||||
ROUND_S_8WAY(7); \
|
||||
if (rounds == 14) \
|
||||
if (rounds > 8) \
|
||||
{ \
|
||||
ROUND_S_8WAY(8); \
|
||||
ROUND_S_8WAY(9); \
|
||||
@@ -1111,7 +953,7 @@ do { \
|
||||
ROUND_S_8WAY(5); \
|
||||
ROUND_S_8WAY(6); \
|
||||
ROUND_S_8WAY(7); \
|
||||
if (rounds == 14) \
|
||||
if (rounds > 8) \
|
||||
{ \
|
||||
ROUND_S_8WAY(8); \
|
||||
ROUND_S_8WAY(9); \
|
||||
@@ -1156,7 +998,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
|
||||
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
|
||||
// M[ 5:12, 14 ] are always zero and not needed or used.
|
||||
// M[ 4], M[ 13], M[15] are constant and are initialized here.
|
||||
// M[ 4], M[13], M[15] are constant and are initialized here.
|
||||
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
|
||||
|
||||
M[ 4] = _mm256_set1_epi32( 0x80000000 );
|
||||
@@ -1221,7 +1063,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
}
|
||||
|
||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data )
|
||||
const void *midhash, const void *data, const int rounds )
|
||||
{
|
||||
__m256i *H = (__m256i*)final_hash;
|
||||
const __m256i *h = (const __m256i*)midhash;
|
||||
@@ -1315,12 +1157,15 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
ROUND256_8WAY_5;
|
||||
ROUND256_8WAY_6;
|
||||
ROUND256_8WAY_7;
|
||||
ROUND256_8WAY_8;
|
||||
ROUND256_8WAY_9;
|
||||
ROUND256_8WAY_0;
|
||||
ROUND256_8WAY_1;
|
||||
ROUND256_8WAY_2;
|
||||
ROUND256_8WAY_3;
|
||||
if ( rounds > 8 )
|
||||
{
|
||||
ROUND256_8WAY_8;
|
||||
ROUND256_8WAY_9;
|
||||
ROUND256_8WAY_0;
|
||||
ROUND256_8WAY_1;
|
||||
ROUND256_8WAY_2;
|
||||
ROUND256_8WAY_3;
|
||||
}
|
||||
|
||||
const __m256i shuf_bswap32 =
|
||||
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
@@ -1623,7 +1468,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
|
||||
#define DECL_STATE32_16WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_16WAY(state) \
|
||||
do { \
|
||||
@@ -1882,8 +1727,9 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
|
||||
}
|
||||
|
||||
// Dfault is 14 rounds, blakecoin & vanilla are 8.
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data )
|
||||
const void *midhash, const void *data, const int rounds )
|
||||
{
|
||||
__m512i *H = (__m512i*)final_hash;
|
||||
const __m512i *h = (const __m512i*)midhash;
|
||||
@@ -1988,12 +1834,15 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
ROUND256_16WAY_5;
|
||||
ROUND256_16WAY_6;
|
||||
ROUND256_16WAY_7;
|
||||
ROUND256_16WAY_8;
|
||||
ROUND256_16WAY_9;
|
||||
ROUND256_16WAY_0;
|
||||
ROUND256_16WAY_1;
|
||||
ROUND256_16WAY_2;
|
||||
ROUND256_16WAY_3;
|
||||
if ( rounds > 8 )
|
||||
{
|
||||
ROUND256_16WAY_8;
|
||||
ROUND256_16WAY_9;
|
||||
ROUND256_16WAY_0;
|
||||
ROUND256_16WAY_1;
|
||||
ROUND256_16WAY_2;
|
||||
ROUND256_16WAY_3;
|
||||
}
|
||||
|
||||
// Byte swap final hash
|
||||
const __m512i shuf_bswap32 =
|
||||
@@ -2057,7 +1906,7 @@ blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
size_t clen = ( sizeof ctx->buf ) - bptr;
|
||||
|
||||
if ( clen > blen )
|
||||
clen = blen;
|
||||
clen = blen;
|
||||
memcpy( buf + vptr, data, clen );
|
||||
bptr += clen;
|
||||
data = (const unsigned char *)data + clen;
|
||||
@@ -2130,11 +1979,11 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
|
||||
// Blake-256 8 way
|
||||
|
||||
static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
@@ -2181,8 +2030,8 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
|
||||
T1 = SPH_T32(T1 + 1);
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_8WAY( sc->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
@@ -2198,7 +2047,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
__m256i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
uint32_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -2208,13 +2057,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
|
||||
sc->T1 = SPH_T32(sc->T1 - 1);
|
||||
sc->T0 = 0xFFFFFE00UL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
@@ -2233,8 +2082,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_256( buf, 56>>2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = _mm256_set1_epi64x( 0x0100000001000000ULL );
|
||||
@@ -2277,8 +2126,8 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
|
||||
T1 = SPH_T32(T1 + 1);
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_8WAY_LE( sc->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
@@ -2294,7 +2143,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
__m256i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
uint32_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -2304,13 +2153,13 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
|
||||
sc->T1 = SPH_T32(sc->T1 - 1);
|
||||
sc->T0 = 0xFFFFFE00UL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
@@ -2328,8 +2177,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_256( buf, 56>>2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = m256_one_32;
|
||||
@@ -2348,8 +2197,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
//Blake-256 16 way AVX512
|
||||
|
||||
static void
|
||||
blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
@@ -2411,7 +2260,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
uint32_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -2508,7 +2357,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
uint32_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -2618,8 +2467,6 @@ blake256r8_16way_close(void *cc, void *dst)
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
// default 14 rounds, backward copatibility
|
||||
@@ -2754,9 +2601,3 @@ blake256r8_8way_close(void *cc, void *dst)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//#endif
|
||||
|
@@ -1,62 +1,22 @@
|
||||
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
|
||||
/*
|
||||
* BLAKE implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
// Blake-512 common
|
||||
|
||||
/*
|
||||
static const sph_u64 IV512[8] = {
|
||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
static const uint64_t IV512[8] =
|
||||
{
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
static const uint64_t salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static const unsigned sigma[16][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
@@ -77,15 +37,15 @@ static const unsigned sigma[16][16] = {
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
static const sph_u64 CB[16] = {
|
||||
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
||||
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
||||
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
||||
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
||||
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
||||
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
||||
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
||||
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
||||
static const uint64_t CB[16] = {
|
||||
0x243F6A8885A308D3, 0x13198A2E03707344,
|
||||
0xA4093822299F31D0, 0x082EFA98EC4E6C89,
|
||||
0x452821E638D01377, 0xBE5466CF34E90C6C,
|
||||
0xC0AC29B7C97C50DD, 0x3F84D5B5B5470917,
|
||||
0x9216D5D98979FB1B, 0xD1310BA698DFB5AC,
|
||||
0x2FFD72DBD01ADFB7, 0xB8E1AFED6A267E96,
|
||||
0xBA7C9045F12C7F99, 0x24A19947B3916CF7,
|
||||
0x0801F2E2858EFC16, 0x636920D871574E69
|
||||
|
||||
*/
|
||||
|
||||
@@ -1486,7 +1446,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( (T0 = T0 + 1024 ) < 1024 )
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
T1 = T1 + 1;
|
||||
COMPRESS64_4WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
@@ -1538,8 +1498,8 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
memset_zero_256( buf, 112>>3 );
|
||||
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
@@ -1629,8 +1589,4 @@ blake512_4way_close(void *cc, void *dst)
|
||||
blake64_4way_close( cc, dst );
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -4,7 +4,149 @@
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#if defined (BLAKECOIN_4WAY)
|
||||
#define rounds 8
|
||||
|
||||
#if defined (BLAKECOIN_16WAY)
|
||||
|
||||
int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( ((__m512i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
uint32_t phash[8] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = (const uint32_t) n;
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
block_buf[3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf, rounds );
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
block_buf[3] = _mm512_add_epi32( block_buf[3], sixteen );
|
||||
n += 16;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (BLAKECOIN_8WAY)
|
||||
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (32)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = (const uint32_t) n;
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces in block_buf[3].
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf, rounds );
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
|
||||
n += 8;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (BLAKECOIN_4WAY)
|
||||
|
||||
blake256r8_4way_context blakecoin_4w_ctx;
|
||||
|
||||
@@ -61,7 +203,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(BLAKECOIN_8WAY)
|
||||
#if 0
|
||||
//#if defined(BLAKECOIN_8WAY)
|
||||
|
||||
blake256r8_8way_context blakecoin_8w_ctx;
|
||||
|
||||
@@ -78,11 +221,84 @@ void blakecoin_8way_hash( void *state, const void *input )
|
||||
state+160, state+192, state+224, vhash, 256 );
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = (uint32_t*)work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, 8 );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partialy prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake256r8_8way_context ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -101,15 +317,22 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
blakecoin_8way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
|
||||
&& !opt_benchmark )
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way_update( &ctx, (const void*)vdata + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= HTarget )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
|
@@ -4,10 +4,10 @@
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKECOIN_8WAY)
|
||||
#if defined(BLAKECOIN_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_16way;
|
||||
#elif defined(BLAKECOIN_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_8way;
|
||||
gate->hash = (void*)&blakecoin_8way_hash;
|
||||
|
||||
#elif defined(BLAKECOIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_4way;
|
||||
gate->hash = (void*)&blakecoin_4way_hash;
|
||||
@@ -15,14 +15,14 @@ bool register_vanilla_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_blakecoin;
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
#endif
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool register_blakecoin_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_vanilla_algo( gate );
|
||||
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -1,30 +1,36 @@
|
||||
#ifndef __BLAKECOIN_GATE_H__
|
||||
#define __BLAKECOIN_GATE_H__ 1
|
||||
#ifndef BLAKECOIN_GATE_H__
|
||||
#define BLAKECOIN_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKECOIN_16WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKECOIN_8WAY
|
||||
#elif defined(__SSE2__) // always true
|
||||
#define BLAKECOIN_4WAY
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKECOIN_8WAY
|
||||
#endif
|
||||
|
||||
#if defined (BLAKECOIN_8WAY)
|
||||
void blakecoin_8way_hash(void *state, const void *input);
|
||||
#if defined (BLAKECOIN_16WAY)
|
||||
int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKECOIN_8WAY)
|
||||
//void blakecoin_8way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined (BLAKECOIN_4WAY)
|
||||
#elif defined (BLAKECOIN_4WAY)
|
||||
void blakecoin_4way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
#else // never used
|
||||
|
||||
void blakecoinhash( void *state, const void *input );
|
||||
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -1,6 +1,6 @@
|
||||
#include "blakecoin-gate.h"
|
||||
|
||||
#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
|
||||
#if !defined(BLAKECOIN_16WAY) && !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
|
||||
|
||||
#define BLAKE32_ROUNDS 8
|
||||
#include "sph_blake.h"
|
||||
@@ -12,7 +12,6 @@ void blakecoin_close(void *cc, void *dst);
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
// context management is staged for efficiency.
|
||||
// 1. global initial ctx cached on startup
|
||||
@@ -35,8 +34,8 @@ void blakecoinhash( void *state, const void *input )
|
||||
uint8_t hash[64] __attribute__ ((aligned (32)));
|
||||
uint8_t *ending = (uint8_t*) input + 64;
|
||||
|
||||
// copy cached midstate
|
||||
memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
|
||||
// copy cached midstate
|
||||
memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
|
||||
blakecoin( &ctx, ending, 16 );
|
||||
blakecoin_close( &ctx, hash );
|
||||
memcpy( state, hash, 32 );
|
||||
@@ -45,8 +44,8 @@ void blakecoinhash( void *state, const void *input )
|
||||
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
@@ -60,10 +59,10 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
for (int kk=0; kk < 19; kk++)
|
||||
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
|
||||
for (int kk=0; kk < 19; kk++)
|
||||
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
|
||||
|
||||
blake_midstate_init( endiandata );
|
||||
blake_midstate_init( endiandata );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
|
||||
|
Reference in New Issue
Block a user