mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.23.1
This commit is contained in:
@@ -65,6 +65,13 @@ If not what makes it happen or not happen?
|
||||
Change Log
|
||||
----------
|
||||
|
||||
v3.23.1
|
||||
|
||||
#349: Fix sha256t low difficulty shares and low effective hash rate.
|
||||
Faster sha256dt: AVX512 +7%, SHA +200%, AVX2 +5%.
|
||||
Faster blakecoin & vanilla: AVX2 +30%, AVX512 +110%.
|
||||
Other small improvements and code cleanup.
|
||||
|
||||
v3.23.0
|
||||
|
||||
#398: Prevent GBT fallback to Getwork on network error.
|
||||
@@ -214,40 +221,29 @@ v3.19.5
|
||||
|
||||
Enhanced stratum-keepalive preemptively resets the stratum connection
|
||||
before the server to avoid lost shares.
|
||||
|
||||
Added build-msys2.sh shell script for easier compiling on Windows, see Wiki for details.
|
||||
|
||||
X16RT: eliminate unnecessary recalculations of the hash order.
|
||||
|
||||
Fix a few compiler warnings.
|
||||
|
||||
Fixed log colour error when a block is solved.
|
||||
|
||||
v3.19.4
|
||||
|
||||
#359: Fix verthash memory allocation for non-hugepages, broken in v3.19.3.
|
||||
|
||||
New option stratum-keepalive prevents stratum timeouts when no shares are
|
||||
submitted for several minutes due to high difficulty.
|
||||
|
||||
Fixed a bug displaying optimizations for some algos.
|
||||
|
||||
v3.19.3
|
||||
|
||||
Linux: Faster verthash (+25%), scryptn2 (+2%) when huge pages are available.
|
||||
|
||||
Small speed up for Hamsi AVX2 & AVX512, Keccak AVX512.
|
||||
|
||||
v3.19.2
|
||||
|
||||
Fixed log displaying incorrect memory usage for scrypt, broken in v3.19.1.
|
||||
|
||||
Reduce log noise when replies to submitted shares are lost due to stratum errors.
|
||||
|
||||
Fugue prehash optimization for X16r family AVX2 & AVX512.
|
||||
|
||||
Small speed improvement for Hamsi AVX2 & AVX512.
|
||||
|
||||
Win: With CPU groups enabled the number of CPUs displayed in the ASCII art
|
||||
affinity map is the number of CPUs in a CPU group, was number of CPUs up to 64.
|
||||
|
||||
@@ -259,7 +255,6 @@ Changes to Windows binaries package:
|
||||
- zen build renamed to avx2-sha, supports Zen1 & Zen2,
|
||||
- avx512-sha build removed, Rocketlake CPUs can use avx512-sha-vaes,
|
||||
- see README.txt for compatibility details.
|
||||
|
||||
Fixed a few compiler warnings that are new in GCC 11.
|
||||
Other minor fixes.
|
||||
|
||||
@@ -273,22 +268,17 @@ Changes to cpu-affinity:
|
||||
- streamlined code for more efficient initialization of miner threads,
|
||||
- precise affining of each miner thread to a specific CPU,
|
||||
- added an option to disable CPU affinity with "--cpu-affinity 0"
|
||||
|
||||
Faster sha256t with AVX512 & AVX2.
|
||||
|
||||
Added stratum error count to stats log, reported only when non-zero.
|
||||
|
||||
v3.18.2
|
||||
|
||||
Issue #342, fixed Groestl AES on Windows, broken in v3.18.0.
|
||||
|
||||
AVX512 for sha256d.
|
||||
|
||||
SSE42 and AVX may now be displayed as mining features at startup.
|
||||
This is hard coded for each algo, and is only implemented for scrypt
|
||||
at this time as it is the only algo with significant performance differences
|
||||
with those features.
|
||||
|
||||
Fixed an issue where a high hashrate algo could cause excessive invalid hash
|
||||
rate log reports when starting up in benchmark mode.
|
||||
|
||||
@@ -299,9 +289,7 @@ More speed for scrypt:
|
||||
- AVX2 is now used by default on CPUs with SHA but not AVX512,
|
||||
- scrypt:1024 performance lost in v3.18.0 is restored,
|
||||
- AVX512 & AVX2 improvements to scrypt:1024.
|
||||
|
||||
Big speedup for SwiFFTx AVX2 & SSE4.1: x22i +55%, x25x +22%.
|
||||
|
||||
Issue #337: fixed a problem that could display negative stats values in the
|
||||
first summary report if the report was forced prematurely due to a stratum
|
||||
diff change. The stats will still be invalid but should display zeros.
|
||||
@@ -314,26 +302,19 @@ Complete rewrite of Scrypt code, optimized for large N factor (scryptn2):
|
||||
- memory requirements reduced 30-60% depending on CPU architecture,
|
||||
- memory usage displayed at startup,
|
||||
- scrypt, default N=1024 (LTC), will likely perform slower.
|
||||
|
||||
Improved stale share detection and handling for Scrypt with large N factor:
|
||||
- abort and discard partially computed hash when new work is detected,
|
||||
- quicker response to new job, less time wasted mining stale job.
|
||||
|
||||
Improved stale share handling for all algorithms:
|
||||
- report possible stale share when new work received with a previously
|
||||
submitted share still pending,
|
||||
- when new work is detected report the submission of an already completed,
|
||||
otherwise valid, but likely stale, share,
|
||||
- fixed incorrect block height in stale share log.
|
||||
|
||||
Small performance improvements to sha, bmw, cube & hamsi for AVX512 & AVX2.
|
||||
|
||||
When stratum disconnects miner threads go to idle until reconnected.
|
||||
|
||||
Colour changes to some logs.
|
||||
|
||||
Some low level function name changes for clarity and consistency.
|
||||
|
||||
The reference hashrate in the summary log and the benchmark total hashrate
|
||||
are now the mean hashrate for the session.
|
||||
|
||||
@@ -446,7 +427,6 @@ Fixed neoscrypt BUG log.
|
||||
v3.14.3
|
||||
|
||||
#265: more mutex changes to reduce blocking with high thread count.
|
||||
|
||||
#267: fixed hodl algo potential memory alignment issue,
|
||||
add warning when thread count is not valid for mining hodl algo.
|
||||
|
||||
|
@@ -267,6 +267,8 @@ void std_get_new_work( struct work *work, struct work *g_work, int thr_id,
|
||||
uint32_t* end_nonce_ptr );
|
||||
|
||||
void sha256d_gen_merkle_root( char *merkle_root, struct stratum_ctx *sctx );
|
||||
void sha256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
// OpenSSL sha256 deprecated
|
||||
void SHA256_gen_merkle_root ( char *merkle_root, struct stratum_ctx *sctx );
|
||||
|
||||
bool std_le_work_decode( struct work *work );
|
||||
|
@@ -1,60 +1,15 @@
|
||||
/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
|
||||
/**
|
||||
* BLAKE interface. BLAKE is a family of functions which differ by their
|
||||
* output size; this implementation defines BLAKE for output sizes 224,
|
||||
* 256, 384 and 512 bits. This implementation conforms to the "third
|
||||
* round" specification.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_blake.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef __BLAKE_HASH_4WAY__
|
||||
#define __BLAKE_HASH_4WAY__ 1
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
#ifndef BLAKE_HASH_4WAY__
|
||||
#define BLAKE_HASH_4WAY__ 1
|
||||
|
||||
#include <stddef.h>
|
||||
#include "algo/sha/sph_types.h"
|
||||
#include "simd-utils.h"
|
||||
|
||||
#define SPH_SIZE_blake256 256
|
||||
|
||||
#define SPH_SIZE_blake512 512
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SSE2
|
||||
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1 );
|
||||
const uint32_t T0, const uint32_t T1, int rounds );
|
||||
|
||||
/////////////////////////
|
||||
//
|
||||
@@ -75,13 +30,13 @@ typedef struct {
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_4way_small_context __attribute__ ((aligned (64)));
|
||||
|
||||
// Default, 14 rounds, blake, decred
|
||||
// Default, 14 rounds
|
||||
typedef blake_4way_small_context blake256_4way_context;
|
||||
void blake256_4way_init(void *ctx);
|
||||
void blake256_4way_update(void *ctx, const void *data, size_t len);
|
||||
void blake256_4way_close(void *ctx, void *dst);
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
// 14 rounds
|
||||
typedef blake_4way_small_context blake256r14_4way_context;
|
||||
void blake256r14_4way_init(void *cc);
|
||||
void blake256r14_4way_update(void *cc, const void *data, size_t len);
|
||||
@@ -103,7 +58,7 @@ typedef struct {
|
||||
__m256i buf[16] __attribute__ ((aligned (64)));
|
||||
__m256i H[8];
|
||||
size_t ptr;
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
int rounds; // 14 for blake, 8 for blakecoin & vanilla
|
||||
} blake_8way_small_context;
|
||||
|
||||
@@ -117,7 +72,7 @@ void blake256_8way_close_le(void *cc, void *dst);
|
||||
void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
const void *midhash, const void *data, const int rounds );
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
typedef blake_8way_small_context blake256r14_8way_context;
|
||||
@@ -138,7 +93,7 @@ typedef struct {
|
||||
__m256i H[8];
|
||||
__m256i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
uint64_t T0, T1;
|
||||
} blake_4way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_4way_big_context blake512_4way_context;
|
||||
@@ -180,7 +135,7 @@ void blake256_16way_close_le(void *cc, void *dst);
|
||||
void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
void *data );
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data );
|
||||
const void *midhash, const void *data, const int rounds );
|
||||
|
||||
|
||||
// 14 rounds, blake, decred
|
||||
@@ -204,7 +159,7 @@ typedef struct {
|
||||
__m512i H[8];
|
||||
__m512i S[4];
|
||||
size_t ptr;
|
||||
sph_u64 T0, T1;
|
||||
uint64_t T0, T1;
|
||||
} blake_8way_big_context __attribute__ ((aligned (128)));
|
||||
|
||||
typedef blake_8way_big_context blake512_8way_context;
|
||||
@@ -224,8 +179,4 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
|
||||
#endif // AVX512
|
||||
#endif // AVX2
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // BLAKE_HASH_4WAY__
|
||||
|
@@ -40,26 +40,6 @@
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
|
||||
#define SPH_SMALL_FOOTPRINT_BLAKE 1
|
||||
#endif
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_BLAKE
|
||||
#define SPH_COMPACT_BLAKE_32 1
|
||||
#endif
|
||||
|
||||
#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
|
||||
#define SPH_COMPACT_BLAKE_64 1
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
// Blake-256
|
||||
|
||||
static const uint32_t IV256[8] =
|
||||
@@ -68,7 +48,7 @@ static const uint32_t IV256[8] =
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
|
||||
#if 0
|
||||
|
||||
// Blake-256 4 & 8 way, Blake-512 4 way
|
||||
|
||||
@@ -273,44 +253,28 @@ static const unsigned sigma[16][16] = {
|
||||
#define CSx_(n) CSx__(n)
|
||||
#define CSx__(n) CS ## n
|
||||
|
||||
#define CS0 SPH_C32(0x243F6A88)
|
||||
#define CS1 SPH_C32(0x85A308D3)
|
||||
#define CS2 SPH_C32(0x13198A2E)
|
||||
#define CS3 SPH_C32(0x03707344)
|
||||
#define CS4 SPH_C32(0xA4093822)
|
||||
#define CS5 SPH_C32(0x299F31D0)
|
||||
#define CS6 SPH_C32(0x082EFA98)
|
||||
#define CS7 SPH_C32(0xEC4E6C89)
|
||||
#define CS8 SPH_C32(0x452821E6)
|
||||
#define CS9 SPH_C32(0x38D01377)
|
||||
#define CSA SPH_C32(0xBE5466CF)
|
||||
#define CSB SPH_C32(0x34E90C6C)
|
||||
#define CSC SPH_C32(0xC0AC29B7)
|
||||
#define CSD SPH_C32(0xC97C50DD)
|
||||
#define CSE SPH_C32(0x3F84D5B5)
|
||||
#define CSF SPH_C32(0xB5470917)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
|
||||
static const sph_u32 CS[16] = {
|
||||
SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
|
||||
SPH_C32(0x13198A2E), SPH_C32(0x03707344),
|
||||
SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
|
||||
SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
|
||||
SPH_C32(0x452821E6), SPH_C32(0x38D01377),
|
||||
SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
|
||||
SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
|
||||
SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
|
||||
};
|
||||
|
||||
#endif
|
||||
#define CS0 0x243F6A88
|
||||
#define CS1 0x85A308D3
|
||||
#define CS2 0x13198A2E
|
||||
#define CS3 0x03707344
|
||||
#define CS4 0xA4093822
|
||||
#define CS5 0x299F31D0
|
||||
#define CS6 0x082EFA98
|
||||
#define CS7 0xEC4E6C89
|
||||
#define CS8 0x452821E6
|
||||
#define CS9 0x38D01377
|
||||
#define CSA 0xBE5466CF
|
||||
#define CSB 0x34E90C6C
|
||||
#define CSC 0xC0AC29B7
|
||||
#define CSD 0xC97C50DD
|
||||
#define CSE 0x3F84D5B5
|
||||
#define CSF 0xB5470917
|
||||
|
||||
/////////////////////////////////////////
|
||||
//
|
||||
// Blake-256 1 way SIMD
|
||||
// Only used for prehash, otherwise 4way is used with SSE2.
|
||||
|
||||
// optimize shuffles to reduce latency caused by dependencies on V1.
|
||||
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
@@ -353,52 +317,9 @@ static const sph_u32 CS[16] = {
|
||||
V2 = mm128_shufll_32( V2 ); \
|
||||
}
|
||||
|
||||
/*
|
||||
#define BLAKE256_ROUND( r ) \
|
||||
{ \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
|
||||
CSx( r, 5 ) ^ Mx( r, 4 ), \
|
||||
CSx( r, 3 ) ^ Mx( r, 2 ), \
|
||||
CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
|
||||
CSx( r, 4 ) ^ Mx( r, 5 ), \
|
||||
CSx( r, 2 ) ^ Mx( r, 3 ), \
|
||||
CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V3 = mm128_shufll_32( V3 ); \
|
||||
V2 = mm128_swap_64( V2 ); \
|
||||
V1 = mm128_shuflr_32( V1 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
|
||||
CSx( r, D ) ^ Mx( r, C ), \
|
||||
CSx( r, B ) ^ Mx( r, A ), \
|
||||
CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
|
||||
V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
|
||||
V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
|
||||
_mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
|
||||
CSx( r, C ) ^ Mx( r, D ), \
|
||||
CSx( r, A ) ^ Mx( r, B ), \
|
||||
CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
|
||||
V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
|
||||
V2 = _mm_add_epi32( V2, V3 ); \
|
||||
V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
|
||||
V3 = mm128_shuflr_32( V3 ); \
|
||||
V2 = mm128_swap_64( V2 ); \
|
||||
V1 = mm128_shufll_32( V1 ); \
|
||||
}
|
||||
*/
|
||||
|
||||
// Default is 14 rounds, blakecoin & vanilla are 8.
|
||||
void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
const uint32_t T0, const uint32_t T1 )
|
||||
const uint32_t T0, const uint32_t T1, int rounds )
|
||||
{
|
||||
__m128i V0, V1, V2, V3;
|
||||
uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
|
||||
@@ -431,12 +352,15 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
BLAKE256_ROUND( 5 );
|
||||
BLAKE256_ROUND( 6 );
|
||||
BLAKE256_ROUND( 7 );
|
||||
BLAKE256_ROUND( 8 );
|
||||
BLAKE256_ROUND( 9 );
|
||||
BLAKE256_ROUND( 0 );
|
||||
BLAKE256_ROUND( 1 );
|
||||
BLAKE256_ROUND( 2 );
|
||||
BLAKE256_ROUND( 3 );
|
||||
if ( rounds > 8 ) // 14
|
||||
{
|
||||
BLAKE256_ROUND( 8 );
|
||||
BLAKE256_ROUND( 9 );
|
||||
BLAKE256_ROUND( 0 );
|
||||
BLAKE256_ROUND( 1 );
|
||||
BLAKE256_ROUND( 2 );
|
||||
BLAKE256_ROUND( 3 );
|
||||
}
|
||||
casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
|
||||
casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
|
||||
}
|
||||
@@ -459,34 +383,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
|
||||
}
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
|
||||
// Not used
|
||||
#if 0
|
||||
|
||||
#define ROUND_S_4WAY(r) do { \
|
||||
GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
|
||||
CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
|
||||
GS_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
|
||||
CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \
|
||||
GS_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
|
||||
CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \
|
||||
GS_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
|
||||
CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \
|
||||
GS_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
|
||||
CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \
|
||||
GS_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
|
||||
CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \
|
||||
GS_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
|
||||
CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \
|
||||
GS_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
|
||||
CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define ROUND_S_4WAY(r) \
|
||||
{ \
|
||||
GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
|
||||
@@ -499,8 +395,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#define DECL_STATE32_4WAY \
|
||||
__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
uint32_t T0, T1;
|
||||
@@ -531,56 +425,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
(state)->T1 = T1; \
|
||||
} while (0)
|
||||
|
||||
#if SPH_COMPACT_BLAKE_32
|
||||
// not used
|
||||
#if 0
|
||||
#define COMPRESS32_4WAY( rounds ) do { \
|
||||
__m128i M[16]; \
|
||||
__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
|
||||
__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
|
||||
unsigned r; \
|
||||
V0 = H0; \
|
||||
V1 = H1; \
|
||||
V2 = H2; \
|
||||
V3 = H3; \
|
||||
V4 = H4; \
|
||||
V5 = H5; \
|
||||
V6 = H6; \
|
||||
V7 = H7; \
|
||||
V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
|
||||
V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
|
||||
VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
|
||||
VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
|
||||
VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
|
||||
VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
|
||||
VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
|
||||
mm128_block_bswap_32( M, buf ); \
|
||||
mm128_block_bswap_32( M+8, buf+8 ); \
|
||||
for (r = 0; r < rounds; r ++) \
|
||||
ROUND_S_4WAY(r); \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S0, V0 ), V8 ), H0 ); \
|
||||
H1 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S1, V1 ), V9 ), H1 ); \
|
||||
H2 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S2, V2 ), VA ), H2 ); \
|
||||
H3 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S3, V3 ), VB ), H3 ); \
|
||||
H4 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S0, V4 ), VC ), H4 ); \
|
||||
H5 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S1, V5 ), VD ), H5 ); \
|
||||
H6 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S2, V6 ), VE ), H6 ); \
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( \
|
||||
_mm_xor_si128( S3, V7 ), VF ), H7 ); \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
// current impl
|
||||
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
@@ -680,8 +524,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
/////////////////////////////////
|
||||
@@ -968,7 +810,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
|
||||
|
||||
#define DECL_STATE32_8WAY \
|
||||
__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_8WAY(state) \
|
||||
do { \
|
||||
@@ -1046,7 +888,7 @@ do { \
|
||||
ROUND_S_8WAY(5); \
|
||||
ROUND_S_8WAY(6); \
|
||||
ROUND_S_8WAY(7); \
|
||||
if (rounds == 14) \
|
||||
if (rounds > 8) \
|
||||
{ \
|
||||
ROUND_S_8WAY(8); \
|
||||
ROUND_S_8WAY(9); \
|
||||
@@ -1111,7 +953,7 @@ do { \
|
||||
ROUND_S_8WAY(5); \
|
||||
ROUND_S_8WAY(6); \
|
||||
ROUND_S_8WAY(7); \
|
||||
if (rounds == 14) \
|
||||
if (rounds > 8) \
|
||||
{ \
|
||||
ROUND_S_8WAY(8); \
|
||||
ROUND_S_8WAY(9); \
|
||||
@@ -1156,7 +998,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
|
||||
// M[ 0:3 ] contain new message data including unique nonces in M[ 3].
|
||||
// M[ 5:12, 14 ] are always zero and not needed or used.
|
||||
// M[ 4], M[ 13], M[15] are constant and are initialized here.
|
||||
// M[ 4], M[13], M[15] are constant and are initialized here.
|
||||
// M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
|
||||
|
||||
M[ 4] = _mm256_set1_epi32( 0x80000000 );
|
||||
@@ -1221,7 +1063,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
}
|
||||
|
||||
void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data )
|
||||
const void *midhash, const void *data, const int rounds )
|
||||
{
|
||||
__m256i *H = (__m256i*)final_hash;
|
||||
const __m256i *h = (const __m256i*)midhash;
|
||||
@@ -1315,12 +1157,15 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
ROUND256_8WAY_5;
|
||||
ROUND256_8WAY_6;
|
||||
ROUND256_8WAY_7;
|
||||
ROUND256_8WAY_8;
|
||||
ROUND256_8WAY_9;
|
||||
ROUND256_8WAY_0;
|
||||
ROUND256_8WAY_1;
|
||||
ROUND256_8WAY_2;
|
||||
ROUND256_8WAY_3;
|
||||
if ( rounds > 8 )
|
||||
{
|
||||
ROUND256_8WAY_8;
|
||||
ROUND256_8WAY_9;
|
||||
ROUND256_8WAY_0;
|
||||
ROUND256_8WAY_1;
|
||||
ROUND256_8WAY_2;
|
||||
ROUND256_8WAY_3;
|
||||
}
|
||||
|
||||
const __m256i shuf_bswap32 =
|
||||
mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
|
||||
@@ -1623,7 +1468,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
|
||||
#define DECL_STATE32_16WAY \
|
||||
__m512i H0, H1, H2, H3, H4, H5, H6, H7; \
|
||||
sph_u32 T0, T1;
|
||||
uint32_t T0, T1;
|
||||
|
||||
#define READ_STATE32_16WAY(state) \
|
||||
do { \
|
||||
@@ -1882,8 +1727,9 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
|
||||
_mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
|
||||
}
|
||||
|
||||
// Default is 14 rounds, blakecoin & vanilla are 8.
|
||||
void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
const void *midhash, const void *data )
|
||||
const void *midhash, const void *data, const int rounds )
|
||||
{
|
||||
__m512i *H = (__m512i*)final_hash;
|
||||
const __m512i *h = (const __m512i*)midhash;
|
||||
@@ -1988,12 +1834,15 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
|
||||
ROUND256_16WAY_5;
|
||||
ROUND256_16WAY_6;
|
||||
ROUND256_16WAY_7;
|
||||
ROUND256_16WAY_8;
|
||||
ROUND256_16WAY_9;
|
||||
ROUND256_16WAY_0;
|
||||
ROUND256_16WAY_1;
|
||||
ROUND256_16WAY_2;
|
||||
ROUND256_16WAY_3;
|
||||
if ( rounds > 8 )
|
||||
{
|
||||
ROUND256_16WAY_8;
|
||||
ROUND256_16WAY_9;
|
||||
ROUND256_16WAY_0;
|
||||
ROUND256_16WAY_1;
|
||||
ROUND256_16WAY_2;
|
||||
ROUND256_16WAY_3;
|
||||
}
|
||||
|
||||
// Byte swap final hash
|
||||
const __m512i shuf_bswap32 =
|
||||
@@ -2057,7 +1906,7 @@ blake32_4way( blake_4way_small_context *ctx, const void *data,
|
||||
size_t clen = ( sizeof ctx->buf ) - bptr;
|
||||
|
||||
if ( clen > blen )
|
||||
clen = blen;
|
||||
clen = blen;
|
||||
memcpy( buf + vptr, data, clen );
|
||||
bptr += clen;
|
||||
data = (const unsigned char *)data + clen;
|
||||
@@ -2130,11 +1979,11 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
|
||||
|
||||
// Blake-256 8 way
|
||||
|
||||
static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
static void
|
||||
blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
@@ -2181,8 +2030,8 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
|
||||
T1 = SPH_T32(T1 + 1);
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_8WAY( sc->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
@@ -2198,7 +2047,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
__m256i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
uint32_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -2208,13 +2057,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
|
||||
sc->T1 = SPH_T32(sc->T1 - 1);
|
||||
sc->T0 = 0xFFFFFE00UL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
@@ -2233,8 +2082,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_256( buf, 56>>2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = _mm256_set1_epi64x( 0x0100000001000000ULL );
|
||||
@@ -2277,8 +2126,8 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
|
||||
len -= clen;
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
|
||||
T1 = SPH_T32(T1 + 1);
|
||||
if ( ( T0 = T0 + 512 ) < 512 )
|
||||
T1 = T1 + 1;
|
||||
COMPRESS32_8WAY_LE( sc->rounds );
|
||||
ptr = 0;
|
||||
}
|
||||
@@ -2294,7 +2143,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
__m256i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
uint32_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -2304,13 +2153,13 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
if ( ptr == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
}
|
||||
else if ( sc->T0 == 0 )
|
||||
{
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
|
||||
sc->T1 = SPH_T32(sc->T1 - 1);
|
||||
sc->T0 = 0xFFFFFE00UL + bit_len;
|
||||
sc->T1 = sc->T1 - 1;
|
||||
}
|
||||
else
|
||||
sc->T0 -= 512 - bit_len;
|
||||
@@ -2328,8 +2177,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
{
|
||||
memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = SPH_C32(0xFFFFFE00UL);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFFUL);
|
||||
sc->T0 = 0xFFFFFE00UL;
|
||||
sc->T1 = 0xFFFFFFFFUL;
|
||||
memset_zero_256( buf, 56>>2 );
|
||||
if ( out_size_w32 == 8 )
|
||||
buf[52>>2] = m256_one_32;
|
||||
@@ -2348,8 +2197,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
|
||||
//Blake-256 16 way AVX512
|
||||
|
||||
static void
|
||||
blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
|
||||
const sph_u32 *salt, int rounds )
|
||||
blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
|
||||
const uint32_t *salt, int rounds )
|
||||
{
|
||||
casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
@@ -2411,7 +2260,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
uint32_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -2508,7 +2357,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
|
||||
__m512i buf[16];
|
||||
size_t ptr;
|
||||
unsigned bit_len;
|
||||
sph_u32 th, tl;
|
||||
uint32_t th, tl;
|
||||
|
||||
ptr = sc->ptr;
|
||||
bit_len = ((unsigned)ptr << 3);
|
||||
@@ -2618,8 +2467,6 @@ blake256r8_16way_close(void *cc, void *dst)
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
|
||||
// Blake-256 4 way
|
||||
|
||||
// default 14 rounds, backward compatibility
|
||||
@@ -2754,9 +2601,3 @@ blake256r8_8way_close(void *cc, void *dst)
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
//#endif
|
||||
|
@@ -1,62 +1,22 @@
|
||||
/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
|
||||
/*
|
||||
* BLAKE implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#if defined (__AVX2__)
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "blake-hash-4way.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"{
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning (disable: 4146)
|
||||
#endif
|
||||
|
||||
// Blake-512 common
|
||||
|
||||
/*
|
||||
static const sph_u64 IV512[8] = {
|
||||
SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
|
||||
SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
|
||||
SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
|
||||
SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
|
||||
static const uint64_t IV512[8] =
|
||||
{
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
static const uint64_t salt_zero_big[4] = { 0, 0, 0, 0 };
|
||||
|
||||
static const unsigned sigma[16][16] = {
|
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
|
||||
@@ -77,15 +37,15 @@ static const unsigned sigma[16][16] = {
|
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
|
||||
};
|
||||
|
||||
static const sph_u64 CB[16] = {
|
||||
SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
|
||||
SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
|
||||
SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
|
||||
SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
|
||||
SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
|
||||
SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
|
||||
SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
|
||||
SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
|
||||
static const uint64_t CB[16] = {
|
||||
0x243F6A8885A308D3, 0x13198A2E03707344,
|
||||
0xA4093822299F31D0, 0x082EFA98EC4E6C89,
|
||||
0x452821E638D01377, 0xBE5466CF34E90C6C,
|
||||
0xC0AC29B7C97C50DD, 0x3F84D5B5B5470917,
|
||||
0x9216D5D98979FB1B, 0xD1310BA698DFB5AC,
|
||||
0x2FFD72DBD01ADFB7, 0xB8E1AFED6A267E96,
|
||||
0xBA7C9045F12C7F99, 0x24A19947B3916CF7,
|
||||
0x0801F2E2858EFC16, 0x636920D871574E69
|
||||
|
||||
*/
|
||||
|
||||
@@ -1486,7 +1446,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
if ( ptr == buf_size )
|
||||
{
|
||||
if ( (T0 = T0 + 1024 ) < 1024 )
|
||||
T1 = SPH_T64(T1 + 1);
|
||||
T1 = T1 + 1;
|
||||
COMPRESS64_4WAY;
|
||||
ptr = 0;
|
||||
}
|
||||
@@ -1538,8 +1498,8 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
|
||||
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
|
||||
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
|
||||
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
|
||||
memset_zero_256( buf, 112>>3 );
|
||||
buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
|
||||
buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
|
||||
@@ -1629,8 +1589,4 @@ blake512_4way_close(void *cc, void *dst)
|
||||
blake64_4way_close( cc, dst );
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -4,7 +4,149 @@
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#if defined (BLAKECOIN_4WAY)
|
||||
#define rounds 8
|
||||
|
||||
#if defined (BLAKECOIN_16WAY)
|
||||
|
||||
// Scan nonces for blakecoin/vanilla, 16 nonces per iteration using 512-bit
// vectors.
//
// work        - job; work->data is read as the block header words and
//               work->data[19] is used as the nonce slot (read on entry,
//               written on submit and on exit); work->target is the target.
// max_nonce   - upper bound of the nonce range; the loop stops once
//               n >= max_nonce - 16.
// hashes_done - out: number of nonces covered by this call.
// mythr       - calling thread; mythr->id indexes work_restart[].
//
// Always returns 0. Solutions are reported through submit_solution().
int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*16] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
|
||||
__m512i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m512i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
// Points at the 8th 512-bit vector of hash32; the scan loop reads one 32-bit
// value per lane from it (presumably hash word 7 of each lane in the
// interleaved 16x32 layout -- confirm against blake256_16way_final_rounds_le).
uint32_t *hash32_d7 = (uint32_t*)&( ((__m512i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
// Initial chaining value handed to blake256_transform_le, which overwrites
// it with the state after the first 64-byte block.
uint32_t phash[8] __attribute__ ((aligned (64))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = (const uint32_t) n;
|
||||
// NOTE(review): unsigned subtraction, wraps if max_nonce < 16 -- presumably
// callers never pass such a small range; confirm.
const uint32_t last_nonce = max_nonce - 16;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
// Broadcast each word of the first-block state to all 16 lanes.
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm512_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm512_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm512_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm512_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm512_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
// NOTE(review): only block_buf[0..3] are written in this function;
// block_buf[4..15] are presumably filled or ignored by the prehash helper
// -- confirm.
block_buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
// _mm512_set_epi32 takes the highest element first, so lane 0 holds n and
// lane 15 holds n+15.
block_buf[3] =
|
||||
_mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
|
||||
// Partially prehash second block without touching nonces in block_buf[3].
|
||||
blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf, rounds );
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
// Cheap per-lane filter on a single 32-bit word before the full test.
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
// Nothing is submitted when running in benchmark mode.
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
// Advance every lane's nonce by 16 for the next batch.
block_buf[3] = _mm512_add_epi32( block_buf[3], sixteen );
|
||||
n += 16;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
// Store the next untested nonce back into the work data.
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (BLAKECOIN_8WAY)
|
||||
|
||||
// Scan nonces for blakecoin/vanilla, 8 nonces per iteration using 256-bit
// vectors.
//
// work        - job; work->data is read as the block header words and
//               work->data[19] is used as the nonce slot (read on entry,
//               written on submit and on exit); work->target is the target.
// max_nonce   - upper bound of the nonce range; the loop stops once
//               n >= max_nonce - 8.
// hashes_done - out: number of nonces covered by this call.
// mythr       - calling thread; mythr->id indexes work_restart[].
//
// Always returns 0. Solutions are reported through submit_solution().
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (32)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (32)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
// Points at the 8th 256-bit vector of hash32; the scan loop reads one 32-bit
// value per lane from it (presumably hash word 7 of each lane in the
// interleaved 8x32 layout -- confirm against blake256_8way_final_rounds_le).
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
// Initial chaining value handed to blake256_transform_le, which overwrites
// it with the state after the first 64-byte block.
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t n = pdata[19];
|
||||
const uint32_t first_nonce = (const uint32_t) n;
|
||||
// NOTE(review): unsigned subtraction, wraps if max_nonce < 8 -- presumably
// callers never pass such a small range; confirm.
const uint32_t last_nonce = max_nonce - 8;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, rounds );
|
||||
|
||||
// Broadcast each word of the first-block state to all 8 lanes.
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
// NOTE(review): only block_buf[0..3] are written in this function;
// block_buf[4..15] are presumably filled or ignored by the prehash helper
// -- confirm.
block_buf[0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm256_set1_epi32( pdata[18] );
|
||||
// _mm256_set_epi32 takes the highest element first, so lane 0 holds n and
// lane 7 holds n+7.
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partially prehash second block without touching nonces in block_buf[3].
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf, rounds );
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
// Cheap per-lane filter on a single 32-bit word before the full test.
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
// Nothing is submitted when running in benchmark mode.
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
// Advance every lane's nonce by 8 for the next batch.
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
|
||||
n += 8;
|
||||
} while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
|
||||
// Store the next untested nonce back into the work data.
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined (BLAKECOIN_4WAY)
|
||||
|
||||
blake256r8_4way_context blakecoin_4w_ctx;
|
||||
|
||||
@@ -61,7 +203,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(BLAKECOIN_8WAY)
|
||||
#if 0
|
||||
//#if defined(BLAKECOIN_8WAY)
|
||||
|
||||
blake256r8_8way_context blakecoin_8w_ctx;
|
||||
|
||||
@@ -78,11 +221,84 @@ void blakecoin_8way_hash( void *state, const void *input )
|
||||
state+160, state+192, state+224, vhash, 256 );
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (64)));
|
||||
uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
|
||||
__m256i block0_hash[8] __attribute__ ((aligned (64)));
|
||||
__m256i block_buf[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t phash[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = (uint32_t*)work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0, 8 );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
block0_hash[2] = _mm256_set1_epi32( phash[2] );
|
||||
block0_hash[3] = _mm256_set1_epi32( phash[3] );
|
||||
block0_hash[4] = _mm256_set1_epi32( phash[4] );
|
||||
block0_hash[5] = _mm256_set1_epi32( phash[5] );
|
||||
block0_hash[6] = _mm256_set1_epi32( phash[6] );
|
||||
block0_hash[7] = _mm256_set1_epi32( phash[7] );
|
||||
|
||||
// Build vectored second block, interleave last 16 bytes of data using
|
||||
// unique nonces.
|
||||
block_buf[0] = _mm256_set1_epi32( pdata[16] );
|
||||
block_buf[1] = _mm256_set1_epi32( pdata[17] );
|
||||
block_buf[2] = _mm256_set1_epi32( pdata[18] );
|
||||
block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
|
||||
|
||||
// Partially prehash second block without touching nonces
|
||||
blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
|
||||
|
||||
do {
|
||||
blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
|
||||
block_buf );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
|
||||
n += 8;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
|
||||
uint32_t hash[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash32[8*8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
blake256r8_8way_context ctx __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( ((__m256i*)hash32)[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
@@ -101,15 +317,22 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
*noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
|
||||
n+3, n+2, n+1, n ) );
|
||||
pdata[19] = n;
|
||||
blakecoin_8way_hash( hash, vdata );
|
||||
|
||||
for ( int i = 0; i < 8; i++ )
|
||||
if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
|
||||
&& !opt_benchmark )
|
||||
memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
|
||||
blake256r8_8way_update( &ctx, (const void*)vdata + (64<<3), 16 );
|
||||
blake256r8_8way_close( &ctx, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= HTarget )
|
||||
{
|
||||
pdata[19] = n+i;
|
||||
submit_solution( work, hash+(i<<3), mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
|
||||
n += 8;
|
||||
} while ( (n < max_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
|
@@ -4,10 +4,10 @@
|
||||
// vanilla uses default gen merkle root, otherwise identical to blakecoin
|
||||
bool register_vanilla_algo( algo_gate_t* gate )
|
||||
{
|
||||
#if defined(BLAKECOIN_8WAY)
|
||||
#if defined(BLAKECOIN_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_16way;
|
||||
#elif defined(BLAKECOIN_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_8way;
|
||||
gate->hash = (void*)&blakecoin_8way_hash;
|
||||
|
||||
#elif defined(BLAKECOIN_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_blakecoin_4way;
|
||||
gate->hash = (void*)&blakecoin_4way_hash;
|
||||
@@ -15,14 +15,14 @@ bool register_vanilla_algo( algo_gate_t* gate )
|
||||
gate->scanhash = (void*)&scanhash_blakecoin;
|
||||
gate->hash = (void*)&blakecoinhash;
|
||||
#endif
|
||||
gate->optimizations = SSE42_OPT | AVX2_OPT;
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Register the blakecoin algorithm: apply the vanilla registration to the
// gate, then set the gate's gen_merkle_root hook. Always returns true.
bool register_blakecoin_algo( algo_gate_t* gate )
|
||||
{
|
||||
register_vanilla_algo( gate );
|
||||
// NOTE(review): gen_merkle_root is assigned twice in a row, so only the
// second assignment takes effect. The pair looks like the before/after lines
// of a rename (SHA256_gen_merkle_root -> sha256_gen_merkle_root) -- confirm
// which symbol actually exists and drop the other line.
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
|
||||
gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@@ -1,30 +1,36 @@
|
||||
#ifndef __BLAKECOIN_GATE_H__
|
||||
#define __BLAKECOIN_GATE_H__ 1
|
||||
#ifndef BLAKECOIN_GATE_H__
|
||||
#define BLAKECOIN_GATE_H__ 1
|
||||
|
||||
#include "algo-gate-api.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#if defined(__SSE4_2__)
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define BLAKECOIN_16WAY
|
||||
#elif defined(__AVX2__)
|
||||
#define BLAKECOIN_8WAY
|
||||
#elif defined(__SSE2__) // always true
|
||||
#define BLAKECOIN_4WAY
|
||||
#endif
|
||||
#if defined(__AVX2__)
|
||||
#define BLAKECOIN_8WAY
|
||||
#endif
|
||||
|
||||
#if defined (BLAKECOIN_8WAY)
|
||||
void blakecoin_8way_hash(void *state, const void *input);
|
||||
#if defined (BLAKECOIN_16WAY)
|
||||
int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#elif defined (BLAKECOIN_8WAY)
|
||||
//void blakecoin_8way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined (BLAKECOIN_4WAY)
|
||||
#elif defined (BLAKECOIN_4WAY)
|
||||
void blakecoin_4way_hash(void *state, const void *input);
|
||||
int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
#else // never used
|
||||
|
||||
void blakecoinhash( void *state, const void *input );
|
||||
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@@ -1,6 +1,6 @@
|
||||
#include "blakecoin-gate.h"
|
||||
|
||||
#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
|
||||
#if !defined(BLAKECOIN_16WAY) && !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
|
||||
|
||||
#define BLAKE32_ROUNDS 8
|
||||
#include "sph_blake.h"
|
||||
@@ -12,7 +12,6 @@ void blakecoin_close(void *cc, void *dst);
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
// context management is staged for efficiency.
|
||||
// 1. global initial ctx cached on startup
|
||||
@@ -35,8 +34,8 @@ void blakecoinhash( void *state, const void *input )
|
||||
uint8_t hash[64] __attribute__ ((aligned (32)));
|
||||
uint8_t *ending = (uint8_t*) input + 64;
|
||||
|
||||
// copy cached midstate
|
||||
memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
|
||||
// copy cached midstate
|
||||
memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
|
||||
blakecoin( &ctx, ending, 16 );
|
||||
blakecoin_close( &ctx, hash );
|
||||
memcpy( state, hash, 32 );
|
||||
@@ -45,8 +44,8 @@ void blakecoinhash( void *state, const void *input )
|
||||
int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
uint32_t HTarget = ptarget[7];
|
||||
int thr_id = mythr->id; // thr_id arg is deprecated
|
||||
@@ -60,10 +59,10 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
|
||||
HTarget = 0x7f;
|
||||
|
||||
// we need big endian data...
|
||||
for (int kk=0; kk < 19; kk++)
|
||||
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
|
||||
for (int kk=0; kk < 19; kk++)
|
||||
be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
|
||||
|
||||
blake_midstate_init( endiandata );
|
||||
blake_midstate_init( endiandata );
|
||||
|
||||
#ifdef DEBUG_ALGO
|
||||
applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
|
||||
|
@@ -48,7 +48,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
|
||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||
allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
|
||||
@@ -217,7 +217,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
// Prehash first block.
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
// Interleave hash for second block prehash.
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
@@ -286,7 +286,7 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
|
||||
uint64_t *hash7 = (uint64_t*)hash+28;
|
||||
allium_8way_ctx_holder ctx __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
|
||||
blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
vhashA, 256 );
|
||||
@@ -401,7 +401,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
|
@@ -35,7 +35,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
|
||||
uint32_t hash14[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash15[8] __attribute__ ((aligned (32)));
|
||||
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
|
||||
hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
|
||||
@@ -108,7 +108,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
|
||||
if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
block0_hash[0] = _mm512_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm512_set1_epi32( phash[1] );
|
||||
@@ -170,7 +170,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
|
||||
uint32_t hash7[8] __attribute__ ((aligned (32)));
|
||||
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
|
||||
|
||||
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
|
||||
blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );
|
||||
|
||||
dintrlv_8x32( hash0, hash1, hash2, hash3,
|
||||
hash4, hash5, hash6, hash7, vhash, 256 );
|
||||
@@ -216,7 +216,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
|
||||
// Prehash first block
|
||||
blake256_transform_le( phash, pdata, 512, 0 );
|
||||
blake256_transform_le( phash, pdata, 512, 0, 14 );
|
||||
|
||||
block0_hash[0] = _mm256_set1_epi32( phash[0] );
|
||||
block0_hash[1] = _mm256_set1_epi32( phash[1] );
|
||||
|
@@ -67,7 +67,7 @@ void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
|
||||
void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
|
||||
int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
|
||||
const __m128i *state_in );
|
||||
const __m128i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
@@ -95,7 +95,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
|
||||
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
|
||||
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
|
||||
const __m256i *state_in );
|
||||
const __m256i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX2
|
||||
|
||||
@@ -123,7 +123,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in, const __m512i *state_mid, const __m512i *X );
|
||||
|
||||
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
|
||||
const __m512i *state_in );
|
||||
const __m512i *state_in, const uint32_t *target );
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
@@ -658,43 +658,14 @@ int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t _ALIGN(128) hash[8];
|
||||
uint32_t _ALIGN(64) data[20];
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
uint32_t n = pdata[19] - 1;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
int thr_id = mythr->id;
|
||||
|
||||
memcpy( data, pdata, 80 );
|
||||
|
||||
do {
|
||||
data[19] = ++n;
|
||||
sha256d( (unsigned char*)hash, (const unsigned char*)data, 80 );
|
||||
if ( unlikely( swab32( hash[7] ) <= Htarg ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
sha256d_80_swap(hash, pdata);
|
||||
if ( fulltest( hash, ptarget ) && !opt_benchmark )
|
||||
submit_solution( work, hash, mythr );
|
||||
}
|
||||
} while ( likely( n < max_nonce && !work_restart[thr_id].restart ) );
|
||||
*hashes_done = n - first_nonce + 1;
|
||||
pdata[19] = n;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
#elif defined(SHA256D_SHA)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256d_sha;
|
||||
//#elif defined(SHA256D_8WAY)
|
||||
// gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||
#else
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -50,65 +50,6 @@ void sha256_update( sha256_context *ctx, const void *data, size_t len )
|
||||
memcpy( ctx->buf, src, len );
|
||||
}
|
||||
|
||||
#if 0
|
||||
void sha256_final( sha256_context *ctx, uint32_t *hash )
|
||||
{
|
||||
size_t r;
|
||||
|
||||
|
||||
/* Figure out how many bytes we have buffered. */
|
||||
r = ctx->count & 0x3f;
|
||||
// r = ( ctx->count >> 3 ) & 0x3f;
|
||||
|
||||
//printf("final: count= %d, r= %d\n", ctx->count, r );
|
||||
|
||||
/* Pad to 56 mod 64, transforming if we finish a block en route. */
|
||||
if ( r < 56 )
|
||||
{
|
||||
/* Pad to 56 mod 64. */
|
||||
memcpy( &ctx->buf[r], SHA256_PAD, 56 - r );
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Finish the current block and mix. */
|
||||
memcpy( &ctx->buf[r], SHA256_PAD, 64 - r );
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
|
||||
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
|
||||
|
||||
/* The start of the final block is all zeroes. */
|
||||
memset( &ctx->buf[0], 0, 56 );
|
||||
}
|
||||
|
||||
/* Add the terminating bit-count. */
|
||||
ctx->buf[56] = bswap_64( ctx->count << 3 );
|
||||
// ctx->buf[56] = bswap_64( ctx->count );
|
||||
// be64enc( &ctx->buf[56], ctx->count );
|
||||
|
||||
/* Mix in the final block. */
|
||||
sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
|
||||
|
||||
// SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
|
||||
|
||||
for ( int i = 0; i < 8; i++ ) hash[i] = bswap_32( ctx->state[i] );
|
||||
|
||||
// for ( int i = 0; i < 8; i++ ) be32enc( hash + 4*i, ctx->state + i );
|
||||
|
||||
/*
|
||||
// be32enc_vect(digest, ctx->state, 4);
|
||||
// be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
|
||||
// Encode vector, two words at a time.
|
||||
do {
|
||||
be32enc(&dst[0], src[0]);
|
||||
be32enc(&dst[4], src[1]);
|
||||
src += 2;
|
||||
dst += 8;
|
||||
} while (--len);
|
||||
*/
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
void sha256_final( sha256_context *ctx, void *hash )
|
||||
{
|
||||
int ptr = ctx->count & 0x3f;
|
||||
|
@@ -3,10 +3,194 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
|
||||
#if defined(SHA256D_SHA)
|
||||
|
||||
// Scan nonces for double SHA-256 over an 80-byte header, two nonces (n and
// n+1) per iteration via sha256_ni2way_transform_le.
//
// work        - job; work->data is read as the header words and
//               work->data[19] is used as the nonce slot (read on entry,
//               written on submit and on exit); work->target is the target.
// max_nonce   - upper bound of the nonce range; the loop stops once
//               n >= max_nonce - 2.
// hashes_done - out: number of nonces covered by this call.
// mythr       - calling thread; mythr->id indexes work_restart[].
//
// Always returns 0. Solutions are reported through submit_solution().
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
// One 16-word message block and one 8-word hash per lane.
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
// State after the first 64 bytes of data; shared by both lanes.
uint32_t mstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
// NOTE(review): unsigned subtraction, wraps if max_nonce < 2 -- presumably
// callers never pass such a small range; confirm.
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
// Byte-shuffle mask that reverses the bytes within each 32-bit word.
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform_le( mstate, pdata, sha256_iv );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
// The blocks are rebuilt on every iteration because step 2 below reuses
// block0/block1 for the second hash.
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
// Word 3 of the second block is the nonce: lane 0 tests n, lane 1 n+1.
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
// Zero words 5..14 (40 bytes).
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 80*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
mstate, mstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
// Zero words 9..14 (24 bytes).
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 32*8; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
sha256_iv, sha256_iv );
|
||||
|
||||
// Lane 0: cheap filter on byte-swapped word 7 before the full test.
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
// Byte-swap the hash in place, 128 bits at a time, before the full
// comparison against the target.
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
// Nothing is submitted when running in benchmark mode.
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
}
|
||||
}
|
||||
|
||||
// Lane 1: same check for nonce n+1.
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
// Store the next untested nonce back into the work data.
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_16WAY)
|
||||
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i hash32[8] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i buf[16] __attribute__ ((aligned (64)));
|
||||
__m512i mstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
// prehash first block directly from pdata
|
||||
sha256_transform_le( phash, pdata, sha256_iv );
|
||||
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd & 3rd sha256
|
||||
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
|
||||
|
||||
// initialize padding for 2nd sha256
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
||||
|
||||
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
|
||||
{
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
|
||||
{
|
||||
extr_lane_16x32( phash, hash32, lane, 256 );
|
||||
casti_m256i( phash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, phash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
buf[3] = _mm512_add_epi32( buf[3], sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
@@ -67,20 +251,18 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
if ( sha256_16way_transform_le_short( hash32, block, initstate ) )
|
||||
{
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
@@ -90,6 +272,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
@@ -104,7 +287,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -154,21 +337,18 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
if ( unlikely(
|
||||
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
@@ -191,8 +371,6 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -232,31 +410,25 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
sha256_4way_transform_le( block, vdata+16, initstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
if ( unlikely(
|
||||
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
@@ -268,21 +440,3 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
bool register_sha256d_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256D_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_16way;
|
||||
#elif defined(SHA256D_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_8way;
|
||||
#elif defined(SHA256D_4WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256d_4way;
|
||||
#endif
|
||||
|
||||
// gate->hash = (void*)&sha256d;
|
||||
return true;
|
||||
};
|
||||
*/
|
||||
|
||||
|
@@ -6,6 +6,8 @@
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256D_16WAY 1
|
||||
#elif defined(__SHA__)
|
||||
#define SHA256D_SHA 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA256D_8WAY 1
|
||||
#else
|
||||
@@ -32,15 +34,12 @@ int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
#endif
|
||||
|
||||
#if defined(SHA256D_SHA)
|
||||
|
||||
/*
|
||||
#if defined(__SHA__)
|
||||
|
||||
int scanhash_sha256d( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
*/
|
||||
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr );
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -3,99 +3,201 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
|
||||
#define SHA256DT_16WAY 1
|
||||
#elif defined(__SHA__)
|
||||
#define SHA256DT_SHA 1
|
||||
#elif defined(__AVX2__)
|
||||
#define SHA256DT_8WAY 1
|
||||
#else
|
||||
#define SHA256DT_4WAY 1
|
||||
#endif
|
||||
|
||||
// Initial hash state used by sha256dt in place of the standard SHA-256 IV.
// It seeds the first-block prehash and the second hash in every sha256dt
// scanhash variant in this file.
static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
{
   0xdfa9bf2c, 0xb72074d4, 0x6bb01122, 0xd338e869,
   0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
};
|
||||
|
||||
#if defined(SHA256DT_16WAY)
|
||||
|
||||
int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
__m512i buf[16] __attribute__ ((aligned (64)));
|
||||
__m512i mstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
// uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
// const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm512_set1_epi32( pdata[i] );
|
||||
// prehash first block directly from pdata
|
||||
sha256_transform_le( phash, pdata, sha256dt_iv );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm512_set1_epi32( 0x480 );
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt funky bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd sha256
|
||||
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
|
||||
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
|
||||
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
|
||||
istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
|
||||
istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
|
||||
istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
|
||||
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
|
||||
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
|
||||
|
||||
// initialize padding for 2nd sha256
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 0x300 );
|
||||
|
||||
initstate[0] = _mm512_set1_epi64( 0xdfa9bf2cdfa9bf2c );
|
||||
initstate[1] = _mm512_set1_epi64( 0xb72074d4b72074d4 );
|
||||
initstate[2] = _mm512_set1_epi64( 0x6bb011226bb01122 );
|
||||
initstate[3] = _mm512_set1_epi64( 0xd338e869d338e869 );
|
||||
initstate[4] = _mm512_set1_epi64( 0xaa3ff126aa3ff126 );
|
||||
initstate[5] = _mm512_set1_epi64( 0x475bbf30475bbf30 );
|
||||
initstate[6] = _mm512_set1_epi64( 0x8fd52e5b8fd52e5b );
|
||||
initstate[7] = _mm512_set1_epi64( 0x9f75c9ad9f75c9ad );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
memset_zero_512( block+9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
|
||||
|
||||
do
|
||||
{
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
sha256_16way_transform_le( hash32, block, initstate );
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
// finish second block with nonces
|
||||
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
||||
if ( unlikely( sha256_16way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
// if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_16x32( phash, hash32, lane, 256 );
|
||||
casti_m256i( phash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, phash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
buf[3] = _mm512_add_epi32( buf[3], sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256DT_SHA)
|
||||
|
||||
int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
uint32_t block0[16] __attribute__ ((aligned (64)));
|
||||
uint32_t block1[16] __attribute__ ((aligned (64)));
|
||||
uint32_t hash0[8] __attribute__ ((aligned (32)));
|
||||
uint32_t hash1[8] __attribute__ ((aligned (32)));
|
||||
uint32_t mstate[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i shuf_bswap32 =
|
||||
_mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
|
||||
|
||||
#endif
|
||||
// hash first 64 bytes of data
|
||||
sha256_opt_transform_le( mstate, pdata, sha256dt_iv );
|
||||
|
||||
#if defined(SHA256DT_8WAY)
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
memcpy( block0, pdata + 16, 16 );
|
||||
memcpy( block1, pdata + 16, 16 );
|
||||
block0[ 3] = n;
|
||||
block1[ 3] = n+1;
|
||||
block0[ 4] = block1[ 4] = 0x80000000;
|
||||
memset( block0 + 5, 0, 40 );
|
||||
memset( block1 + 5, 0, 40 );
|
||||
block0[15] = block1[15] = 0x480; // funky bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
mstate, mstate );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
memcpy( block0, hash0, 32 );
|
||||
memcpy( block1, hash1, 32 );
|
||||
block0[ 8] = block1[ 8] = 0x80000000;
|
||||
memset( block0 + 9, 0, 24 );
|
||||
memset( block1 + 9, 0, 24 );
|
||||
block0[15] = block1[15] = 0x300; // bit count
|
||||
sha256_ni2way_transform_le( hash0, hash1, block0, block1,
|
||||
sha256dt_iv, sha256dt_iv );
|
||||
|
||||
if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash0, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash0, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n;
|
||||
submit_solution( work, hash0, mythr );
|
||||
}
|
||||
}
|
||||
if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
|
||||
{
|
||||
casti_m128i( hash1, 0 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
|
||||
casti_m128i( hash1, 1 ) =
|
||||
_mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
|
||||
if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n+1;
|
||||
submit_solution( work, hash1, mythr );
|
||||
}
|
||||
}
|
||||
n += 2;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#elif defined(SHA256DT_8WAY)
|
||||
|
||||
int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -103,15 +205,13 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
__m256i istate[8] __attribute__ ((aligned (32)));
|
||||
__m256i mstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i mstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
@@ -120,6 +220,8 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
@@ -135,35 +237,38 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
block[15] = _mm256_set1_epi32( 0x300 );
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
initstate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
initstate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
|
||||
initstate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
|
||||
initstate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
|
||||
initstate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
|
||||
initstate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
|
||||
initstate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
|
||||
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
|
||||
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
|
||||
istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
|
||||
istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
|
||||
istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
|
||||
istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
|
||||
istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
|
||||
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
|
||||
mexp_pre );
|
||||
sha256_8way_transform_le( hash32, block, initstate );
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
if ( unlikely( sha256_8way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
casti_m256i( lane_hash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm256_add_epi32( *noncev, eight );
|
||||
@@ -174,10 +279,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SHA256DT_4WAY)
|
||||
#elif defined(SHA256DT_4WAY)
|
||||
|
||||
int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
@@ -230,21 +332,25 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
|
||||
do
|
||||
{
|
||||
sha256_4way_transform_le( block, vdata+16, midstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
|
||||
// {
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
// }
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
@@ -257,11 +363,14 @@ bool register_sha256dt_algo( algo_gate_t* gate )
|
||||
{
|
||||
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
|
||||
#if defined(SHA256DT_16WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_16way;
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_16way;
|
||||
#elif defined(SHA256DT_SHA)
|
||||
gate->optimizations = SHA_OPT;
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_sha;
|
||||
#elif defined(SHA256DT_8WAY)
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_8way;
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_8way;
|
||||
#else
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_4way;
|
||||
gate->scanhash = (void*)&scanhash_sha256dt_4way;
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
@@ -3,6 +3,7 @@
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include "sha256-hash.h"
|
||||
#include "sha-hash-4way.h"
|
||||
|
||||
#if defined(SHA256T_16WAY)
|
||||
@@ -10,83 +11,96 @@
|
||||
int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m512i vdata[32] __attribute__ ((aligned (128)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (128)));
|
||||
__m512i block[16] __attribute__ ((aligned (64)));
|
||||
__m512i hash32[8] __attribute__ ((aligned (64)));
|
||||
__m512i initstate[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i midstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[16] __attribute__ ((aligned (64)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (64)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
__m512i buf[16] __attribute__ ((aligned (64)));
|
||||
__m512i mstate1[8] __attribute__ ((aligned (64)));
|
||||
__m512i mstate2[8] __attribute__ ((aligned (64)));
|
||||
__m512i istate[8] __attribute__ ((aligned (64)));
|
||||
__m512i mexp_pre[8] __attribute__ ((aligned (64)));
|
||||
uint32_t phash[8] __attribute__ ((aligned (32)));
|
||||
static const uint32_t IV[8] __attribute__ ((aligned (32))) =
|
||||
{
|
||||
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
||||
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
||||
};
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
uint32_t *ptarget = work->target;
|
||||
uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 16;
|
||||
uint32_t n = first_nonce;
|
||||
__m512i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const __m512i sixteen = _mm512_set1_epi32( 16 );
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm512_set1_epi32( pdata[i] );
|
||||
// prehash first block directly from pdata
|
||||
sha256_transform_le( phash, pdata, IV );
|
||||
|
||||
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
|
||||
// vectorize block 0 hash for second block
|
||||
mstate1[0] = _mm512_set1_epi32( phash[0] );
|
||||
mstate1[1] = _mm512_set1_epi32( phash[1] );
|
||||
mstate1[2] = _mm512_set1_epi32( phash[2] );
|
||||
mstate1[3] = _mm512_set1_epi32( phash[3] );
|
||||
mstate1[4] = _mm512_set1_epi32( phash[4] );
|
||||
mstate1[5] = _mm512_set1_epi32( phash[5] );
|
||||
mstate1[6] = _mm512_set1_epi32( phash[6] );
|
||||
mstate1[7] = _mm512_set1_epi32( phash[7] );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_512( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
// second message block data, with nonce & padding
|
||||
buf[0] = _mm512_set1_epi32( pdata[16] );
|
||||
buf[1] = _mm512_set1_epi32( pdata[17] );
|
||||
buf[2] = _mm512_set1_epi32( pdata[18] );
|
||||
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
|
||||
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
|
||||
buf[4] = last_byte;
|
||||
memset_zero_512( buf+5, 10 );
|
||||
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
// partially pre-expand & prehash second message block, avoiding the nonces
|
||||
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
|
||||
|
||||
// vectorize IV for 2nd & 3rd sha256
|
||||
istate[0] = _mm512_set1_epi32( IV[0] );
|
||||
istate[1] = _mm512_set1_epi32( IV[1] );
|
||||
istate[2] = _mm512_set1_epi32( IV[2] );
|
||||
istate[3] = _mm512_set1_epi32( IV[3] );
|
||||
istate[4] = _mm512_set1_epi32( IV[4] );
|
||||
istate[5] = _mm512_set1_epi32( IV[5] );
|
||||
istate[6] = _mm512_set1_epi32( IV[6] );
|
||||
istate[7] = _mm512_set1_epi32( IV[7] );
|
||||
|
||||
// initialize padding for 2nd & 3rd sha256
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_512( block + 9, 6 );
|
||||
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
|
||||
initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_16way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, pre-padded
|
||||
sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_16way_transform_le( block, block, initstate );
|
||||
sha256_16way_transform_le( block, block, istate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
if ( unlikely(
|
||||
sha256_16way_transform_le_short( hash32, block, initstate ) ) )
|
||||
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
|
||||
{
|
||||
// byte swap final hash for testing
|
||||
mm512_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 16; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
|
||||
{
|
||||
extr_lane_16x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
extr_lane_16x32( phash, hash32, lane, 256 );
|
||||
casti_m256i( phash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
submit_solution( work, phash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm512_add_epi32( *noncev, sixteen );
|
||||
buf[3] = _mm512_add_epi32( buf[3], sixteen );
|
||||
n += 16;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
@@ -94,26 +108,23 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(SHA256T_8WAY)
|
||||
|
||||
|
||||
int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m256i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m256i block[16] __attribute__ ((aligned (32)));
|
||||
__m256i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m256i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
__m256i istate[8] __attribute__ ((aligned (32)));
|
||||
__m256i mstate1[8] __attribute__ ((aligned (32)));
|
||||
__m256i mstate2[8] __attribute__ ((aligned (32)));
|
||||
__m256i mexp_pre[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 8;
|
||||
uint32_t n = first_nonce;
|
||||
@@ -122,6 +133,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
const bool bench = opt_benchmark;
|
||||
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
|
||||
const __m256i eight = _mm256_set1_epi32( 8 );
|
||||
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
|
||||
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm256_set1_epi32( pdata[i] );
|
||||
@@ -135,42 +148,40 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_256( block + 9, 6 );
|
||||
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_8way_transform_le( midstate1, vdata, initstate );
|
||||
// initialize state
|
||||
istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
|
||||
istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
|
||||
istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
sha256_8way_transform_le( mstate1, vdata, istate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_8way_transform_le( block, block, initstate );
|
||||
sha256_8way_transform_le( block, block, istate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
if ( unlikely(
|
||||
sha256_8way_transform_le_short( hash32, block, initstate ) ) )
|
||||
if ( unlikely( sha256_8way_transform_le_short(
|
||||
hash32, block, istate, ptarget ) ) )
|
||||
{
|
||||
// byte swap final hash for testing
|
||||
mm256_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 8; lane++ )
|
||||
if ( hash32_d7[ lane ] <= targ32_d7 )
|
||||
{
|
||||
extr_lane_8x32( lane_hash, hash32, lane, 256 );
|
||||
casti_m256i( lane_hash, 0 ) =
|
||||
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
@@ -188,109 +199,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(SHA256T_4WAY)
|
||||
|
||||
// Optimizations are slower with AVX/SSE2
|
||||
// https://github.com/JayDDee/cpuminer-opt/issues/344
|
||||
/*
|
||||
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate1[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate2[8] __attribute__ ((aligned (32)));
|
||||
__m128i mexp_pre[16] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
const uint32_t *ptarget = work->target;
|
||||
const uint32_t targ32_d7 = ptarget[7];
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 4;
|
||||
uint32_t n = first_nonce;
|
||||
__m128i *noncev = vdata + 19;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
|
||||
const __m128i four = _mm_set1_epi32( 4 );
|
||||
|
||||
for ( int i = 0; i < 19; i++ )
|
||||
vdata[i] = _mm_set1_epi32( pdata[i] );
|
||||
|
||||
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
|
||||
|
||||
vdata[16+4] = last_byte;
|
||||
memset_zero_128( vdata+16 + 5, 10 );
|
||||
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
|
||||
|
||||
block[ 8] = last_byte;
|
||||
memset_zero_128( block + 9, 6 );
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate1, vdata, initstate );
|
||||
|
||||
// Do 3 rounds on the first 12 bytes of the next block
|
||||
sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
// 1. final 16 bytes of data, with padding
|
||||
sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
|
||||
mexp_pre );
|
||||
|
||||
// 2. 32 byte hash from 1.
|
||||
sha256_4way_transform_le( block, block, initstate );
|
||||
|
||||
// 3. 32 byte hash from 2.
|
||||
if ( unlikely(
|
||||
sha256_4way_transform_le_short( hash32, block, initstate ) ) )
|
||||
{
|
||||
// byte swap final hash for testing
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
pdata[19] = n;
|
||||
*hashes_done = n - first_nonce;
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
uint64_t *hashes_done, struct thr_info *mythr )
|
||||
{
|
||||
__m128i vdata[32] __attribute__ ((aligned (64)));
|
||||
__m128i block[16] __attribute__ ((aligned (32)));
|
||||
__m128i hash32[8] __attribute__ ((aligned (32)));
|
||||
__m128i initstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i midstate[8] __attribute__ ((aligned (32)));
|
||||
__m128i istate[8] __attribute__ ((aligned (32)));
|
||||
__m128i mstate[8] __attribute__ ((aligned (32)));
|
||||
// __m128i mstate2[8] __attribute__ ((aligned (32)));
|
||||
// __m128i mexp_pre[8] __attribute__ ((aligned (32)));
|
||||
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
|
||||
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
|
||||
uint32_t *pdata = work->data;
|
||||
@@ -319,35 +239,44 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
block[15] = _mm_set1_epi32( 32*8 ); // bit count
|
||||
|
||||
// initialize state
|
||||
initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
|
||||
istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
|
||||
istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
|
||||
istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
|
||||
istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
|
||||
istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
|
||||
istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
|
||||
istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
|
||||
|
||||
// hash first 64 bytes of data
|
||||
sha256_4way_transform_le( midstate, vdata, initstate );
|
||||
sha256_4way_transform_le( mstate, vdata, istate );
|
||||
|
||||
// sha256_4way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
|
||||
|
||||
do
|
||||
{
|
||||
sha256_4way_transform_le( block, vdata+16, midstate );
|
||||
sha256_4way_transform_le( block, block, initstate );
|
||||
sha256_4way_transform_le( hash32, block, initstate );
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
// sha256_4way_final_rounds( block, vdata+16, mstate1, mstate2,
|
||||
// mexp_pre );
|
||||
|
||||
sha256_4way_transform_le( block, vdata+16, mstate );
|
||||
sha256_4way_transform_le( block, block, istate );
|
||||
sha256_4way_transform_le( hash32, block, istate );
|
||||
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
// if ( unlikely( sha256_4way_transform_le_short(
|
||||
// hash32, block, initstate, ptarget ) ))
|
||||
// {
|
||||
mm128_block_bswap_32( hash32, hash32 );
|
||||
for ( int lane = 0; lane < 4; lane++ )
|
||||
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
extr_lane_4x32( lane_hash, hash32, lane, 256 );
|
||||
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
|
||||
{
|
||||
pdata[19] = n + lane;
|
||||
submit_solution( work, lane_hash, mythr );
|
||||
}
|
||||
}
|
||||
}
|
||||
// }
|
||||
*noncev = _mm_add_epi32( *noncev, four );
|
||||
n += 4;
|
||||
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
|
||||
@@ -356,6 +285,5 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
@@ -23,7 +23,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
const uint32_t last_nonce = max_nonce - 1;
|
||||
const uint32_t last_nonce = max_nonce - 2;
|
||||
uint32_t n = first_nonce;
|
||||
const int thr_id = mythr->id;
|
||||
const bool bench = opt_benchmark;
|
||||
|
@@ -39,9 +39,9 @@
|
||||
#define SPH_SMALL_FOOTPRINT_SHA2 1
|
||||
#endif
|
||||
|
||||
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
|
||||
#define CH(X, Y, Z) ( ( ( (Y) ^ (Z) ) & (X)) ^ (Z) )
|
||||
//#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
|
||||
#define MAJ( X, Y, Z ) ( Y ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
|
||||
#define MAJ( X, Y, Z ) ( (Y) ^ ( ( (X_xor_Y) = (X) ^ (Y) ) & (Y_xor_Z) ) )
|
||||
#define ROTR SPH_ROTR32
|
||||
|
||||
#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
|
||||
|
20
configure
vendored
20
configure
vendored
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.0.
|
||||
# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.1.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
|
||||
@@ -608,8 +608,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.23.0'
|
||||
PACKAGE_STRING='cpuminer-opt 3.23.0'
|
||||
PACKAGE_VERSION='3.23.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.23.1'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.23.0 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.23.1 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1432,7 +1432,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.23.0:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.23.1:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1538,7 +1538,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.23.0
|
||||
cpuminer-opt configure 3.23.1
|
||||
generated by GNU Autoconf 2.71
|
||||
|
||||
Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.23.0, which was
|
||||
It was created by cpuminer-opt $as_me 3.23.1, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
$ $0$ac_configure_args_raw
|
||||
@@ -3593,7 +3593,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.23.0'
|
||||
VERSION='3.23.1'
|
||||
|
||||
|
||||
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
|
||||
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.23.0, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.23.1, which was
|
||||
generated by GNU Autoconf 2.71. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config='$ac_cs_config_escaped'
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.23.0
|
||||
cpuminer-opt config.status 3.23.1
|
||||
configured by $0, generated by GNU Autoconf 2.71,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
AC_INIT([cpuminer-opt], [3.23.0])
|
||||
AC_INIT([cpuminer-opt], [3.23.1])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
20
configure~
20
configure~
@@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.0.
|
||||
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.23.1.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
|
||||
@@ -577,8 +577,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='cpuminer-opt'
|
||||
PACKAGE_TARNAME='cpuminer-opt'
|
||||
PACKAGE_VERSION='3.23.0'
|
||||
PACKAGE_STRING='cpuminer-opt 3.23.0'
|
||||
PACKAGE_VERSION='3.23.1'
|
||||
PACKAGE_STRING='cpuminer-opt 3.23.1'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures cpuminer-opt 3.23.0 to adapt to many kinds of systems.
|
||||
\`configure' configures cpuminer-opt 3.23.1 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@@ -1404,7 +1404,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.23.0:";;
|
||||
short | recursive ) echo "Configuration of cpuminer-opt 3.23.1:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@@ -1509,7 +1509,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
cpuminer-opt configure 3.23.0
|
||||
cpuminer-opt configure 3.23.1
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by cpuminer-opt $as_me 3.23.0, which was
|
||||
It was created by cpuminer-opt $as_me 3.23.1, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@@ -2993,7 +2993,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='cpuminer-opt'
|
||||
VERSION='3.23.0'
|
||||
VERSION='3.23.1'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by cpuminer-opt $as_me 3.23.0, which was
|
||||
This file was extended by cpuminer-opt $as_me 3.23.1, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@@ -6784,7 +6784,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
cpuminer-opt config.status 3.23.0
|
||||
cpuminer-opt config.status 3.23.1
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
17
cpu-miner.c
17
cpu-miner.c
@@ -1966,7 +1966,7 @@ static bool wanna_mine(int thr_id)
|
||||
|
||||
// Common target functions, default usually listed first.
|
||||
|
||||
// default
|
||||
// default, double sha256 for root hash
|
||||
void sha256d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
sha256d( merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size );
|
||||
@@ -1976,6 +1976,17 @@ void sha256d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
||||
sha256d( merkle_root, merkle_root, 64 );
|
||||
}
|
||||
}
|
||||
// single sha256 root hash
|
||||
void sha256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
sha256_full( merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size );
|
||||
for ( int i = 0; i < sctx->job.merkle_count; i++ )
|
||||
{
|
||||
memcpy( merkle_root + 32, sctx->job.merkle[i], 32 );
|
||||
sha256d( merkle_root, merkle_root, 64 );
|
||||
}
|
||||
}
|
||||
// OpenSSL single sha256, deprecated
|
||||
void SHA256_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
|
||||
{
|
||||
SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
|
||||
@@ -2073,7 +2084,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
applog( LOG_BLUE, "New Work: Block %d, Tx %d, Netdiff %.5g, Job %s",
|
||||
sctx->block_height, sctx->job.merkle_count,
|
||||
net_diff, g_work->job_id );
|
||||
else if ( !opt_quiet )
|
||||
else if ( opt_debug )
|
||||
{
|
||||
unsigned char *xnonce2str = bebin2hex( g_work->xnonce2,
|
||||
g_work->xnonce2_len );
|
||||
@@ -2095,7 +2106,7 @@ static void stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work )
|
||||
lowest_share = 9e99;
|
||||
}
|
||||
|
||||
if ( !opt_quiet )
|
||||
if ( new_job && !opt_quiet )
|
||||
{
|
||||
applog2( LOG_INFO, "Diff: Net %.5g, Stratum %.5g, Target %.5g",
|
||||
net_diff, stratum_diff, g_work->targetdiff );
|
||||
|
@@ -1216,13 +1216,13 @@ static inline void dintrlv_16x32_512( void *dst00, void *dst01, void *dst02,
|
||||
static inline void extr_lane_16x32( void *d, const void *s,
|
||||
const int lane, const int bit_len )
|
||||
{
|
||||
((uint32_t*)d)[ 0] = ((const uint32_t*)s)[ lane ];
|
||||
((uint32_t*)d)[ 1] = ((const uint32_t*)s)[ lane+16 ];
|
||||
((uint32_t*)d)[ 2] = ((const uint32_t*)s)[ lane+32 ];
|
||||
((uint32_t*)d)[ 3] = ((const uint32_t*)s)[ lane+48 ];
|
||||
((uint32_t*)d)[ 4] = ((const uint32_t*)s)[ lane+64 ];
|
||||
((uint32_t*)d)[ 5] = ((const uint32_t*)s)[ lane+80 ];
|
||||
((uint32_t*)d)[ 6] = ((const uint32_t*)s)[ lane+96 ];
|
||||
((uint32_t*)d)[ 0] = ((const uint32_t*)s)[ lane ];
|
||||
((uint32_t*)d)[ 1] = ((const uint32_t*)s)[ lane+ 16 ];
|
||||
((uint32_t*)d)[ 2] = ((const uint32_t*)s)[ lane+ 32 ];
|
||||
((uint32_t*)d)[ 3] = ((const uint32_t*)s)[ lane+ 48 ];
|
||||
((uint32_t*)d)[ 4] = ((const uint32_t*)s)[ lane+ 64 ];
|
||||
((uint32_t*)d)[ 5] = ((const uint32_t*)s)[ lane+ 80 ];
|
||||
((uint32_t*)d)[ 6] = ((const uint32_t*)s)[ lane+ 96 ];
|
||||
((uint32_t*)d)[ 7] = ((const uint32_t*)s)[ lane+112 ];
|
||||
if ( bit_len <= 256 ) return;
|
||||
((uint32_t*)d)[ 8] = ((const uint32_t*)s)[ lane+128 ];
|
||||
|
@@ -274,11 +274,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
|
||||
// Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements.
|
||||
// Effectively a sign test.
|
||||
|
||||
#define mm_movmask_64( v ) \
|
||||
_mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )
|
||||
#define mm128_movmask_64( v ) \
|
||||
_mm_movemask_pd( (__m128d)(v) )
|
||||
|
||||
#define mm_movmask_32( v ) \
|
||||
_mm_castps_si128( _mm_movmask_ps( _mm_castsi128_ps( v ) ) )
|
||||
#define mm128_movmask_32( v ) \
|
||||
_mm_movemask_ps( (__m128)(v) )
|
||||
|
||||
//
|
||||
// Bit rotations
|
||||
|
@@ -209,10 +209,10 @@ static inline __m256i mm256_not( const __m256i v )
|
||||
// Effectively a sign test.
|
||||
|
||||
#define mm256_movmask_64( v ) \
|
||||
_mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
|
||||
_mm256_movemask_pd( _mm256_castsi256_pd( v ) )
|
||||
|
||||
#define mm256_movmask_32( v ) \
|
||||
_mm256_castps_si256( _mm256_movmask_ps( _mm256_castsi256_ps( v ) ) )
|
||||
_mm256_movemask_ps( _mm256_castsi256_ps( v ) )
|
||||
|
||||
//
|
||||
// Bit rotations.
|
||||
|
Reference in New Issue
Block a user