Mirror of https://github.com/JayDDee/cpuminer-opt.git (synced 2025-09-17 23:44:27 +00:00)
Commit: v25.4
@@ -31,7 +31,7 @@
 #include "hmac-sha256-hash-4way.h"
 #include "compat.h"
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)
 // HMAC 4-way SSE2
 
 /**
@@ -62,30 +62,30 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
 /* If Klen > 64, the key is really SHA256(K). */
 if ( Klen > 64 )
 {
-sha256_4way_init( &ctx->ictx );
-sha256_4way_update( &ctx->ictx, K, Klen );
-sha256_4way_close( &ctx->ictx, khash );
+sha256_4x32_init( &ctx->ictx );
+sha256_4x32_update( &ctx->ictx, K, Klen );
+sha256_4x32_close( &ctx->ictx, khash );
 K = khash;
 Klen = 32;
 }
 
 /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-sha256_4way_init( &ctx->ictx );
+sha256_4x32_init( &ctx->ictx );
 memset( pad, 0x36, 64*4 );
 
 for ( i = 0; i < Klen; i++ )
-casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
-casti_v128u32( K, i ) );
+casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
+casti_v128u32( K, i ) );
 
-sha256_4way_update( &ctx->ictx, pad, 64 );
+sha256_4x32_update( &ctx->ictx, pad, 64 );
 
 /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-sha256_4way_init( &ctx->octx );
+sha256_4x32_init( &ctx->octx );
 memset( pad, 0x5c, 64*4 );
 for ( i = 0; i < Klen/4; i++ )
-casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
-casti_v128u32( K, i ) );
-sha256_4way_update( &ctx->octx, pad, 64 );
+casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
+casti_v128u32( K, i ) );
+sha256_4x32_update( &ctx->octx, pad, 64 );
 }
 
 /* Add bytes to the HMAC-SHA256 operation. */
@@ -94,7 +94,7 @@ hmac_sha256_4way_update( hmac_sha256_4way_context *ctx, const void *in,
 size_t len )
 {
 /* Feed data to the inner SHA256 operation. */
-sha256_4way_update( &ctx->ictx, in, len );
+sha256_4x32_update( &ctx->ictx, in, len );
 }
 
 /* Finish an HMAC-SHA256 operation. */
@@ -104,13 +104,13 @@ hmac_sha256_4way_close( hmac_sha256_4way_context *ctx, void *digest )
 unsigned char ihash[32*4] __attribute__ ((aligned (64)));
 
 /* Finish the inner SHA256 operation. */
-sha256_4way_close( &ctx->ictx, ihash );
+sha256_4x32_close( &ctx->ictx, ihash );
 
 /* Feed the inner hash to the outer SHA256 operation. */
-sha256_4way_update( &ctx->octx, ihash, 32 );
+sha256_4x32_update( &ctx->octx, ihash, 32 );
 
 /* Finish the outer SHA256 operation. */
-sha256_4way_close( &ctx->octx, digest );
+sha256_4x32_close( &ctx->octx, digest );
 }
 
 /**
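The 4-way, 8-way and 16-way variants above all follow the standard HMAC construction: hash the key XORed with an inner pad of 0x36 bytes, then hash that inner digest under the key XORed with an outer pad of 0x5c bytes. A minimal scalar sketch of what each SIMD lane computes; sha256_ctx, sha256_init/update/close and sha256_full are hypothetical single-lane stand-ins, not functions from this commit:

    /* HMAC-SHA256(K, msg) = SHA256( (K^opad) || SHA256( (K^ipad) || msg ) ) */
    void hmac_sha256_sketch( uint8_t *mac, const uint8_t *K, size_t Klen,
                             const uint8_t *msg, size_t mlen )
    {
       uint8_t kh[32], pad[64], ih[32];
       sha256_ctx ictx, octx;                    /* hypothetical scalar context */
       if ( Klen > 64 )                          /* long keys are reduced to SHA256(K) */
       {  sha256_full( kh, K, Klen );  K = kh;  Klen = 32;  }
       memset( pad, 0x36, 64 );                  /* inner pad */
       for ( size_t i = 0; i < Klen; i++ ) pad[i] ^= K[i];
       sha256_init( &ictx );  sha256_update( &ictx, pad, 64 );
       memset( pad, 0x5c, 64 );                  /* outer pad */
       for ( size_t i = 0; i < Klen; i++ ) pad[i] ^= K[i];
       sha256_init( &octx );  sha256_update( &octx, pad, 64 );
       sha256_update( &ictx, msg, mlen );  sha256_close( &ictx, ih );
       sha256_update( &octx, ih, 32 );     sha256_close( &octx, mac );
    }

The vectorized versions do exactly this, with four, eight or sixteen independent keys and messages interleaved across vector lanes and the key XORed into the pad word-wise per lane.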
@@ -126,7 +126,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
 hmac_sha256_4way_context PShctx, hctx;
 uint8_t _ALIGN(128) T[32*4];
 uint8_t _ALIGN(128) U[32*4];
-__m128i ivec;
+v128u32_t ivec;
 size_t i, clen;
 uint64_t j;
 int k;
@@ -139,7 +139,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
 for ( i = 0; i * 32 < dkLen; i++ )
 {
 /* Generate INT(i + 1). */
-ivec = _mm_set1_epi32( bswap_32( i+1 ) );
+ivec = v128_32( bswap_32( i+1 ) );
 
 /* Compute U_1 = PRF(P, S || INT(i)). */
 memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) );
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
 
 /* ... xor U_j ... */
 for ( k = 0; k < 8; k++ )
-casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
-casti_v128u32( U, k ) );
+casti_v128u32( T, k ) = v128_xor( casti_v128u32( T, k ),
+casti_v128u32( U, k ) );
 }
 
 /* Copy as many bytes as necessary into buf. */
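The loop above is PBKDF2's outer loop: each 32-byte output block T_i is the XOR of the iterated HMAC values U_1..U_c, with the 1-based block index appended to the salt in big-endian form (hence bswap_32( i+1 ) for the zero-based loop counter). A scalar sketch for a single block, reusing the hypothetical hmac_sha256_sketch helper from the sketch above:

    /* PBKDF2 block i (1-based): T_i = U_1 ^ U_2 ^ ... ^ U_c,
       U_1 = HMAC(P, S || BE32(i)),  U_j = HMAC(P, U_{j-1}). */
    void pbkdf2_block_sketch( uint8_t T[32], const uint8_t *P, size_t Plen,
                              const uint8_t *S, size_t Slen,
                              uint32_t i, uint64_t c )
    {
       uint8_t U[32];
       uint8_t msg[ Slen + 4 ];                        /* S || INT(i) */
       memcpy( msg, S, Slen );
       msg[Slen] = i >> 24;  msg[Slen+1] = i >> 16;
       msg[Slen+2] = i >> 8; msg[Slen+3] = i;
       hmac_sha256_sketch( U, P, Plen, msg, Slen + 4 );    /* U_1 */
       memcpy( T, U, 32 );
       for ( uint64_t j = 2; j <= c; j++ )
       {
          hmac_sha256_sketch( U, P, Plen, U, 32 );         /* U_j = PRF(P, U_{j-1}) */
          for ( int k = 0; k < 32; k++ ) T[k] ^= U[k];
       }
    }

The vector code avoids recomputing HMAC(P, S || ...) from scratch by cloning PShctx, which already has P and S absorbed, before feeding ivec.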
@@ -199,30 +199,30 @@ hmac_sha256_8way_init( hmac_sha256_8way_context *ctx, const void *_K,
 /* If Klen > 64, the key is really SHA256(K). */
 if ( Klen > 64 )
 {
-sha256_8way_init( &ctx->ictx );
-sha256_8way_update( &ctx->ictx, K, Klen );
-sha256_8way_close( &ctx->ictx, khash );
+sha256_8x32_init( &ctx->ictx );
+sha256_8x32_update( &ctx->ictx, K, Klen );
+sha256_8x32_close( &ctx->ictx, khash );
 K = khash;
 Klen = 32;
 }
 
 /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-sha256_8way_init( &ctx->ictx );
+sha256_8x32_init( &ctx->ictx );
 memset( pad, 0x36, 64*8);
 
 for ( i = 0; i < Klen/4; i++ )
 casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
 casti_m256i( K, i ) );
 
-sha256_8way_update( &ctx->ictx, pad, 64 );
+sha256_8x32_update( &ctx->ictx, pad, 64 );
 
 /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-sha256_8way_init( &ctx->octx );
+sha256_8x32_init( &ctx->octx );
 memset( pad, 0x5c, 64*8 );
 for ( i = 0; i < Klen/4; i++ )
 casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
 casti_m256i( K, i ) );
-sha256_8way_update( &ctx->octx, pad, 64 );
+sha256_8x32_update( &ctx->octx, pad, 64 );
 }
 
 void
@@ -230,7 +230,7 @@ hmac_sha256_8way_update( hmac_sha256_8way_context *ctx, const void *in,
 size_t len )
 {
 /* Feed data to the inner SHA256 operation. */
-sha256_8way_update( &ctx->ictx, in, len );
+sha256_8x32_update( &ctx->ictx, in, len );
 }
 
 /* Finish an HMAC-SHA256 operation. */
@@ -240,13 +240,13 @@ hmac_sha256_8way_close( hmac_sha256_8way_context *ctx, void *digest )
 unsigned char ihash[32*8] __attribute__ ((aligned (128)));
 
 /* Finish the inner SHA256 operation. */
-sha256_8way_close( &ctx->ictx, ihash );
+sha256_8x32_close( &ctx->ictx, ihash );
 
 /* Feed the inner hash to the outer SHA256 operation. */
-sha256_8way_update( &ctx->octx, ihash, 32 );
+sha256_8x32_update( &ctx->octx, ihash, 32 );
 
 /* Finish the outer SHA256 operation. */
-sha256_8way_close( &ctx->octx, digest );
+sha256_8x32_close( &ctx->octx, digest );
 }
 
 /**
@@ -332,21 +332,21 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
 /* If Klen > 64, the key is really SHA256(K). */
 if ( Klen > 64 )
 {
-sha256_16way_init( &ctx->ictx );
-sha256_16way_update( &ctx->ictx, K, Klen );
-sha256_16way_close( &ctx->ictx, khash );
+sha256_16x32_init( &ctx->ictx );
+sha256_16x32_update( &ctx->ictx, K, Klen );
+sha256_16x32_close( &ctx->ictx, khash );
 K = khash;
 Klen = 32;
 }
 
 /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-sha256_16way_init( &ctx->ictx );
+sha256_16x32_init( &ctx->ictx );
 memset( pad, 0x36, 64*16 );
 
 for ( i = 0; i < Klen; i++ )
 casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
 casti_m512i( K, i ) );
-sha256_16way_update( &ctx->ictx, pad, 64 );
+sha256_16x32_update( &ctx->ictx, pad, 64 );
 
 /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
 sha256_16way_init( &ctx->octx );
@@ -354,7 +354,7 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
 for ( i = 0; i < Klen/4; i++ )
 casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
 casti_m512i( K, i ) );
-sha256_16way_update( &ctx->octx, pad, 64 );
+sha256_16x32_update( &ctx->octx, pad, 64 );
 }
 
 void
@@ -362,7 +362,7 @@ hmac_sha256_16way_update( hmac_sha256_16way_context *ctx, const void *in,
 size_t len )
 {
 /* Feed data to the inner SHA256 operation. */
-sha256_16way_update( &ctx->ictx, in, len );
+sha256_16x32_update( &ctx->ictx, in, len );
 }
 
 /* Finish an HMAC-SHA256 operation. */
@@ -372,13 +372,13 @@ hmac_sha256_16way_close( hmac_sha256_16way_context *ctx, void *digest )
 unsigned char ihash[32*16] __attribute__ ((aligned (128)));
 
 /* Finish the inner SHA256 operation. */
-sha256_16way_close( &ctx->ictx, ihash );
+sha256_16x32_close( &ctx->ictx, ihash );
 
 /* Feed the inner hash to the outer SHA256 operation. */
-sha256_16way_update( &ctx->octx, ihash, 32 );
+sha256_16x32_update( &ctx->octx, ihash, 32 );
 
 /* Finish the outer SHA256 operation. */
-sha256_16way_close( &ctx->octx, digest );
+sha256_16x32_close( &ctx->octx, digest );
 }
 
 /**
@@ -1,6 +1,6 @@
 /*-
 * Copyright 2005,2007,2009 Colin Percival
-* Copyright 2020 JayDDee@gmailcom
+* Copyright 2020 JayDDee246@gmailcom
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -38,11 +38,12 @@
 #include "simd-utils.h"
 #include "sha256-hash.h"
 
-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)
 
 typedef struct _hmac_sha256_4way_context
 {
-sha256_4way_context ictx;
-sha256_4way_context octx;
+sha256_4x32_context ictx;
+sha256_4x32_context octx;
 } hmac_sha256_4way_context;
 
 //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -67,8 +68,8 @@ void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,
 
 typedef struct _hmac_sha256_8way_context
 {
-sha256_8way_context ictx;
-sha256_8way_context octx;
+sha256_8x32_context ictx;
+sha256_8x32_context octx;
 } hmac_sha256_8way_context;
 
 //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -88,8 +89,8 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
 
 typedef struct _hmac_sha256_16way_context
 {
-sha256_16way_context ictx;
-sha256_16way_context octx;
+sha256_16x32_context ictx;
+sha256_16x32_context octx;
 } hmac_sha256_16way_context;
 
 //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -30,6 +30,7 @@ static const uint32_t K256[64] =
 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
 };
 
+#if defined(__SSE2__) || defined(__ARM_NEON)
 // SHA-256 4 way SSE2
 
 #define CHs(X, Y, Z) \
@@ -309,142 +310,6 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
 v128_store( state_out + 7, H );
 }
 
-
-# if 0
-
-// Working correctly but still slower
-int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
-const v128_t *state_in, const uint32_t *target )
-{
-v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
-v128_t vmask, targ, hash;
-int t6_mask, flip;
-v128_t W[16]; v128_memcpy( W, data, 16 );
-
-A = v128_load( state_in );
-B = v128_load( state_in+1 );
-C = v128_load( state_in+2 );
-D = v128_load( state_in+3 );
-E = v128_load( state_in+4 );
-F = v128_load( state_in+5 );
-G = v128_load( state_in+6 );
-H = v128_load( state_in+7 );
-
-const v128_t IV7 = H;
-const v128_t IV6 = G;
-
-SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
-SHA256_4X32_MSG_EXPANSION( W );
-SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
-SHA256_4X32_MSG_EXPANSION( W );
-SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
-
-W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
-W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
-W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
-W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
-W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
-W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
-W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
-W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
-W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
-W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
-W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
-W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
-W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );
-
-v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
-
-SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
-SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
-SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
-SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, 48 );
-SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 48 );
-SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
-SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
-SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
-SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
-SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
-
-T0 = v128_add32( v128_32( K256[58] ),
-v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
-B = v128_add32( B, T0 );
-
-T1 = v128_add32( v128_32( K256[59] ),
-v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
-A = v128_add32( A, T1 );
-
-T2 = v128_add32( v128_32( K256[60] ),
-v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
-H = v128_add32( H, T2 );
-
-targ = v128_32( target[7] );
-hash = v128_bswap32( v128_add32( H, IV7 ) );
-
-flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
-
-if ( likely(
-0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
-return 0;
-
-t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );
-
-// round 58 part 2
-F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
-
-// round 61 part 1
-W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
-T0 = v128_add32( v128_32( K256[61] ),
-v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
-G = v128_add32( G, T0 );
-
-if ( t6_mask )
-{
-targ = v128_and( vmask, v128_32( target[6] ) );
-hash = v128_bswap32( v128_add32( G, IV6 ) );
-
-if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
-return 0;
-else
-{
-flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
-if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
-v128_cmpgt32( hash, targ ) ) ) ) )
-return 0;
-else if ( target[6] == 0x80000000 )
-{
-if ( 0 == ( t6_mask & v128_movmask32(
-v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
-return 0;
-}
-}
-}
-
-// rounds 59 to 61 part 2
-E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
-D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
-C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
-
-// rounds 62 & 63
-W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
-W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
-
-SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 );
-SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 );
-
-state_out[0] = v128_add32( state_in[0], A );
-state_out[1] = v128_add32( state_in[1], B );
-state_out[2] = v128_add32( state_in[2], C );
-state_out[3] = v128_add32( state_in[3], D );
-state_out[4] = v128_add32( state_in[4], E );
-state_out[5] = v128_add32( state_in[5], F );
-state_out[6] = v128_add32( state_in[6], G );
-state_out[7] = v128_add32( state_in[7], H );
-return 1;
-}
-
-#endif
-
 void sha256_4x32_init( sha256_4x32_context *sc )
 {
 sc->count_high = sc->count_low = 0;
@@ -529,29 +394,31 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
 sha256_4x32_close( &ctx, dst );
 }
 
+#endif // SSE2 || NEON
+
 #if defined(__AVX2__)
 
 // SHA-256 8 way
 
 #define BSG2_0x(x) \
-_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \
-mm256_ror_32( x, 13 ) ), \
-mm256_ror_32( x, 22 ) )
+mm256_xor3( mm256_ror_32( x, 2 ), \
+mm256_ror_32( x, 13 ), \
+mm256_ror_32( x, 22 ) )
 
 #define BSG2_1x(x) \
-_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \
-mm256_ror_32( x, 11 ) ), \
-mm256_ror_32( x, 25 ) )
+mm256_xor3( mm256_ror_32( x, 6 ), \
+mm256_ror_32( x, 11 ), \
+mm256_ror_32( x, 25 ) )
 
 #define SSG2_0x(x) \
-_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \
-mm256_ror_32( x, 18 ) ), \
-_mm256_srli_epi32( x, 3 ) )
+mm256_xor3( mm256_ror_32( x, 7 ), \
+mm256_ror_32( x, 18 ), \
+_mm256_srli_epi32( x, 3 ) )
 
 #define SSG2_1x(x) \
-_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
-mm256_ror_32( x, 19 ) ), \
-_mm256_srli_epi32( x, 10 ) )
+mm256_xor3( mm256_ror_32( x, 17 ), \
+mm256_ror_32( x, 19 ), \
+_mm256_srli_epi32( x, 10 ) )
 
 #define SHA256_8WAY_MEXP( a, b, c, d ) \
 mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d );
@@ -574,13 +441,8 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
 W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
 W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
 
 
-// With AVX512VL ternary logic optimizations are available.
-// If not optimize by forwarding the result of X^Y in MAJ to the next round
-// to avoid recalculating it as Y^Z. This optimization is not applicable
-// when MAJ is optimized with ternary logic.
-
 #if defined(VL256)
 // AVX512 or AVX10-256
 
 #define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
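On targets with AVX-512VL (the VL256 path), the SHA-256 choice function maps onto a single ternary-logic instruction: CH(X,Y,Z) picks Y wherever X is 1 and Z wherever X is 0, and that truth table is 0xCA, so one _mm256_ternarylogic_epi32 replaces the usual three logic operations. A scalar illustration of the identity being exploited (not code from this commit); the non-VL paths typically compute the equivalent XOR/AND form:

    /* CH(x,y,z) = x ? y : z bit-wise; truth table encodes to imm8 = 0xCA. */
    static inline uint32_t ch_ref( uint32_t x, uint32_t y, uint32_t z )
    {  return ( x & y ) | ( ~x & z );   /* textbook definition */  }

    static inline uint32_t ch_xor_and( uint32_t x, uint32_t y, uint32_t z )
    {  return z ^ ( x & ( y ^ z ) );    /* 2 XOR + 1 AND, equal to ch_ref */  }

The comment block removed above described a related MAJ trick for non-VL builds: forward the X^Y result from one round to the next so it is not recomputed as Y^Z.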
@@ -745,7 +607,7 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
 }
 
 // accepts LE input data
-void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
+void sha256_8x32_transform_le( __m256i *state_out, const __m256i *data,
 const __m256i *state_in )
 {
 __m256i W[16];
@@ -754,7 +616,7 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
 }
 
 // Accepts BE input data, need to bswap
-void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
+void sha256_8x32_transform_be( __m256i *state_out, const __m256i *data,
 const __m256i *state_in )
 {
 __m256i W[16];
@@ -764,7 +626,7 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
 }
 
 // Aggressive prehashing, LE byte order
-void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
+void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
 const __m256i *W, const __m256i *state_in )
 {
 __m256i A, B, C, D, E, F, G, H, T1;
@@ -813,7 +675,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
 _mm256_store_si256( state_mid + 7, H );
 }
 
-void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
+void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
 const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
 {
 __m256i A, B, C, D, E, F, G, H;
@@ -914,14 +776,12 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
 _mm256_store_si256( state_out + 7, H );
 }
 
-int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
+int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
 const __m256i *state_in, const uint32_t *target )
 {
 __m256i A, B, C, D, E, F, G, H, T0, T1, T2;
 __m256i vmask, targ, hash;
 __m256i W[16]; memcpy_256( W, data, 16 );
-const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
-0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
 uint8_t flip, t6_mask;
 
 A = _mm256_load_si256( state_in );
@@ -1012,7 +872,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
 
 // Got H, test it.
 targ = v256_32( target[7] );
-hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
+hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) );
 if ( target[7] )
 {
 flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
@@ -1035,7 +895,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
 {
 // Testing H was inconclusive: hash7 == target7, need to test G
 targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
-hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
+hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) );
 
 if ( likely( 0 == ( t6_mask & mm256_movmask_32(
 _mm256_cmpeq_epi32( hash, targ ) ) ) ))
@@ -1083,8 +943,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
 return 1;
 }
 
-
-void sha256_8way_init( sha256_8way_context *sc )
+void sha256_8x32_init( sha256_8x32_context *sc )
 {
 sc->count_high = sc->count_low = 0;
 sc->val[0] = v256_32( sha256_iv[0] );
@@ -1100,7 +959,7 @@ void sha256_8way_init( sha256_8way_context *sc )
 // need to handle odd byte length for yespower.
 // Assume only last update is odd.
 
-void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
+void sha256_8x32_update( sha256_8x32_context *sc, const void *data, size_t len )
 {
 __m256i *vdata = (__m256i*)data;
 size_t ptr;
@@ -1121,7 +980,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
 len -= clen;
 if ( ptr == buf_size )
 {
-sha256_8way_transform_be( sc->val, sc->buf, sc->val );
+sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
 ptr = 0;
 }
 clow = sc->count_low;
@@ -1132,7 +991,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
 }
 }
 
-void sha256_8way_close( sha256_8way_context *sc, void *dst )
+void sha256_8x32_close( sha256_8x32_context *sc, void *dst )
 {
 unsigned ptr;
 uint32_t low, high;
@@ -1146,7 +1005,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
 if ( ptr > pad )
 {
 memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
-sha256_8way_transform_be( sc->val, sc->buf, sc->val );
+sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
 memset_zero_256( sc->buf, pad >> 2 );
 }
 else
@@ -1159,17 +1018,17 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
 sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
 sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
 
-sha256_8way_transform_be( sc->val, sc->buf, sc->val );
+sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
 
 mm256_block_bswap_32( dst, sc->val );
 }
 
-void sha256_8way_full( void *dst, const void *data, size_t len )
+void sha256_8x32_full( void *dst, const void *data, size_t len )
 {
-sha256_8way_context ctx;
-sha256_8way_init( &ctx );
-sha256_8way_update( &ctx, data, len );
-sha256_8way_close( &ctx, dst );
+sha256_8x32_context ctx;
+sha256_8x32_init( &ctx );
+sha256_8x32_update( &ctx, data, len );
+sha256_8x32_close( &ctx, dst );
 }
 
 #if defined(SIMD512)
@@ -1302,7 +1161,7 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
 }
 
 // accepts LE input data
-void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
+void sha256_16x32_transform_le( __m512i *state_out, const __m512i *data,
 const __m512i *state_in )
 {
 __m512i W[16];
@@ -1311,7 +1170,7 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
 }
 
 // Accepts BE input data, need to bswap
-void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
+void sha256_16x32_transform_be( __m512i *state_out, const __m512i *data,
 const __m512i *state_in )
 {
 __m512i W[16];
@@ -1321,7 +1180,7 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
 }
 
 // Aggressive prehashing, LE byte order
-void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
+void sha256_16x32_prehash_3rounds( __m512i *state_mid, __m512i *X,
 const __m512i *W, const __m512i *state_in )
 {
 __m512i A, B, C, D, E, F, G, H, T1;
@@ -1369,7 +1228,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
 _mm512_store_si512( state_mid + 7, H );
 }
 
-void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
+void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
 const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
 {
 __m512i A, B, C, D, E, F, G, H;
@@ -1470,15 +1329,13 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
 
 // returns 0 if hash aborted early and invalid,
 // returns 1 for completed hash with at least one valid candidate.
-int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
+int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
 const __m512i *state_in, const uint32_t *target )
 {
 __m512i A, B, C, D, E, F, G, H, hash, targ;
 __m512i T0, T1, T2;
 __m512i W[16]; memcpy_512( W, data, 16 );
 __mmask16 t6_mask;
-const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x(
-0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
 
 A = _mm512_load_si512( state_in );
 B = _mm512_load_si512( state_in+1 );
@@ -1588,7 +1445,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
 H = _mm512_add_epi32( H, T2 );
 
 // got H, test it against target[7]
-hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
+hash = mm512_bswap_32( _mm512_add_epi32( H , IV7 ) );
 targ = v512_32( target[7] );
 if ( target[7] )
 if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
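The *_transform_le_short functions abort a hash early when no lane can meet the share target: once the register feeding output word 7 is final, the code byte-swaps it, compares every lane against target[7], and returns 0 if no lane is at or below the target, falling back to a word-6 comparison only for lanes that tie exactly. A per-lane scalar view of that decision (hypothetical helper, for illustration only):

    /* h7 is the register value that will become output word 7; iv7 is state_in[7]. */
    int lane_may_be_valid( uint32_t h7, uint32_t iv7, const uint32_t *target )
    {
       uint32_t hash7 = bswap_32( h7 + iv7 );   /* stored hash is big endian */
       if ( hash7 < target[7] ) return 1;       /* candidate: finish the hash */
       if ( hash7 > target[7] ) return 0;       /* cannot meet the target */
       return 2;                                /* tie: word 6 must break it */
    }
    /* The vector code returns 0 (abort) only when every lane falls in the
       "cannot meet the target" case; ties set t6_mask and trigger the G test. */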
@@ -1608,7 +1465,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
 // got G, test it against target[6] if indicated
 if ( (uint16_t)t6_mask )
 {
-hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
+hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) );
 targ = v512_32( target[6] );
 if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
 return 0;
@@ -1644,7 +1501,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
 return 1;
 }
 
-void sha256_16way_init( sha256_16way_context *sc )
+void sha256_16x32_init( sha256_16x32_context *sc )
 {
 sc->count_high = sc->count_low = 0;
 sc->val[0] = v512_32( sha256_iv[0] );
@@ -1657,7 +1514,7 @@ void sha256_16way_init( sha256_16way_context *sc )
 sc->val[7] = v512_32( sha256_iv[7] );
 }
 
-void sha256_16way_update( sha256_16way_context *sc, const void *data,
+void sha256_16x32_update( sha256_16x32_context *sc, const void *data,
 size_t len )
 {
 __m512i *vdata = (__m512i*)data;
@@ -1679,7 +1536,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
 len -= clen;
 if ( ptr == buf_size )
 {
-sha256_16way_transform_be( sc->val, sc->buf, sc->val );
+sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
 ptr = 0;
 }
 clow = sc->count_low;
@@ -1690,7 +1547,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
 }
 }
 
-void sha256_16way_close( sha256_16way_context *sc, void *dst )
+void sha256_16x32_close( sha256_16x32_context *sc, void *dst )
 {
 unsigned ptr;
 uint32_t low, high;
@@ -1704,7 +1561,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
 if ( ptr > pad )
 {
 memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
-sha256_16way_transform_be( sc->val, sc->buf, sc->val );
+sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
 memset_zero_512( sc->buf, pad >> 2 );
 }
 else
@@ -1717,17 +1574,17 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
 sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
 sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
 
-sha256_16way_transform_be( sc->val, sc->buf, sc->val );
+sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
 
 mm512_block_bswap_32( dst, sc->val );
 }
 
-void sha256_16way_full( void *dst, const void *data, size_t len )
+void sha256_16x32_full( void *dst, const void *data, size_t len )
 {
-sha256_16way_context ctx;
-sha256_16way_init( &ctx );
-sha256_16way_update( &ctx, data, len );
-sha256_16way_close( &ctx, dst );
+sha256_16x32_context ctx;
+sha256_16x32_init( &ctx );
+sha256_16x32_update( &ctx, data, len );
+sha256_16x32_close( &ctx, dst );
 }
 
 #undef CH
@@ -180,20 +180,9 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
 int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
 const __m256i *state_in, const uint32_t *target );
 
-// Temporary API during naming transition
-#define sha256_8way_context sha256_8x32_context
-#define sha256_8way_init sha256_8x32_init
-#define sha256_8way_update sha256_8x32_update
-#define sha256_8way_close sha256_8x32_close
-#define sha256_8way_full sha256_8x32_full
-#define sha256_8way_transform_le sha256_8x32_transform_le
-#define sha256_8way_transform_be sha256_8x32_transform_be
-#define sha256_8way_prehash_3rounds sha256_8x32_prehash_3rounds
-#define sha256_8way_final_rounds sha256_8x32_final_rounds
-#define sha256_8way_transform_le_short sha256_8x32_transform_le_short
-
 #endif // AVX2
 
 #if defined(__SSE2__) || defined(__ARM_NEON)
 // SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
 
 typedef struct
@@ -219,16 +208,5 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
 int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
 const v128_t *state_in, const uint32_t *target );
 
-// Temporary API during naming transition
-#define sha256_4way_context sha256_4x32_context
-#define sha256_4way_init sha256_4x32_init
-#define sha256_4way_update sha256_4x32_update
-#define sha256_4way_close sha256_4x32_close
-#define sha256_4way_full sha256_4x32_full
-#define sha256_4way_transform_le sha256_4x32_transform_le
-#define sha256_4way_transform_be sha256_4x32_transform_be
-#define sha256_4way_prehash_3rounds sha256_4x32_prehash_3rounds
-#define sha256_4way_final_rounds sha256_4x32_final_rounds
-#define sha256_4way_transform_le_short sha256_4x32_transform_le_short
-
 #endif
 #endif // SSE2 || NEON
 #endif // SHA256_HASH_H__
@@ -32,8 +32,6 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
 uint32_t n = first_nonce;
 const int thr_id = mythr->id;
 const bool bench = opt_benchmark;
-const v128_t shuf_bswap32 =
-v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
 
 // hash first 64 byte block of data
 sha256_transform_le( mstatea, pdata, sha256_iv );
@@ -69,10 +67,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
 
 if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
 {
-casti_v128( hasha, 0 ) =
-_mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
-casti_v128( hasha, 1 ) =
-_mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
+casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
+casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
 if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
 {
 pdata[19] = n;
@@ -81,10 +77,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
 }
 if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
 {
-casti_v128( hashb, 0 ) =
-_mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
-casti_v128( hashb, 1 ) =
-_mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
+casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
+casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
 if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
 {
 pdata[19] = n+1;
@@ -204,8 +198,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
 const int thr_id = mythr->id;
 const __m512i sixteen = v512_32( 16 );
 const bool bench = opt_benchmark;
-const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
-0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
 
 // prehash first block directly from pdata
 sha256_transform_le( phash, pdata, sha256_iv );
@@ -231,7 +223,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
 buf[15] = v512_32( 80*8 ); // bit count
 
 // partially pre-expand & prehash second message block, avoiding the nonces
-sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
+sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
 
 // vectorize IV for second hash
 istate[0] = v512_32( sha256_iv[0] );
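All of these scanhash loops lean on a midstate: the first 64 bytes of the 80-byte block header never change, so they are hashed once outside the nonce loop, and only the 16-byte tail block containing the nonce is rehashed per iteration (its first three rounds are even precomputed, since the nonce sits at byte offset 12 of that block). A scalar sketch of the idea, assuming a hypothetical scalar sha256_transform_le with the same argument order used above:

    uint32_t mid[8], st[8], blk2[16];
    sha256_transform_le( mid, pdata, sha256_iv );   /* block 1, bytes 0-63: hashed once */
    memcpy( blk2, pdata + 16, 16 );                 /* bytes 64-79 of the header */
    blk2[ 4] = 0x80000000;                          /* padding bit */
    memset( blk2 + 5, 0, 40 );
    blk2[15] = 80*8;                                /* message length in bits */
    for ( uint32_t n = first_nonce; n < max_nonce; n++ )
    {
       blk2[3] = n;                                 /* nonce = header word 19 */
       sha256_transform_le( st, blk2, mid );        /* only block 2 is redone per nonce */
       /* ... second SHA-256 over the 32-byte digest, then the target check ... */
    }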
@@ -250,15 +242,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
 
 do
 {
-sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
-if ( unlikely( sha256_16way_transform_le_short(
+sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
+if ( unlikely( sha256_16x32_transform_le_short(
 hash32, block, istate, ptarget ) ) )
 {
 for ( int lane = 0; lane < 16; lane++ )
 {
 extr_lane_16x32( phash, hash32, lane, 256 );
-casti_m256i( phash, 0 ) =
-_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
+casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
 if ( likely( valid_hash( phash, ptarget ) && !bench ) )
 {
 pdata[19] = n + lane;
@@ -299,8 +290,6 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
 const bool bench = opt_benchmark;
 const __m256i last_byte = v256_32( 0x80000000 );
 const __m256i eight = v256_32( 8 );
-const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
-0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
 
 for ( int i = 0; i < 19; i++ )
 vdata[i] = v256_32( pdata[i] );
@@ -325,22 +314,22 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
 istate[6] = v256_32( sha256_iv[6] );
 istate[7] = v256_32( sha256_iv[7] );
 
-sha256_8way_transform_le( mstate1, vdata, istate );
+sha256_8x32_transform_le( mstate1, vdata, istate );
 
 // Do 3 rounds on the first 12 bytes of the next block
-sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
+sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
 
 do
 {
-sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
-if ( unlikely( sha256_8way_transform_le_short( hash32, block,
+sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
+if ( unlikely( sha256_8x32_transform_le_short( hash32, block,
 istate, ptarget ) ) )
 {
 for ( int lane = 0; lane < 8; lane++ )
 {
 extr_lane_8x32( lane_hash, hash32, lane, 256 );
 casti_m256i( lane_hash, 0 ) =
-_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
+mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
 if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
 {
 pdata[19] = n + lane;
@@ -12,7 +12,7 @@
 #define SHA256D_NEON_SHA2 1
 #elif defined(__AVX2__)
 #define SHA256D_8WAY 1
-#else
+#elif defined(__SSE2__) || defined(__ARM_NEON)
 #define SHA256D_4WAY 1
 #endif
 
@@ -17,7 +17,6 @@
 #elif defined (__SSE2__) || defined(__ARM_NEON)
 #define SHA256DT_4X32 1
 #endif
-// else ref, should never happen
 
 static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
 {
@@ -205,8 +204,6 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
 const int thr_id = mythr->id;
 const __m512i sixteen = v512_32( 16 );
 const bool bench = opt_benchmark;
-const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
-0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
 
 // prehash first block directly from pdata
 sha256_transform_le( phash, pdata, sha256dt_iv );
@@ -258,8 +255,7 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
 for ( int lane = 0; lane < 16; lane++ )
 {
 extr_lane_16x32( phash, hash32, lane, 256 );
-casti_m256i( phash, 0 ) =
-_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
+casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
 if ( likely( valid_hash( phash, ptarget ) && !bench ) )
 {
 pdata[19] = n + lane;
@@ -298,8 +294,6 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
 const bool bench = opt_benchmark;
 const __m256i last_byte = v256_32( 0x80000000 );
 const __m256i eight = v256_32( 8 );
-const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
-0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
 
 for ( int i = 0; i < 19; i++ )
 vdata[i] = v256_32( pdata[i] );
@@ -339,7 +333,7 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
 {
 extr_lane_8x32( lane_hash, hash32, lane, 256 );
 casti_m256i( lane_hash, 0 ) =
-_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
+mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
 if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
 {
 pdata[19] = n + lane;
@@ -406,7 +400,6 @@ int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce,
 do
 {
 sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
-// sha256_4x32_transform_le( block, vdata+16, mhash1 );
 sha256_4x32_transform_le( hash32, block, iv );
 
 for ( int lane = 0; lane < 4; lane++ )
@@ -7,28 +7,28 @@
 
 #if defined(SHA256T_16WAY)
 
-static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
+static __thread sha256_16x32_context sha256_ctx16 __attribute__ ((aligned (64)));
 
 void sha256q_16way_hash( void* output, const void* input )
 {
 uint32_t vhash[8*16] __attribute__ ((aligned (64)));
-sha256_16way_context ctx;
+sha256_16x32_context ctx;
 memcpy( &ctx, &sha256_ctx16, sizeof ctx );
 
-sha256_16way_update( &ctx, input + (64<<4), 16 );
-sha256_16way_close( &ctx, vhash );
+sha256_16x32_update( &ctx, input + (64<<4), 16 );
+sha256_16x32_close( &ctx, vhash );
 
-sha256_16way_init( &ctx );
-sha256_16way_update( &ctx, vhash, 32 );
-sha256_16way_close( &ctx, vhash );
+sha256_16x32_init( &ctx );
+sha256_16x32_update( &ctx, vhash, 32 );
+sha256_16x32_close( &ctx, vhash );
 
-sha256_16way_init( &ctx );
-sha256_16way_update( &ctx, vhash, 32 );
-sha256_16way_close( &ctx, vhash );
+sha256_16x32_init( &ctx );
+sha256_16x32_update( &ctx, vhash, 32 );
+sha256_16x32_close( &ctx, vhash );
 
-sha256_16way_init( &ctx );
-sha256_16way_update( &ctx, vhash, 32 );
-sha256_16way_close( &ctx, output );
+sha256_16x32_init( &ctx );
+sha256_16x32_update( &ctx, vhash, 32 );
+sha256_16x32_close( &ctx, output );
 }
 
 int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
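sha256q is a quadruple SHA-256: the saved thread-local context already holds the first 64 bytes of the 80-byte input, the copy absorbs the remaining 16 bytes, and each later pass hashes the previous 32-byte digest. A scalar reference sketch (sha256_full is a hypothetical one-shot helper, not part of this commit):

    void sha256q_sketch( uint8_t out[32], const uint8_t in[80] )
    {
       uint8_t h[32];
       sha256_full( h, in, 80 );     /* pass 1: the whole 80-byte header */
       sha256_full( h, h, 32 );      /* pass 2 */
       sha256_full( h, h, 32 );      /* pass 3 */
       sha256_full( out, h, 32 );    /* pass 4 */
    }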
@@ -51,8 +51,8 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
 mm512_bswap32_intrlv80_16x32( vdata, pdata );
 *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
 n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
-sha256_16way_init( &sha256_ctx16 );
-sha256_16way_update( &sha256_ctx16, vdata, 64 );
+sha256_16x32_init( &sha256_ctx16 );
+sha256_16x32_update( &sha256_ctx16, vdata, 64 );
 
 do
 {
@@ -80,28 +80,28 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
 
 #if defined(SHA256T_8WAY)
 
-static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
+static __thread sha256_8x32_context sha256_ctx8 __attribute__ ((aligned (64)));
 
 void sha256q_8way_hash( void* output, const void* input )
 {
 uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-sha256_8way_context ctx;
+sha256_8x32_context ctx;
 memcpy( &ctx, &sha256_ctx8, sizeof ctx );
 
-sha256_8way_update( &ctx, input + (64<<3), 16 );
-sha256_8way_close( &ctx, vhash );
+sha256_8x32_update( &ctx, input + (64<<3), 16 );
+sha256_8x32_close( &ctx, vhash );
 
-sha256_8way_init( &ctx );
-sha256_8way_update( &ctx, vhash, 32 );
-sha256_8way_close( &ctx, vhash );
+sha256_8x32_init( &ctx );
+sha256_8x32_update( &ctx, vhash, 32 );
+sha256_8x32_close( &ctx, vhash );
 
-sha256_8way_init( &ctx );
-sha256_8way_update( &ctx, vhash, 32 );
-sha256_8way_close( &ctx, vhash );
+sha256_8x32_init( &ctx );
+sha256_8x32_update( &ctx, vhash, 32 );
+sha256_8x32_close( &ctx, vhash );
 
-sha256_8way_init( &ctx );
-sha256_8way_update( &ctx, vhash, 32 );
-sha256_8way_close( &ctx, output );
+sha256_8x32_init( &ctx );
+sha256_8x32_update( &ctx, vhash, 32 );
+sha256_8x32_close( &ctx, output );
 }
 
 int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
@@ -123,8 +123,8 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
 
 mm256_bswap32_intrlv80_8x32( vdata, pdata );
 *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-sha256_8way_init( &sha256_ctx8 );
-sha256_8way_update( &sha256_ctx8, vdata, 64 );
+sha256_8x32_init( &sha256_ctx8 );
+sha256_8x32_update( &sha256_ctx8, vdata, 64 );
 
 do
 {
@@ -152,28 +152,28 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
 
 #if defined(SHA256T_4WAY)
 
-static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
+static __thread sha256_4x32_context sha256_ctx4 __attribute__ ((aligned (64)));
 
 void sha256q_4way_hash( void* output, const void* input )
 {
 uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-sha256_4way_context ctx;
+sha256_4x32_context ctx;
 memcpy( &ctx, &sha256_ctx4, sizeof ctx );
 
-sha256_4way_update( &ctx, input + (64<<2), 16 );
-sha256_4way_close( &ctx, vhash );
+sha256_4x32_update( &ctx, input + (64<<2), 16 );
+sha256_4x32_close( &ctx, vhash );
 
-sha256_4way_init( &ctx );
-sha256_4way_update( &ctx, vhash, 32 );
-sha256_4way_close( &ctx, vhash );
+sha256_4x32_init( &ctx );
+sha256_4x32_update( &ctx, vhash, 32 );
+sha256_4x32_close( &ctx, vhash );
 
-sha256_4way_init( &ctx );
-sha256_4way_update( &ctx, vhash, 32 );
-sha256_4way_close( &ctx, vhash );
+sha256_4x32_init( &ctx );
+sha256_4x32_update( &ctx, vhash, 32 );
+sha256_4x32_close( &ctx, vhash );
 
-sha256_4way_init( &ctx );
-sha256_4way_update( &ctx, vhash, 32 );
-sha256_4way_close( &ctx, output );
+sha256_4x32_init( &ctx );
+sha256_4x32_update( &ctx, vhash, 32 );
+sha256_4x32_close( &ctx, output );
 }
 
 int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
@@ -205,8 +205,8 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
 0 };
 
 v128_bswap32_intrlv80_4x32( vdata, pdata );
-sha256_4way_init( &sha256_ctx4 );
-sha256_4way_update( &sha256_ctx4, vdata, 64 );
+sha256_4x32_init( &sha256_ctx4 );
+sha256_4x32_update( &sha256_ctx4, vdata, 64 );
 
 for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
 {
@@ -35,8 +35,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
 const int thr_id = mythr->id;
 const __m512i sixteen = v512_32( 16 );
 const bool bench = opt_benchmark;
-const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
-0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
 
 // prehash first block directly from pdata
 sha256_transform_le( phash, pdata, sha256_iv );
@@ -62,7 +60,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
 buf[15] = v512_32( 80*8 ); // bit count
 
 // partially pre-expand & prehash second message block, avoiding the nonces
-sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
+sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
 
 // vectorize IV for 2nd & 3rd sha256
 istate[0] = v512_32( sha256_iv[0] );
@@ -81,18 +79,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
 
 do
 {
-sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
+sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
 
-sha256_16way_transform_le( block, block, istate );
+sha256_16x32_transform_le( block, block, istate );
 
-if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
+if ( sha256_16x32_transform_le_short( hash32, block, istate, ptarget ) )
 {
 for ( int lane = 0; lane < 16; lane++ )
 if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
 {
 extr_lane_16x32( phash, hash32, lane, 256 );
-casti_m256i( phash, 0 ) =
-_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
+casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
 if ( likely( valid_hash( phash, ptarget ) && !bench ) )
 {
 pdata[19] = n + lane;
@@ -301,8 +298,6 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
 const bool bench = opt_benchmark;
 const __m256i last_byte = v256_32( 0x80000000 );
 const __m256i eight = v256_32( 8 );
-const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
-0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
 
 for ( int i = 0; i < 19; i++ )
 vdata[i] = v256_32( pdata[i] );
@@ -327,29 +322,29 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
 istate[6] = v256_32( sha256_iv[6] );
 istate[7] = v256_32( sha256_iv[7] );
 
-sha256_8way_transform_le( mstate1, vdata, istate );
+sha256_8x32_transform_le( mstate1, vdata, istate );
 
 // Do 3 rounds on the first 12 bytes of the next block
-sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
+sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
 
 do
 {
 // 1. final 16 bytes of data, with padding
-sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
+sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2,
 mexp_pre );
 
 // 2. 32 byte hash from 1.
-sha256_8way_transform_le( block, block, istate );
+sha256_8x32_transform_le( block, block, istate );
 
 // 3. 32 byte hash from 2.
-if ( unlikely( sha256_8way_transform_le_short(
+if ( unlikely( sha256_8x32_transform_le_short(
 hash32, block, istate, ptarget ) ) )
 {
 for ( int lane = 0; lane < 8; lane++ )
 {
 extr_lane_8x32( lane_hash, hash32, lane, 256 );
 casti_m256i( lane_hash, 0 ) =
-_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
+mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
 if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
 {
 pdata[19] = n + lane;
@@ -419,8 +414,8 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
 do
 {
 sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
-sha256_4way_transform_le( block, block, iv );
-sha256_4way_transform_le( hash32, block, iv );
+sha256_4x32_transform_le( block, block, iv );
+sha256_4x32_transform_le( hash32, block, iv );
 
 for ( int lane = 0; lane < 4; lane++ )
 {
@@ -83,15 +83,13 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
 const uint64_t *state_in )
 {
 __m256i STATE0, STATE1;
-__m256i MSG, TMP, BSWAP64;
+__m256i MSG, TMP;
 __m256i TMSG0, TMSG1, TMSG2, TMSG3;
 __m256i ABEF_SAVE, CDGH_SAVE;
 
 // Load initial values
 TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
 STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
-BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
-0x0001020304050607 ) );
 TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
 STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
 STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
@@ -103,7 +101,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
 
 // Rounds 0-3
 TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
-TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
+TMSG0 = mm256_bswap_64( TMSG0 );
 MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
 STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
 _mm256_castsi256_si128 (MSG ) );
@@ -113,7 +111,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
 
 // Rounds 4-7
 TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
-TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
+TMSG1 = mm256_bswap_64( TMSG1 );
 MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
 STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
 _mm256_castsi256_si128( MSG ) );
@@ -124,7 +122,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
 
 // Rounds 8-11
 TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
-TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
+TMSG2 = mm256_bswap_64( TMSG2 );
 MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
 STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
 _mm256_castsi256_si128( MSG ) );
@@ -135,7 +133,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
 
 // Rounds 12-15
 TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
-TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
+TMSG3 = mm256_bswap_64( TMSG3 );
 MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
 STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
 _mm256_castsi256_si128( MSG ) );
@@ -735,8 +733,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
 unsigned ptr;
 const int buf_size = 128;
 const int pad = buf_size - 16;
-const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
-0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
 
 ptr = (unsigned)sc->count & (buf_size - 1U);
 sc->buf[ ptr>>3 ] = v512_64( 0x80 );
@@ -750,10 +746,8 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
 else
 memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
 
-sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
-v512_64( sc->count >> 61 ), shuff_bswap64 );
-sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
-v512_64( sc->count << 3 ), shuff_bswap64 );
+sc->buf[ pad >> 3 ] = v512_64( bswap_64( sc->count >> 61 ) );
+sc->buf[ ( pad+8 ) >> 3 ] = v512_64( bswap_64( sc->count << 3 ) );
 sha512_8x64_round( sc, sc->buf, sc->val );
 
 mm512_block_bswap_64( dst, sc->val );
@@ -957,8 +951,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
 unsigned ptr;
 const int buf_size = 128;
 const int pad = buf_size - 16;
-const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
-0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
 
 ptr = (unsigned)sc->count & (buf_size - 1U);
 sc->buf[ ptr>>3 ] = v256_64( 0x80 );
@@ -972,10 +964,8 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
 else
 memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
 
-sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
-v256_64( sc->count >> 61 ), shuff_bswap64 );
-sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
-v256_64( sc->count << 3 ), shuff_bswap64 );
+sc->buf[ pad >> 3 ] = v256_64( bswap_64( sc->count >> 61 ) );
+sc->buf[ ( pad+8 ) >> 3 ] = v256_64( bswap_64( sc->count << 3 ) );
 sha512_4x64_round( sc, sc->buf, sc->val );
 
 mm256_block_bswap_64( dst, sc->val );
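These close() routines finish SHA-512 padding, which stores the message length in bits as a 128-bit big-endian value in the last 16 bytes of the final block. Because sc->count counts bytes, count >> 61 is the high 64 bits of count*8 and count << 3 is the low 64 bits; the commit swaps a per-lane byte shuffle for a scalar bswap_64 of the broadcast value, which is identical in every lane anyway. The same encoding in scalar form (store_be64 is a hypothetical helper):

    /* byte_count is the total message length in bytes (sc->count above). */
    uint64_t bit_count_hi = byte_count >> 61;    /* high 64 bits of byte_count * 8 */
    uint64_t bit_count_lo = byte_count << 3;     /* low 64 bits of byte_count * 8 */
    store_be64( buf + pad,     bit_count_hi );   /* store_be64: hypothetical BE store */
    store_be64( buf + pad + 8, bit_count_lo );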
@@ -1138,8 +1128,8 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst )
 else
 v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
 
-sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) );
-sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
+sc->buf[ pad >> 3 ] = v128_64( bswap_64( sc->count >> 61 ) );
+sc->buf[ ( pad+8 ) >> 3 ] = v128_64( bswap_64( sc->count << 3 ) );
 sha512_2x64_round( sc, sc->buf, sc->val );
 
 v128_block_bswap64( castp_v128u64( dst ), sc->val );
@@ -36,7 +36,6 @@ typedef struct
 uint64_t count;
 bool initialized;
 } sha512_8x64_context __attribute__ ((aligned (128)));
-#define sha512_8way_context sha512_8x64_context
 
 void sha512_8x64_init( sha512_8x64_context *sc);
 void sha512_8x64_update( sha512_8x64_context *sc, const void *data,
@@ -45,10 +44,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst );
 void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
 size_t len );
 
-#define sha512_8way_init sha512_8x64_init
-#define sha512_8way_update sha512_8x64_update
-#define sha512_8way_close sha512_8x64_close
-
 #endif // AVX512
 
 #if defined (__AVX2__)
@@ -62,7 +57,6 @@ typedef struct
 uint64_t count;
 bool initialized;
 } sha512_4x64_context __attribute__ ((aligned (64)));
-#define sha512_4way_context sha512_4x64_context
 
 void sha512_4x64_init( sha512_4x64_context *sc);
 void sha512_4x64_update( sha512_4x64_context *sc, const void *data,
@@ -71,10 +65,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst );
 void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data,
 size_t len );
 
-#define sha512_4way_init sha512_4x64_init
-#define sha512_4way_update sha512_4x64_update
-#define sha512_4way_close sha512_4x64_close
-
 #endif // AVX2
 
 typedef struct
@@ -14,7 +14,7 @@
 
 #if defined(SHA512256D_8WAY)
 
-static void sha512256d_8way_init( sha512_8way_context *ctx )
+static void sha512256d_8x64_init( sha512_8x64_context *ctx )
 {
 ctx->count = 0;
 ctx->initialized = true;
@@ -33,7 +33,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
 {
 uint64_t hash[8*8] __attribute__ ((aligned (128)));
 uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-sha512_8way_context ctx;
+sha512_8x64_context ctx;
 uint32_t lane_hash[8] __attribute__ ((aligned (32)));
 uint64_t *hash_q3 = &(hash[3*8]);
 uint32_t *pdata = work->data;
@@ -53,13 +53,13 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
 n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
 do
 {
-sha512256d_8way_init( &ctx );
-sha512_8way_update( &ctx, vdata, 80 );
-sha512_8way_close( &ctx, hash );
+sha512256d_8x64_init( &ctx );
+sha512_8x64_update( &ctx, vdata, 80 );
+sha512_8x64_close( &ctx, hash );
 
-sha512256d_8way_init( &ctx );
-sha512_8way_update( &ctx, hash, 32 );
-sha512_8way_close( &ctx, hash );
+sha512256d_8x64_init( &ctx );
+sha512_8x64_update( &ctx, hash, 32 );
+sha512_8x64_close( &ctx, hash );
 
 for ( int lane = 0; lane < 8; lane++ )
 if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
@@ -82,7 +82,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
 
 #elif defined(SHA512256D_4WAY)
 
-static void sha512256d_4way_init( sha512_4way_context *ctx )
+static void sha512256d_4x64_init( sha512_4x64_context *ctx )
 {
 ctx->count = 0;
 ctx->initialized = true;
@@ -101,7 +101,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
 {
 uint64_t hash[8*4] __attribute__ ((aligned (64)));
 uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-sha512_4way_context ctx;
+sha512_4x64_context ctx;
 uint32_t lane_hash[8] __attribute__ ((aligned (32)));
 uint64_t *hash_q3 = &(hash[3*4]);
 uint32_t *pdata = work->data;
@@ -119,13 +119,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
 n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
 do
 {
-sha512256d_4way_init( &ctx );
-sha512_4way_update( &ctx, vdata, 80 );
-sha512_4way_close( &ctx, hash );
+sha512256d_4x64_init( &ctx );
+sha512_4x64_update( &ctx, vdata, 80 );
+sha512_4x64_close( &ctx, hash );
 
-sha512256d_4way_init( &ctx );
-sha512_4way_update( &ctx, hash, 32 );
-sha512_4way_close( &ctx, hash );
+sha512256d_4x64_init( &ctx );
+sha512_4x64_update( &ctx, hash, 32 );
+sha512_4x64_close( &ctx, hash );
 
 for ( int lane = 0; lane < 4; lane++ )
 if ( hash_q3[ lane ] <= targ_q3 )
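sha512256d is a double SHA-512/256: a dedicated init helper (renamed in this commit) resets the context, the first pass hashes the 80-byte header, the second hashes the resulting 32-byte digest, and the fourth 64-bit word of each lane's result (hash_q3) is screened against the target before a full check. A scalar sketch with a hypothetical one-shot helper:

    /* Double SHA-512/256 of an 80-byte header (scalar view, hypothetical helper). */
    void sha512256d_sketch( uint8_t out[32], const uint8_t in[80] )
    {
       uint8_t h[32];
       sha512_256_full( h, in, 80 );    /* first pass: truncated SHA-512 (SHA-512/256) */
       sha512_256_full( out, h, 32 );   /* second pass over the 32-byte digest */
    }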