This commit is contained in:
Jay D Dee
2025-06-20 20:31:41 -04:00
parent dd99580a4c
commit 66191db93c
86 changed files with 2701 additions and 4322 deletions

View File

@@ -31,7 +31,7 @@
#include "hmac-sha256-hash-4way.h"
#include "compat.h"
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
// HMAC 4-way, SSE2 or NEON
/**
@@ -62,30 +62,30 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_4way_init( &ctx->ictx );
sha256_4way_update( &ctx->ictx, K, Klen );
sha256_4way_close( &ctx->ictx, khash );
sha256_4x32_init( &ctx->ictx );
sha256_4x32_update( &ctx->ictx, K, Klen );
sha256_4x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_4way_init( &ctx->ictx );
sha256_4x32_init( &ctx->ictx );
memset( pad, 0x36, 64*4 );
for ( i = 0; i < Klen; i++ )
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->ictx, pad, 64 );
sha256_4x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_4way_init( &ctx->octx );
sha256_4x32_init( &ctx->octx );
memset( pad, 0x5c, 64*4 );
for ( i = 0; i < Klen/4; i++ )
casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4way_update( &ctx->octx, pad, 64 );
casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
casti_v128u32( K, i ) );
sha256_4x32_update( &ctx->octx, pad, 64 );
}
/* Add bytes to the HMAC-SHA256 operation. */
@@ -94,7 +94,7 @@ hmac_sha256_4way_update( hmac_sha256_4way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_4way_update( &ctx->ictx, in, len );
sha256_4x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -104,13 +104,13 @@ hmac_sha256_4way_close( hmac_sha256_4way_context *ctx, void *digest )
unsigned char ihash[32*4] __attribute__ ((aligned (64)));
/* Finish the inner SHA256 operation. */
sha256_4way_close( &ctx->ictx, ihash );
sha256_4x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_4way_update( &ctx->octx, ihash, 32 );
sha256_4x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_4way_close( &ctx->octx, digest );
sha256_4x32_close( &ctx->octx, digest );
}
/**
@@ -126,7 +126,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
hmac_sha256_4way_context PShctx, hctx;
uint8_t _ALIGN(128) T[32*4];
uint8_t _ALIGN(128) U[32*4];
__m128i ivec;
v128u32_t ivec;
size_t i, clen;
uint64_t j;
int k;
@@ -139,7 +139,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
for ( i = 0; i * 32 < dkLen; i++ )
{
/* Generate INT(i + 1). */
ivec = _mm_set1_epi32( bswap_32( i+1 ) );
ivec = v128_32( bswap_32( i+1 ) );
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) );
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
/* ... xor U_j ... */
for ( k = 0; k < 8; k++ )
casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
casti_v128u32( T, k ) = v128_xor( casti_v128u32( T, k ),
casti_v128u32( U, k ) );
}
/* Copy as many bytes as necessary into buf. */
@@ -199,30 +199,30 @@ hmac_sha256_8way_init( hmac_sha256_8way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_8way_init( &ctx->ictx );
sha256_8way_update( &ctx->ictx, K, Klen );
sha256_8way_close( &ctx->ictx, khash );
sha256_8x32_init( &ctx->ictx );
sha256_8x32_update( &ctx->ictx, K, Klen );
sha256_8x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_8way_init( &ctx->ictx );
sha256_8x32_init( &ctx->ictx );
memset( pad, 0x36, 64*8);
for ( i = 0; i < Klen/4; i++ )
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
casti_m256i( K, i ) );
sha256_8way_update( &ctx->ictx, pad, 64 );
sha256_8x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_8way_init( &ctx->octx );
sha256_8x32_init( &ctx->octx );
memset( pad, 0x5c, 64*8 );
for ( i = 0; i < Klen/4; i++ )
casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
casti_m256i( K, i ) );
sha256_8way_update( &ctx->octx, pad, 64 );
sha256_8x32_update( &ctx->octx, pad, 64 );
}
void
@@ -230,7 +230,7 @@ hmac_sha256_8way_update( hmac_sha256_8way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_8way_update( &ctx->ictx, in, len );
sha256_8x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -240,13 +240,13 @@ hmac_sha256_8way_close( hmac_sha256_8way_context *ctx, void *digest )
unsigned char ihash[32*8] __attribute__ ((aligned (128)));
/* Finish the inner SHA256 operation. */
sha256_8way_close( &ctx->ictx, ihash );
sha256_8x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_8way_update( &ctx->octx, ihash, 32 );
sha256_8x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_8way_close( &ctx->octx, digest );
sha256_8x32_close( &ctx->octx, digest );
}
/**
@@ -332,21 +332,21 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
/* If Klen > 64, the key is really SHA256(K). */
if ( Klen > 64 )
{
sha256_16way_init( &ctx->ictx );
sha256_16way_update( &ctx->ictx, K, Klen );
sha256_16way_close( &ctx->ictx, khash );
sha256_16x32_init( &ctx->ictx );
sha256_16x32_update( &ctx->ictx, K, Klen );
sha256_16x32_close( &ctx->ictx, khash );
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
sha256_16way_init( &ctx->ictx );
sha256_16x32_init( &ctx->ictx );
memset( pad, 0x36, 64*16 );
for ( i = 0; i < Klen; i++ )
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
casti_m512i( K, i ) );
sha256_16way_update( &ctx->ictx, pad, 64 );
sha256_16x32_update( &ctx->ictx, pad, 64 );
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
sha256_16way_init( &ctx->octx );
@@ -354,7 +354,7 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
for ( i = 0; i < Klen/4; i++ )
casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
casti_m512i( K, i ) );
sha256_16way_update( &ctx->octx, pad, 64 );
sha256_16x32_update( &ctx->octx, pad, 64 );
}
void
@@ -362,7 +362,7 @@ hmac_sha256_16way_update( hmac_sha256_16way_context *ctx, const void *in,
size_t len )
{
/* Feed data to the inner SHA256 operation. */
sha256_16way_update( &ctx->ictx, in, len );
sha256_16x32_update( &ctx->ictx, in, len );
}
/* Finish an HMAC-SHA256 operation. */
@@ -372,13 +372,13 @@ hmac_sha256_16way_close( hmac_sha256_16way_context *ctx, void *digest )
unsigned char ihash[32*16] __attribute__ ((aligned (128)));
/* Finish the inner SHA256 operation. */
sha256_16way_close( &ctx->ictx, ihash );
sha256_16x32_close( &ctx->ictx, ihash );
/* Feed the inner hash to the outer SHA256 operation. */
sha256_16way_update( &ctx->octx, ihash, 32 );
sha256_16x32_update( &ctx->octx, ihash, 32 );
/* Finish the outer SHA256 operation. */
sha256_16way_close( &ctx->octx, digest );
sha256_16x32_close( &ctx->octx, digest );
}
/**

View File

@@ -1,6 +1,6 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* Copyright 2020 JayDDee@gmailcom
* Copyright 2020 JayDDee246@gmailcom
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -38,11 +38,12 @@
#include "simd-utils.h"
#include "sha256-hash.h"
#if defined(__SSE2__)
#if defined(__SSE2__) || defined(__ARM_NEON)
typedef struct _hmac_sha256_4way_context
{
sha256_4way_context ictx;
sha256_4way_context octx;
sha256_4x32_context ictx;
sha256_4x32_context octx;
} hmac_sha256_4way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -67,8 +68,8 @@ void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,
typedef struct _hmac_sha256_8way_context
{
sha256_8way_context ictx;
sha256_8way_context octx;
sha256_8x32_context ictx;
sha256_8x32_context octx;
} hmac_sha256_8way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -88,8 +89,8 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,
typedef struct _hmac_sha256_16way_context
{
sha256_16way_context ictx;
sha256_16way_context octx;
sha256_16x32_context ictx;
sha256_16x32_context octx;
} hmac_sha256_16way_context;
//void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );

View File

@@ -30,6 +30,7 @@ static const uint32_t K256[64] =
0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#if defined(__SSE2__) || defined(__ARM_NEON)
// SHA-256 4 way, SSE2 or NEON
#define CHs(X, Y, Z) \
@@ -309,142 +310,6 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
v128_store( state_out + 7, H );
}
# if 0
// Working correctly but still slower
// 4-lane SHA-256 block transform with an early-out test against a target.
//
// state_out: receives state_in + the final working variables, but only on
//            the path that returns 1; it is not written when 0 is returned.
// data:      16 message vectors, copied into the local schedule W.
// state_in:  8 input state vectors.
// target:    only target[7] and target[6] are read here.
//
// Returns 0 when one of the early tests rejects the block, 1 after the full
// transform has been completed and state_out has been stored.
//
// NOTE(review): the round and message-expansion macros (SHA256_4X32_ROUND,
// SHA256_4X32_16ROUNDS, SHA256_4X32_MSG_EXPANSION, SHA256_4X32_MEXP, CHs,
// MAJs, BSG2_0, BSG2_1) are defined outside this view; comments about what
// they do are assumptions to confirm against their definitions.
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target )
{
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
v128_t vmask, targ, hash;
int t6_mask, flip;
// Work on a private copy of the message so the expansion can run in place.
v128_t W[16]; v128_memcpy( W, data, 16 );
A = v128_load( state_in );
B = v128_load( state_in+1 );
C = v128_load( state_in+2 );
D = v128_load( state_in+3 );
E = v128_load( state_in+4 );
F = v128_load( state_in+5 );
G = v128_load( state_in+6 );
H = v128_load( state_in+7 );
// Keep the input values of the last two state words; they are added back
// to H and G below to form the words that are tested against the target.
const v128_t IV7 = H;
const v128_t IV6 = G;
// Three groups of 16 rounds (bases 0, 16, 32) with a schedule expansion
// between groups.
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
// Expand only W[0..12] for now; W[13], W[14] and W[15] are expanded later,
// after the early-out tests.
W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );
// NOTE(review): X_xor_Y / Y_xor_Z are not referenced directly in this
// function; presumably SHA256_4X32_ROUND uses them by name - confirm.
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
// Ten individual rounds with W index 0..9 and base 48 (presumably rounds
// 48..57), rotating the roles of the working variables each round.
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, 48 );
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 48 );
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// Rounds 58, 59, 60 part 1: compute only the sums T0/T1/T2 (using K256[58],
// K256[59], K256[60]) and add them into B, A, H. The matching "part 2"
// updates are deferred until after the target[7] test.
T0 = v128_add32( v128_32( K256[58] ),
v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = v128_add32( B, T0 );
T1 = v128_add32( v128_32( K256[59] ),
v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = v128_add32( A, T1 );
T2 = v128_add32( v128_32( K256[60] ),
v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = v128_add32( H, T2 );
// Test the byte-swapped (H + IV7) word of every lane against target[7].
// NOTE(review): flip combined with the signed v128_cmpgt32 mask appears to
// emulate an unsigned "hash > target" compare; returns 0 when the combined
// mask equals 0xf (all four lanes) - confirm against v128_movmask32.
targ = v128_32( target[7] );
hash = v128_bswap32( v128_add32( H, IV7 ) );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( likely(
0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
return 0;
// vmask / t6_mask mark the lanes whose tested word equals target[7]; only
// those lanes are examined against target[6] below.
t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );
// round 58 part 2
F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
// round 61 part 1
W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = v128_add32( v128_32( K256[61] ),
v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = v128_add32( G, T0 );
if ( t6_mask )
{
// targ holds target[6] in the lanes selected by vmask and zero elsewhere.
targ = v128_and( vmask, v128_32( target[6] ) );
hash = v128_bswap32( v128_add32( G, IV6 ) );
// Returns 0 if any selected lane has (G + IV6) byte-swapped equal to targ.
if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
return 0;
else
{
// Same flip / signed-compare scheme as for target[7], restricted to the
// lanes in t6_mask.
flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
v128_cmpgt32( hash, targ ) ) ) ) )
return 0;
else if ( target[6] == 0x80000000 )
{
// Special case for target[6] == 0x80000000: compare hash against zero
// (hash xor hash) with the signed compare.
if ( 0 == ( t6_mask & v128_movmask32(
v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
return 0;
}
}
}
// rounds 59 to 61 part 2
E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
// rounds 62 & 63
W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 );
// Final feed-forward: output state = input state + working variables.
state_out[0] = v128_add32( state_in[0], A );
state_out[1] = v128_add32( state_in[1], B );
state_out[2] = v128_add32( state_in[2], C );
state_out[3] = v128_add32( state_in[3], D );
state_out[4] = v128_add32( state_in[4], E );
state_out[5] = v128_add32( state_in[5], F );
state_out[6] = v128_add32( state_in[6], G );
state_out[7] = v128_add32( state_in[7], H );
return 1;
}
#endif
void sha256_4x32_init( sha256_4x32_context *sc )
{
sc->count_high = sc->count_low = 0;
@@ -529,29 +394,31 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
sha256_4x32_close( &ctx, dst );
}
#endif // SSE2 || NEON
#if defined(__AVX2__)
// SHA-256 8 way
#define BSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ) ), \
mm256_ror_32( x, 22 ) )
mm256_xor3( mm256_ror_32( x, 2 ), \
mm256_ror_32( x, 13 ), \
mm256_ror_32( x, 22 ) )
#define BSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ) ), \
mm256_ror_32( x, 25 ) )
mm256_xor3( mm256_ror_32( x, 6 ), \
mm256_ror_32( x, 11 ), \
mm256_ror_32( x, 25 ) )
#define SSG2_0x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ) ), \
_mm256_srli_epi32( x, 3 ) )
mm256_xor3( mm256_ror_32( x, 7 ), \
mm256_ror_32( x, 18 ), \
_mm256_srli_epi32( x, 3 ) )
#define SSG2_1x(x) \
_mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ) ), \
_mm256_srli_epi32( x, 10 ) )
mm256_xor3( mm256_ror_32( x, 17 ), \
mm256_ror_32( x, 19 ), \
_mm256_srli_epi32( x, 10 ) )
#define SHA256_8WAY_MEXP( a, b, c, d ) \
mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d );
@@ -574,13 +441,8 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] );
// With AVX512VL ternary logic optimizations are available.
// If not optimize by forwarding the result of X^Y in MAJ to the next round
// to avoid recalculating it as Y^Z. This optimization is not applicable
// when MAJ is optimized with ternary logic.
#if defined(VL256)
// AVX512 or AVX10-256
#define CHx(X, Y, Z) _mm256_ternarylogic_epi32( X, Y, Z, 0xca )
@@ -745,7 +607,7 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
}
// accepts LE input data
void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
void sha256_8x32_transform_le( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i W[16];
@@ -754,7 +616,7 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
}
// Accepts BE input data, need to bswap
void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
void sha256_8x32_transform_be( __m256i *state_out, const __m256i *data,
const __m256i *state_in )
{
__m256i W[16];
@@ -764,7 +626,7 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
}
// Aggressive prehashing, LE byte order
void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
const __m256i *W, const __m256i *state_in )
{
__m256i A, B, C, D, E, F, G, H, T1;
@@ -813,7 +675,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
_mm256_store_si256( state_mid + 7, H );
}
void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
{
__m256i A, B, C, D, E, F, G, H;
@@ -914,14 +776,12 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
_mm256_store_si256( state_out + 7, H );
}
int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
__m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
uint8_t flip, t6_mask;
A = _mm256_load_si256( state_in );
@@ -1012,7 +872,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
// Got H, test it.
targ = v256_32( target[7] );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) );
if ( target[7] )
{
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
@@ -1035,7 +895,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
{
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) );
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
_mm256_cmpeq_epi32( hash, targ ) ) ) ))
@@ -1083,8 +943,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
return 1;
}
void sha256_8way_init( sha256_8way_context *sc )
void sha256_8x32_init( sha256_8x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v256_32( sha256_iv[0] );
@@ -1100,7 +959,7 @@ void sha256_8way_init( sha256_8way_context *sc )
// need to handle odd byte length for yespower.
// Assume only last update is odd.
void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
void sha256_8x32_update( sha256_8x32_context *sc, const void *data, size_t len )
{
__m256i *vdata = (__m256i*)data;
size_t ptr;
@@ -1121,7 +980,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -1132,7 +991,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
}
}
void sha256_8way_close( sha256_8way_context *sc, void *dst )
void sha256_8x32_close( sha256_8x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -1146,7 +1005,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
memset_zero_256( sc->buf, pad >> 2 );
}
else
@@ -1159,17 +1018,17 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
mm256_block_bswap_32( dst, sc->val );
}
void sha256_8way_full( void *dst, const void *data, size_t len )
void sha256_8x32_full( void *dst, const void *data, size_t len )
{
sha256_8way_context ctx;
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, data, len );
sha256_8way_close( &ctx, dst );
sha256_8x32_context ctx;
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, data, len );
sha256_8x32_close( &ctx, dst );
}
#if defined(SIMD512)
@@ -1302,7 +1161,7 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
}
// accepts LE input data
void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
void sha256_16x32_transform_le( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i W[16];
@@ -1311,7 +1170,7 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
}
// Accepts BE input data, need to bswap
void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
void sha256_16x32_transform_be( __m512i *state_out, const __m512i *data,
const __m512i *state_in )
{
__m512i W[16];
@@ -1321,7 +1180,7 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
}
// Aggressive prehashing, LE byte order
void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
void sha256_16x32_prehash_3rounds( __m512i *state_mid, __m512i *X,
const __m512i *W, const __m512i *state_in )
{
__m512i A, B, C, D, E, F, G, H, T1;
@@ -1369,7 +1228,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
_mm512_store_si512( state_mid + 7, H );
}
void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
{
__m512i A, B, C, D, E, F, G, H;
@@ -1470,15 +1329,13 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
// returns 0 if hash aborted early and invalid,
// returns 1 for completed hash with at least one valid candidate.
int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
const __m512i *state_in, const uint32_t *target )
{
__m512i A, B, C, D, E, F, G, H, hash, targ;
__m512i T0, T1, T2;
__m512i W[16]; memcpy_512( W, data, 16 );
__mmask16 t6_mask;
const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
A = _mm512_load_si512( state_in );
B = _mm512_load_si512( state_in+1 );
@@ -1588,7 +1445,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
H = _mm512_add_epi32( H, T2 );
// got H, test it against target[7]
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
hash = mm512_bswap_32( _mm512_add_epi32( H , IV7 ) );
targ = v512_32( target[7] );
if ( target[7] )
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
@@ -1608,7 +1465,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// got G, test it against target[6] if indicated
if ( (uint16_t)t6_mask )
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) );
targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
@@ -1644,7 +1501,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
return 1;
}
void sha256_16way_init( sha256_16way_context *sc )
void sha256_16x32_init( sha256_16x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v512_32( sha256_iv[0] );
@@ -1657,7 +1514,7 @@ void sha256_16way_init( sha256_16way_context *sc )
sc->val[7] = v512_32( sha256_iv[7] );
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
void sha256_16x32_update( sha256_16x32_context *sc, const void *data,
size_t len )
{
__m512i *vdata = (__m512i*)data;
@@ -1679,7 +1536,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
len -= clen;
if ( ptr == buf_size )
{
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -1690,7 +1547,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
}
}
void sha256_16way_close( sha256_16way_context *sc, void *dst )
void sha256_16x32_close( sha256_16x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -1704,7 +1561,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
if ( ptr > pad )
{
memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
memset_zero_512( sc->buf, pad >> 2 );
}
else
@@ -1717,17 +1574,17 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );
sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
mm512_block_bswap_32( dst, sc->val );
}
void sha256_16way_full( void *dst, const void *data, size_t len )
void sha256_16x32_full( void *dst, const void *data, size_t len )
{
sha256_16way_context ctx;
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, data, len );
sha256_16way_close( &ctx, dst );
sha256_16x32_context ctx;
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, data, len );
sha256_16x32_close( &ctx, dst );
}
#undef CH

View File

@@ -180,20 +180,9 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target );
// Temporary API during naming transition
#define sha256_8way_context sha256_8x32_context
#define sha256_8way_init sha256_8x32_init
#define sha256_8way_update sha256_8x32_update
#define sha256_8way_close sha256_8x32_close
#define sha256_8way_full sha256_8x32_full
#define sha256_8way_transform_le sha256_8x32_transform_le
#define sha256_8way_transform_be sha256_8x32_transform_be
#define sha256_8way_prehash_3rounds sha256_8x32_prehash_3rounds
#define sha256_8way_final_rounds sha256_8x32_final_rounds
#define sha256_8way_transform_le_short sha256_8x32_transform_le_short
#endif // AVX2
#if defined(__SSE2__) || defined(__ARM_NEON)
// SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON
typedef struct
@@ -219,16 +208,5 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target );
// Temporary API during naming transition
#define sha256_4way_context sha256_4x32_context
#define sha256_4way_init sha256_4x32_init
#define sha256_4way_update sha256_4x32_update
#define sha256_4way_close sha256_4x32_close
#define sha256_4way_full sha256_4x32_full
#define sha256_4way_transform_le sha256_4x32_transform_le
#define sha256_4way_transform_be sha256_4x32_transform_be
#define sha256_4way_prehash_3rounds sha256_4x32_prehash_3rounds
#define sha256_4way_final_rounds sha256_4x32_final_rounds
#define sha256_4way_transform_le_short sha256_4x32_transform_le_short
#endif
#endif // SSE2 || NEON
#endif // SHA256_HASH_H__

View File

@@ -32,8 +32,6 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t shuf_bswap32 =
v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
// hash first 64 byte block of data
sha256_transform_le( mstatea, pdata, sha256_iv );
@@ -69,10 +67,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) =
_mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
casti_v128( hasha, 1 ) =
_mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
{
pdata[19] = n;
@@ -81,10 +77,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
}
if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
{
casti_v128( hashb, 0 ) =
_mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
casti_v128( hashb, 1 ) =
_mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
{
pdata[19] = n+1;
@@ -204,8 +198,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
@@ -231,7 +223,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for second hash
istate[0] = v512_32( sha256_iv[0] );
@@ -250,15 +242,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16way_transform_le_short(
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_16x32_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 16; lane++ )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -299,8 +290,6 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -325,22 +314,22 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8x32_transform_le( mstate1, vdata, istate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8way_transform_le_short( hash32, block,
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
if ( unlikely( sha256_8x32_transform_le_short( hash32, block,
istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;

View File

@@ -12,7 +12,7 @@
#define SHA256D_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA256D_4WAY 1
#endif

View File

@@ -17,7 +17,6 @@
#elif defined (__SSE2__) || defined(__ARM_NEON)
#define SHA256DT_4X32 1
#endif
// else ref, should never happen
static const uint32_t sha256dt_iv[8] __attribute__ ((aligned (32))) =
{
@@ -205,8 +204,6 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256dt_iv );
@@ -258,8 +255,7 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
for ( int lane = 0; lane < 16; lane++ )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -298,8 +294,6 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -339,7 +333,7 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -406,7 +400,6 @@ int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
// sha256_4x32_transform_le( block, vdata+16, mhash1 );
sha256_4x32_transform_le( hash32, block, iv );
for ( int lane = 0; lane < 4; lane++ )

View File

@@ -7,28 +7,28 @@
#if defined(SHA256T_16WAY)
static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
static __thread sha256_16x32_context sha256_ctx16 __attribute__ ((aligned (64)));
// Four chained SHA-256 passes over 16 interleaved lanes: the first pass
// resumes from the thread-local sha256_ctx16, the next three re-hash the
// previous digest held in vhash, and the last digest is written to output.
// NOTE(review): this span is diff text without +/- markers. Each
// sha256_16way_* line appears to be the removed spelling and the
// sha256_16x32_* line after it the replacement, so presumably only one line
// of each pair (and only one ctx declaration) exists in the real file.
void sha256q_16way_hash( void* output, const void* input )
{
// Scratch digest buffer: 8 words x 16 lanes.
uint32_t vhash[8*16] __attribute__ ((aligned (64)));
sha256_16way_context ctx;
sha256_16x32_context ctx;
// Start from the saved context instead of calling init;
// scanhash_sha256q_16way initialises sha256_ctx16 and feeds it 64 of vdata.
memcpy( &ctx, &sha256_ctx16, sizeof ctx );
// Pass 1: continue with length 16 starting at offset 64<<4 into input
// (arithmetic on void* is a GNU extension that counts in bytes).
// Presumably this skips the 64 bytes per lane already absorbed - confirm.
sha256_16way_update( &ctx, input + (64<<4), 16 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_update( &ctx, input + (64<<4), 16 );
sha256_16x32_close( &ctx, vhash );
// Pass 2: fresh context, hash the previous digest (length 32) in place.
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, vhash );
// Pass 3: same as pass 2.
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, vhash );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, vhash );
// Pass 4: final hash goes to the caller's output buffer, not vhash.
sha256_16way_init( &ctx );
sha256_16way_update( &ctx, vhash, 32 );
sha256_16way_close( &ctx, output );
sha256_16x32_init( &ctx );
sha256_16x32_update( &ctx, vhash, 32 );
sha256_16x32_close( &ctx, output );
}
int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
@@ -51,8 +51,8 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
mm512_bswap32_intrlv80_16x32( vdata, pdata );
*noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
sha256_16way_init( &sha256_ctx16 );
sha256_16way_update( &sha256_ctx16, vdata, 64 );
sha256_16x32_init( &sha256_ctx16 );
sha256_16x32_update( &sha256_ctx16, vdata, 64 );
do
{
@@ -80,28 +80,28 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256T_8WAY)
static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
static __thread sha256_8x32_context sha256_ctx8 __attribute__ ((aligned (64)));
// Four chained SHA-256 passes over 8 interleaved lanes: the first pass
// resumes from the thread-local sha256_ctx8, the next three re-hash the
// previous digest held in vhash, and the last digest is written to output.
// NOTE(review): this span is diff text without +/- markers. Each
// sha256_8way_* line appears to be the removed spelling and the
// sha256_8x32_* line after it the replacement, so presumably only one line
// of each pair (and only one ctx declaration) exists in the real file.
void sha256q_8way_hash( void* output, const void* input )
{
// Scratch digest buffer: 8 words x 8 lanes.
uint32_t vhash[8*8] __attribute__ ((aligned (64)));
sha256_8way_context ctx;
sha256_8x32_context ctx;
// Start from the saved context instead of calling init;
// scanhash_sha256q_8way initialises sha256_ctx8 and feeds it 64 of vdata.
memcpy( &ctx, &sha256_ctx8, sizeof ctx );
// Pass 1: continue with length 16 starting at offset 64<<3 into input
// (arithmetic on void* is a GNU extension that counts in bytes).
// Presumably this skips the 64 bytes per lane already absorbed - confirm.
sha256_8way_update( &ctx, input + (64<<3), 16 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_update( &ctx, input + (64<<3), 16 );
sha256_8x32_close( &ctx, vhash );
// Pass 2: fresh context, hash the previous digest (length 32) in place.
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, vhash );
// Pass 3: same as pass 2.
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, vhash );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, vhash );
// Pass 4: final hash goes to the caller's output buffer, not vhash.
sha256_8way_init( &ctx );
sha256_8way_update( &ctx, vhash, 32 );
sha256_8way_close( &ctx, output );
sha256_8x32_init( &ctx );
sha256_8x32_update( &ctx, vhash, 32 );
sha256_8x32_close( &ctx, output );
}
int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
@@ -123,8 +123,8 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
mm256_bswap32_intrlv80_8x32( vdata, pdata );
*noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
sha256_8way_init( &sha256_ctx8 );
sha256_8way_update( &sha256_ctx8, vdata, 64 );
sha256_8x32_init( &sha256_ctx8 );
sha256_8x32_update( &sha256_ctx8, vdata, 64 );
do
{
@@ -152,28 +152,28 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256T_4WAY)
static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
static __thread sha256_4x32_context sha256_ctx4 __attribute__ ((aligned (64)));
// Four chained SHA-256 passes over 4 interleaved lanes: the first pass
// resumes from the thread-local sha256_ctx4, the next three re-hash the
// previous digest held in vhash, and the last digest is written to output.
// NOTE(review): this span is diff text without +/- markers. Each
// sha256_4way_* line appears to be the removed spelling and the
// sha256_4x32_* line after it the replacement, so presumably only one line
// of each pair (and only one ctx declaration) exists in the real file.
void sha256q_4way_hash( void* output, const void* input )
{
// Scratch digest buffer: 8 words x 4 lanes.
uint32_t vhash[8*4] __attribute__ ((aligned (64)));
sha256_4way_context ctx;
sha256_4x32_context ctx;
// Start from the saved context instead of calling init;
// scanhash_sha256q_4way initialises sha256_ctx4 and feeds it 64 of vdata.
memcpy( &ctx, &sha256_ctx4, sizeof ctx );
// Pass 1: continue with length 16 starting at offset 64<<2 into input
// (arithmetic on void* is a GNU extension that counts in bytes).
// Presumably this skips the 64 bytes per lane already absorbed - confirm.
sha256_4way_update( &ctx, input + (64<<2), 16 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_update( &ctx, input + (64<<2), 16 );
sha256_4x32_close( &ctx, vhash );
// Pass 2: fresh context, hash the previous digest (length 32) in place.
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, vhash );
// Pass 3: same as pass 2.
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, vhash );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, vhash );
// Pass 4: final hash goes to the caller's output buffer, not vhash.
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, vhash, 32 );
sha256_4way_close( &ctx, output );
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, vhash, 32 );
sha256_4x32_close( &ctx, output );
}
int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
@@ -205,8 +205,8 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
0 };
v128_bswap32_intrlv80_4x32( vdata, pdata );
sha256_4way_init( &sha256_ctx4 );
sha256_4way_update( &sha256_ctx4, vdata, 64 );
sha256_4x32_init( &sha256_ctx4 );
sha256_4x32_update( &sha256_ctx4, vdata, 64 );
for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
{

View File

@@ -35,8 +35,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const int thr_id = mythr->id;
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
// prehash first block directly from pdata
sha256_transform_le( phash, pdata, sha256_iv );
@@ -62,7 +60,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = v512_32( sha256_iv[0] );
@@ -81,18 +79,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
do
{
sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
sha256_16way_transform_le( block, block, istate );
sha256_16x32_transform_le( block, block, istate );
if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
if ( sha256_16x32_transform_le_short( hash32, block, istate, ptarget ) )
{
for ( int lane = 0; lane < 16; lane++ )
if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
{
extr_lane_16x32( phash, hash32, lane, 256 );
casti_m256i( phash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
if ( likely( valid_hash( phash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -301,8 +298,6 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
const bool bench = opt_benchmark;
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = v256_32( pdata[i] );
@@ -327,29 +322,29 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
sha256_8x32_transform_le( mstate1, vdata, istate );
// Do 3 rounds on the first 12 bytes of the next block
sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
do
{
// 1. final 16 bytes of data, with padding
sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2,
mexp_pre );
// 2. 32 byte hash from 1.
sha256_8way_transform_le( block, block, istate );
sha256_8x32_transform_le( block, block, istate );
// 3. 32 byte hash from 2.
if ( unlikely( sha256_8way_transform_le_short(
if ( unlikely( sha256_8x32_transform_le_short(
hash32, block, istate, ptarget ) ) )
{
for ( int lane = 0; lane < 8; lane++ )
{
extr_lane_8x32( lane_hash, hash32, lane, 256 );
casti_m256i( lane_hash, 0 ) =
_mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
@@ -419,8 +414,8 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
do
{
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
sha256_4way_transform_le( block, block, iv );
sha256_4way_transform_le( hash32, block, iv );
sha256_4x32_transform_le( block, block, iv );
sha256_4x32_transform_le( hash32, block, iv );
for ( int lane = 0; lane < 4; lane++ )
{

View File

@@ -83,15 +83,13 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
@@ -103,7 +101,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
TMSG0 = mm256_bswap_64( TMSG0 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128 (MSG ) );
@@ -113,7 +111,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
TMSG1 = mm256_bswap_64( TMSG1 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -124,7 +122,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
TMSG2 = mm256_bswap_64( TMSG2 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -135,7 +133,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
TMSG3 = mm256_bswap_64( TMSG3 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
_mm256_castsi256_si128( MSG ) );
@@ -735,8 +733,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
@@ -750,10 +746,8 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
else
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
v512_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
v512_64( sc->count << 3 ), shuff_bswap64 );
sc->buf[ pad >> 3 ] = v512_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v512_64( bswap_64( sc->count << 3 ) );
sha512_8x64_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
@@ -957,8 +951,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
unsigned ptr;
const int buf_size = 128;
const int pad = buf_size - 16;
const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
@@ -972,10 +964,8 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
else
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
v256_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
v256_64( sc->count << 3 ), shuff_bswap64 );
sc->buf[ pad >> 3 ] = v256_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v256_64( bswap_64( sc->count << 3 ) );
sha512_4x64_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );
@@ -1138,8 +1128,8 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst )
else
v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
sc->buf[ pad >> 3 ] = v128_64( bswap_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_64( bswap_64( sc->count << 3 ) );
sha512_2x64_round( sc, sc->buf, sc->val );
v128_block_bswap64( castp_v128u64( dst ), sc->val );

View File

@@ -36,7 +36,6 @@ typedef struct
uint64_t count;
bool initialized;
} sha512_8x64_context __attribute__ ((aligned (128)));
#define sha512_8way_context sha512_8x64_context
void sha512_8x64_init( sha512_8x64_context *sc);
void sha512_8x64_update( sha512_8x64_context *sc, const void *data,
@@ -45,10 +44,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst );
void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
size_t len );
#define sha512_8way_init sha512_8x64_init
#define sha512_8way_update sha512_8x64_update
#define sha512_8way_close sha512_8x64_close
#endif // AVX512
#if defined (__AVX2__)
@@ -62,7 +57,6 @@ typedef struct
uint64_t count;
bool initialized;
} sha512_4x64_context __attribute__ ((aligned (64)));
#define sha512_4way_context sha512_4x64_context
void sha512_4x64_init( sha512_4x64_context *sc);
void sha512_4x64_update( sha512_4x64_context *sc, const void *data,
@@ -71,10 +65,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst );
void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data,
size_t len );
#define sha512_4way_init sha512_4x64_init
#define sha512_4way_update sha512_4x64_update
#define sha512_4way_close sha512_4x64_close
#endif // AVX2
typedef struct

View File

@@ -14,7 +14,7 @@
#if defined(SHA512256D_8WAY)
static void sha512256d_8way_init( sha512_8way_context *ctx )
static void sha512256d_8x64_init( sha512_8x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
@@ -33,7 +33,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
{
uint64_t hash[8*8] __attribute__ ((aligned (128)));
uint32_t vdata[20*8] __attribute__ ((aligned (64)));
sha512_8way_context ctx;
sha512_8x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*8]);
uint32_t *pdata = work->data;
@@ -53,13 +53,13 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n , 0 ), *noncev );
do
{
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, vdata, 80 );
sha512_8way_close( &ctx, hash );
sha512256d_8x64_init( &ctx );
sha512_8x64_update( &ctx, vdata, 80 );
sha512_8x64_close( &ctx, hash );
sha512256d_8way_init( &ctx );
sha512_8way_update( &ctx, hash, 32 );
sha512_8way_close( &ctx, hash );
sha512256d_8x64_init( &ctx );
sha512_8x64_update( &ctx, hash, 32 );
sha512_8x64_close( &ctx, hash );
for ( int lane = 0; lane < 8; lane++ )
if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
@@ -82,7 +82,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
#elif defined(SHA512256D_4WAY)
static void sha512256d_4way_init( sha512_4way_context *ctx )
static void sha512256d_4x64_init( sha512_4x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
@@ -101,7 +101,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
{
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
sha512_4way_context ctx;
sha512_4x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*4]);
uint32_t *pdata = work->data;
@@ -119,13 +119,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
do
{
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, vdata, 80 );
sha512_4way_close( &ctx, hash );
sha512256d_4x64_init( &ctx );
sha512_4x64_update( &ctx, vdata, 80 );
sha512_4x64_close( &ctx, hash );
sha512256d_4way_init( &ctx );
sha512_4way_update( &ctx, hash, 32 );
sha512_4way_close( &ctx, hash );
sha512256d_4x64_init( &ctx );
sha512_4x64_update( &ctx, hash, 32 );
sha512_4x64_close( &ctx, hash );
for ( int lane = 0; lane < 4; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )