Compare commits

...

2 Commits
v23.5 ... v23.7

Author     SHA1        Message  Date
Jay D Dee  e043698442  v23.7    2023-11-07 04:59:44 -05:00
Jay D Dee  46dca7a493  v23.6    2023-10-28 16:22:14 -04:00
39 changed files with 2578 additions and 2666 deletions

View File

@@ -156,7 +156,6 @@ cpuminer_SOURCES = \
algo/sha/hmac-sha256-hash.c \
algo/sha/hmac-sha256-hash-4way.c \
algo/sha/sha256d.c \
algo/sha/sha2.c \
algo/sha/sha256d-4way.c \
algo/sha/sha256t-gate.c \
algo/sha/sha256t-4way.c \

View File

@@ -73,6 +73,22 @@ If not what makes it happen or not happen?
Change Log
----------
v23.7
Fixed blake2s, broken in v3.23.4.
ARM: SHA2 extension tested and working.
ARM: sha512256d fully optimized.
ARM: X17 more optimizations.
ARM: AES extension working for Shavite.
ARM errata: CPU features AES & SHA256 are not reported when available (a stand-alone detection sketch follows this change log).
v23.6
ARM: Sha256dt, Sha256t, Sha256d 4-way now working and fully optimized for NEON; SHA also enabled but untested.
x86: Sha256dt, Sha256t, Sha256d faster SSE2 4-way.
ARM: Scrypt, Scryptn2 fully optimized for NEON; SHA also enabled but untested.
Linux: added a log when miner is started as root to discourage doing so.
v23.5
New version numbering drops the leading 3; the major version is now the calendar year and the minor version identifies planned releases during the year.
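For context on the v23.7 ARM errata note above: on aarch64 Linux the AES and SHA2 extensions are normally reported through the hwcaps auxiliary vector. A minimal stand-alone check (an illustration only, not the miner's own feature detection) might look like:

#include <sys/auxv.h>
#include <asm/hwcap.h>   // HWCAP_AES, HWCAP_SHA2 on aarch64 Linux
#include <stdio.h>

int main(void)
{
    const unsigned long hw = getauxval( AT_HWCAP );
    printf( "AES:  %s\n", ( hw & HWCAP_AES  ) ? "yes" : "no" );
    printf( "SHA2: %s\n", ( hw & HWCAP_SHA2 ) ? "yes" : "no" );
    return 0;
}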

View File

@@ -136,10 +136,10 @@ static void fill_block( __m256i *state, const block *ref_block,
#else // SSE2
static void fill_block( v128_t *state, const block *ref_block,
static void fill_block( v128u64_t *state, const block *ref_block,
block *next_block, int with_xor )
{
v128_t block_XY[ARGON2_OWORDS_IN_BLOCK];
v128u64_t block_XY[ARGON2_OWORDS_IN_BLOCK];
unsigned int i;
if ( with_xor )
@@ -242,7 +242,7 @@ void fill_segment(const argon2_instance_t *instance,
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];
#else
v128_t state[ARGON2_OWORDS_IN_BLOCK];
v128u64_t state[ARGON2_OWORDS_IN_BLOCK];
#endif
// int data_independent_addressing;

View File

@@ -23,56 +23,46 @@
#if !defined(__AVX512F__)
#if !defined(__AVX2__)
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y) {
const v128_t z = v128_mulw32(x, y);
return v128_add64(v128_add64(x, y), v128_add64(z, z));
static BLAKE2_INLINE v128_t fBlaMka(v128_t x, v128_t y)
{
const v128u64_t z = v128_mulw32( x, y );
return (v128u32_t)v128_add64( v128_add64( (v128u64_t)x, (v128u64_t)y ),
v128_add64( z, z ) );
}
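For reference, the vectorized fBlaMka above is Argon2's BlaMka primitive, x + y + 2*lo32(x)*lo32(y) mod 2^64. A scalar sketch of the same operation:

#include <stdint.h>

// Scalar model of fBlaMka: widening 32x32 -> 64-bit multiply of the low
// halves, doubled, then added to x + y (all arithmetic mod 2^64).
static inline uint64_t blamka_scalar( uint64_t x, uint64_t y )
{
    const uint64_t z = (uint64_t)(uint32_t)x * (uint32_t)y;
    return x + y + 2 * z;
}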
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = v128_ror64(D0, 32); \
D1 = v128_ror64(D1, 32); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = v128_ror64(B0, 24); \
B1 = v128_ror64(B1, 24); \
} while ((void)0, 0)
#define G1( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
A0 = fBlaMka( A0, B0 ); \
A1 = fBlaMka( A1, B1 ); \
D0 = v128_xor( D0, A0 ); \
D1 = v128_xor( D1, A1 ); \
D0 = v128_ror64( D0, 32 ); \
D1 = v128_ror64( D1, 32 ); \
C0 = fBlaMka( C0, D0 ); \
C1 = fBlaMka( C1, D1 ); \
B0 = v128_xor( B0, C0 ); \
B1 = v128_xor( B1, C1 ); \
B0 = v128_ror64( B0, 24 ); \
B1 = v128_ror64( B1, 24 ); \
}
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = fBlaMka(A0, B0); \
A1 = fBlaMka(A1, B1); \
\
D0 = v128_xor(D0, A0); \
D1 = v128_xor(D1, A1); \
\
D0 = v128_ror64(D0, 16); \
D1 = v128_ror64(D1, 16); \
\
C0 = fBlaMka(C0, D0); \
C1 = fBlaMka(C1, D1); \
\
B0 = v128_xor(B0, C0); \
B1 = v128_xor(B1, C1); \
\
B0 = v128_ror64(B0, 63); \
B1 = v128_ror64(B1, 63); \
} while ((void)0, 0)
#define G2( A0, B0, C0, D0, A1, B1, C1, D1 ) \
{ \
A0 = fBlaMka( A0, B0 ); \
A1 = fBlaMka( A1, B1 ); \
D0 = v128_xor( D0, A0 ); \
D1 = v128_xor( D1, A1 ); \
D0 = v128_ror64( D0, 16 ); \
D1 = v128_ror64( D1, 16 ); \
C0 = fBlaMka( C0, D0 ); \
C1 = fBlaMka( C1, D1 ); \
B0 = v128_xor( B0, C0 ); \
B1 = v128_xor( B1, C1 ); \
B0 = v128_ror64( B0, 63 ); \
B1 = v128_ror64( B1, 63 ); \
}
#if defined(__SSSE3__) || defined(__ARM_NEON)

View File

@@ -465,6 +465,7 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)
{
if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 += 1;
blake512_transform( sc->H, (uint64_t*)sc->buf, sc->T0, sc->T1 );
sc->ptr = 0;
}
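The T0/T1 update above maintains a 128-bit count of message bits. A scalar sketch of the carry logic (illustration only):

#include <stdint.h>

static inline void blake512_count_block( uint64_t *T0, uint64_t *T1 )
{
    *T0 += 1024;        // one 128-byte block adds 1024 message bits
    if ( *T0 < 1024 )   // unsigned wrap-around signals a carry out of T0
        *T1 += 1;
}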
@@ -474,7 +475,7 @@ void blake512_update(blake512_context *sc, const void *data, size_t len)
void blake512_close( blake512_context *sc, void *dst )
{
unsigned char buf[128] __attribute__((aligned(32)));
size_t ptr, k;
size_t ptr;
unsigned bit_len;
uint64_t th, tl;
@@ -517,11 +518,8 @@ void blake512_close( blake512_context *sc, void *dst )
*(uint64_t*)(buf + 120) = bswap_64( tl );
blake512_update( sc, buf, 128 );
}
//TODO vectored bswap
for ( k = 0; k < 8; k ++ )
((uint64_t*)dst)[k] = bswap_64( sc->H[k] );
v128_block_bswap64_512( dst, sc->H );
}
void blake512_full( blake512_context *sc, void *dst, const void *data,
@@ -1779,13 +1777,11 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
v256_64( 0x0100000000000000ULL ) );
buf[112>>3] = v256_64( bswap_64( th ) );
buf[120>>3] = v256_64( bswap_64( tl ) );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
@@ -1793,9 +1789,9 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
buf[104>>3] = v256_64( 0x0100000000000000ULL );
buf[112>>3] = v256_64( bswap_64( th ) );
buf[120>>3] = v256_64( bswap_64( tl ) );
blake64_4way( sc, buf, 128 );
}
mm256_block_bswap_64( (__m256i*)dst, sc->H );
}
@@ -1960,21 +1956,21 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
#else // SSE2 & NEON
M0 = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 0] );
M2 = v128_bswap64( sc->buf[ 0] );
M3 = v128_bswap64( sc->buf[ 0] );
M4 = v128_bswap64( sc->buf[ 0] );
M5 = v128_bswap64( sc->buf[ 0] );
M6 = v128_bswap64( sc->buf[ 0] );
M7 = v128_bswap64( sc->buf[ 0] );
M8 = v128_bswap64( sc->buf[ 0] );
M9 = v128_bswap64( sc->buf[ 0] );
MA = v128_bswap64( sc->buf[ 0] );
MB = v128_bswap64( sc->buf[ 0] );
MC = v128_bswap64( sc->buf[ 0] );
MD = v128_bswap64( sc->buf[ 0] );
ME = v128_bswap64( sc->buf[ 0] );
MF = v128_bswap64( sc->buf[ 0] );
M1 = v128_bswap64( sc->buf[ 1] );
M2 = v128_bswap64( sc->buf[ 2] );
M3 = v128_bswap64( sc->buf[ 3] );
M4 = v128_bswap64( sc->buf[ 4] );
M5 = v128_bswap64( sc->buf[ 5] );
M6 = v128_bswap64( sc->buf[ 6] );
M7 = v128_bswap64( sc->buf[ 7] );
M8 = v128_bswap64( sc->buf[ 8] );
M9 = v128_bswap64( sc->buf[ 9] );
MA = v128_bswap64( sc->buf[10] );
MB = v128_bswap64( sc->buf[11] );
MC = v128_bswap64( sc->buf[12] );
MD = v128_bswap64( sc->buf[13] );
ME = v128_bswap64( sc->buf[14] );
MF = v128_bswap64( sc->buf[15] );
#endif
@@ -2235,7 +2231,6 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
v128u64_t *buf;
size_t ptr;
const int buf_size = 128; // sizeof/8
DECL_STATE_2X64
buf = sc->buf;
ptr = sc->ptr;
@@ -2247,7 +2242,6 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
return;
}
READ_STATE64(sc);
while ( len > 0 )
{
size_t clen;
@@ -2260,13 +2254,12 @@ blake64_2x64( blake_2x64_big_context *sc, const void *data, size_t len)
len -= clen;
if ( ptr == buf_size )
{
if ( (T0 = T0 + 1024 ) < 1024 )
T1 = T1 + 1;
if ( (sc->T0 = sc->T0 + 1024 ) < 1024 )
sc->T1 = sc->T1 + 1;
blake512_2x64_compress( sc );
ptr = 0;
}
}
WRITE_STATE64(sc);
sc->ptr = ptr;
}
@@ -2280,37 +2273,35 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
ptr = sc->ptr;
bit_len = ((unsigned)ptr << 3);
buf[ptr>>3] = v128_64( 0x80 );
sc->buf[ptr>>3] = v128_64( 0x80 );
tl = sc->T0 + bit_len;
th = sc->T1;
if (ptr == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
}
else if ( sc->T0 == 0 )
{
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
sc->T0 = 0xFFFFFFFFFFFFFC00ULL + bit_len;
sc->T1 = sc->T1 - 1;
}
else
{
sc->T0 -= 1024 - bit_len;
}
sc->T0 -= 1024 - bit_len;
if ( ptr <= 104 )
{
v128_memset_zero( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
buf[104>>3] = v128_or( buf[104>>3], v128_64( 0x0100000000000000ULL ) );
buf[112>>3] = v128_64( bswap_64( th ) );
buf[120>>3] = v128_64( bswap_64( tl ) );
blake64_2x64( sc, buf + (ptr>>3), 128 - ptr );
v128_memset_zero( sc->buf + (ptr>>3) + 1, (104-ptr) >> 3 );
sc->buf[104>>3] = v128_or( sc->buf[104>>3],
v128_64( 0x0100000000000000ULL ) );
sc->buf[112>>3] = v128_64( bswap_64( th ) );
sc->buf[120>>3] = v128_64( bswap_64( tl ) );
blake64_2x64( sc, sc->buf + (ptr>>3), 128 - ptr );
}
else
{
v128_memset_zero( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_2x64( sc, buf + (ptr>>3), 128 - ptr );
v128_memset_zero( sc->buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_2x64( sc, sc->buf + (ptr>>3), 128 - ptr );
sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
v128_memset_zero( buf, 112>>3 );
@@ -2319,6 +2310,7 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
buf[120>>3] = v128_64( bswap_64( tl ) );
blake64_2x64( sc, buf, 128 );
}
v128_block_bswap64( (v128u64_t*)dst, sc->H );
}
@@ -2326,7 +2318,6 @@ blake64_2x64_close( blake_2x64_big_context *sc, void *dst )
void blake512_2x64_full( blake_2x64_big_context *sc, void * dst,
const void *data, size_t len )
{
// init
casti_v128u64( sc->H, 0 ) = v128_64( 0x6A09E667F3BCC908 );

View File

@@ -1936,7 +1936,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#if defined(__SSE4_2__) || defined(__ARM_NEON)
#define DECL_STATE_2x64 \
v128_t c0, c1, c2, c3, c4, c5, c6, c7; \
v128u64_t c0, c1, c2, c3, c4, c5, c6, c7; \
#define READ_STATE_2x64(sc) \
c0 = sc->h[0]; \
@@ -1960,13 +1960,13 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#define INPUT_2x64 \
{ \
v128_t db = *buf; \
const v128_t zero = v128_zero; \
v128u64_t db = *buf; \
const v128u64_t zero = v128_64( 0ull ); \
const uint64_t *tp = (const uint64_t*)T512; \
m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
for ( int i = 63; i >= 0; i-- ) \
{ \
v128_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
v128u64_t dm = v128_cmpgt64( zero, v128_sl64( db, i ) ); \
m0 = v128_xor( m0, v128_and( dm, v128_64( tp[0] ) ) ); \
m1 = v128_xor( m1, v128_and( dm, v128_64( tp[1] ) ) ); \
m2 = v128_xor( m2, v128_and( dm, v128_64( tp[2] ) ) ); \
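The INPUT_2x64 loop above is a table-driven linear expansion: each of the 64 message bits selects one 8-word row of T512, and the selected rows are XORed together. A scalar sketch, assuming the elided rest of the loop steps tp forward by 8 words per bit so that row r corresponds to bit r (LSB first, matching the i = 63..0 shift-and-compare mask trick):

#include <stdint.h>

static void hamsi_input_scalar( uint64_t m[8], uint64_t db,
                                const uint64_t T512_rows[64][8] )   // hypothetical layout
{
    for ( int k = 0; k < 8; k++ ) m[k] = 0;
    for ( int bit = 0; bit < 64; bit++ )
        if ( ( db >> bit ) & 1 )
            for ( int k = 0; k < 8; k++ )
                m[k] ^= T512_rows[bit][k];
}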
@@ -1982,7 +1982,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
// v3 no ternary logic, 15 instructions, 9 TL equivalent instructions
#define SBOX_2x64( a, b, c, d ) \
{ \
v128_t tb, td; \
v128u64_t tb, td; \
td = v128_xorand( d, a, c ); \
tb = v128_xoror( b, d, a ); \
c = v128_xor3( c, td, b ); \
@@ -2010,7 +2010,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#define ROUND_2x64( alpha ) \
{ \
v128_t t0, t1, t2, t3, t4, t5; \
v128u64_t t0, t1, t2, t3, t4, t5; \
const v128_t mask = v128_64( 0x00000000ffffffff ); \
s0 = v128_xor( s0, alpha[ 0] ); \
s1 = v128_xor( s1, alpha[ 1] ); \
@@ -2107,7 +2107,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#define P_2x64 \
{ \
v128_t alpha[16]; \
v128u64_t alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = v128_64( ( (uint64_t*)alpha_n )[i] ); \
@@ -2126,7 +2126,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
#define PF_2x64 \
{ \
v128_t alpha[16]; \
v128u64_t alpha[16]; \
const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
for( int i = 0; i < 16; i++ ) \
alpha[i] = v128_64( ( (uint64_t*)alpha_f )[i] ); \
@@ -2193,7 +2193,7 @@ void hamsi64_big( hamsi_2x64_context *sc, v128_t *buf, size_t num )
void hamsi64_big_final( hamsi_2x64_context *sc, v128_t *buf )
{
v128_t m0, m1, m2, m3, m4, m5, m6, m7;
v128u64_t m0, m1, m2, m3, m4, m5, m6, m7;
DECL_STATE_2x64;
READ_STATE_2x64( sc );
INPUT_2x64;
@@ -2231,15 +2231,15 @@ void hamsi512_2x64_update( hamsi_2x64_context *sc, const void *data,
void hamsi512_2x64_close( hamsi_2x64_context *sc, void *dst )
{
v128_t pad[1];
v128u32_t pad;
uint32_t ch, cl;
ch = bswap_32( sc->count_high );
cl = bswap_32( sc->count_low + ( sc->partial_len << 3 ) );
pad[0] = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
pad = v128_64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
sc->buf[0] = v128_64( 0x80 );
hamsi64_big( sc, sc->buf, 1 );
hamsi64_big_final( sc, pad );
hamsi64_big_final( sc, &pad );
v128_block_bswap32( (v128_t*)dst, sc->h );
}

View File

@@ -852,48 +852,10 @@ void jh512_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data, size_t le
// SSE2 & NEON
#if defined(__AVX512VL__)
//TODO enable for AVX10_256, not used with AVX512VL
#define v128_notxorandnot( a, b, c ) \
_mm_ternarylogic_epi64( a, b, c, 0x2d )
#else
#define v128_notxorandnot( a, b, c ) \
v128_xor( v128_not( a ), v128_andnot( b, c ) )
#endif
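A quick sanity check (not part of the diff) that the immediate 0x2d above encodes the same Boolean function as the SSE2 fallback, ~a ^ (~b & c), assuming the usual convention that bit (a<<2)|(b<<1)|c of the immediate is the output:

#include <stdio.h>

int main(void)
{
    const unsigned imm = 0x2d;
    for ( unsigned i = 0; i < 8; i++ )
    {
        const unsigned a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
        const unsigned f = ( ~a ^ ( ~b & c ) ) & 1;   // the SSE2 fallback, bit by bit
        printf( "a=%u b=%u c=%u  imm=%u  f=%u\n", a, b, c, (imm >> i) & 1, f );
    }
    return 0;   // both columns agree for all eight rows
}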
#define Sb(x0, x1, x2, x3, c) \
{ \
v128u64_t cc = v128_64( c ); \
x3 = v128_not( x3 ); \
x0 = v128_xor( x0, v128_andnot( x2, cc ) ); \
tmp = v128_xor( cc, v128_and( x0, x1 ) ); \
x0 = v128_xor( x0, v128_and( x2, x3 ) ); \
x3 = v128_xor( x3, v128_andnot( x1, x2 ) ); \
x1 = v128_xor( x1, v128_and( x0, x2 ) ); \
x2 = v128_xor( x2, v128_andnot( x3, x0 ) ); \
x0 = v128_xor( x0, v128_or( x1, x3 ) ); \
x3 = v128_xor( x3, v128_and( x1, x2 ) ); \
x1 = v128_xor( x1, v128_and( tmp, x0 ) ); \
x2 = v128_xor( x2, tmp ); \
}
#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
{ \
x4 = v128_xor( x4, x1 ); \
x5 = v128_xor( x5, x2 ); \
x6 = v128_xor( x6, v128_xor( x3, x0 ) ); \
x7 = v128_xor( x7, x0 ); \
x0 = v128_xor( x0, x5 ); \
x1 = v128_xor( x1, x6 ); \
x2 = v128_xor( x2, v128_xor( x7, x4 ) ); \
x3 = v128_xor( x3, x4 ); \
}
/*
#define Sb(x0, x1, x2, x3, c) \
{ \
const v128u64_t cc = v128_64( c ); \
@@ -920,7 +882,6 @@ void jh512_4x64_ctx( jh_4x64_context *cc, void *dst, const void *data, size_t le
x2 = v128_xor3( x2, x7, x4 ); \
x3 = v128_xor( x3, x4 ); \
}
*/
#undef Wz
#define Wz(x, c, n) \

View File

@@ -563,7 +563,7 @@ static void keccak64x2_close( keccak64_ctx_v128 *kc, void *dst,
{
unsigned eb;
union {
v128_t tmp[lim + 1];
v128_t tmp[140];
uint64_t dummy; /* for alignment */
} u;
size_t j;

View File

@@ -33,43 +33,39 @@
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
v128_t b = v128_xor( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
}
#elif defined(__SSE4_1__)
#define MULT2( a0, a1 ) do \
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, \
_mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
v128_t b = _mm_shuffle_epi32( a1, 0 ); \
b = v128_xor( a0, v128_mask32( b, 0x4 ) ); \
a0 = _mm_alignr_epi8( a1, b, 4 ); \
a1 = _mm_alignr_epi8( b, a1, 4 ); \
} while(0)
}
#elif defined(__ARM_NEON)
const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
// { a1_0, 0, a1_0, a1_0 }
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, \
v128_and( v128_32( vgetq_lane_u32( a1, 0 ) ), mask ) ); \
v128_t b = v128_xor( a0, v128_and( vdupq_laneq_u32( a1, 0 ), MASK ) ); \
a0 = v128_alignr32( a1, b, 1 ); \
a1 = v128_alignr32( b, a1, 1 ); \
}
#else // assume SSE2
#define MULT2( a0, a1 ) do \
#define MULT2( a0, a1 ) \
{ \
v128_t b = v128_xor( a0, \
_mm_shuffle_epi32( v128_and( a1, MASK ), 0x10 ) ); \
v128_t b = v128_xor( a0, v128_and( _mm_shuffle_epi32( a1, 0 ), MASK ) ); \
a0 = v128_or( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
a1 = v128_or( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
} while(0)
}
#endif
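A scalar sketch of what the MULT2 variants above compute, derived from the x86 paths (the masked broadcast keeps a1's word 0 in lanes 0, 1 and 3); here a0 holds 32-bit words 0..3 and a1 words 4..7 of the 256-bit value:

#include <stdint.h>

static inline void mult2_scalar( uint32_t a0[4], uint32_t a1[4] )
{
    const uint32_t t = a1[0];                                    // feedback word
    const uint32_t b[4] = { a0[0] ^ t, a0[1] ^ t, a0[2], a0[3] ^ t };

    a0[0] = b[1];  a0[1] = b[2];  a0[2] = b[3];  a0[3] = t;      // shift down one word
    a1[0] = a1[1]; a1[1] = a1[2]; a1[2] = a1[3]; a1[3] = b[0];
}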
@@ -137,8 +133,8 @@ const uint32x4_t mask = { 0xffffffff, 0, 0xffffffff, 0xffffffff };
t0 = v128_shufll32( a1 ); \
a1 = v128_unpacklo32( t0, a0 ); \
t0 = v128_unpackhi32( t0, a0 ); \
t1 = v128_swap64( t0 ); \
a0 = v128_swap64( a1 ); \
t1 = v128_rev64( t0 ); \
a0 = v128_rev64( a1 ); \
SUBCRUMB( t1, t0, a0, a1 ); \
t0 = v128_unpacklo32( t0, t1 ); \
a1 = v128_unpacklo32( a1, a0 ); \
@@ -224,9 +220,10 @@ static const uint32_t CNS_INIT[128] __attribute((aligned(16))) = {
};
v128_t CNS128[32];
static v128_t CNS128[32];
#if !defined(__SSE4_1__)
v128_t MASK;
static v128_t MASK;
#endif
int init_luffa(hashState_luffa *state, int hashbitlen)
@@ -235,13 +232,13 @@ int init_luffa(hashState_luffa *state, int hashbitlen)
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK = v128_set32(0x00000000, 0x00000000, 0x00000000, 0xffffffff);
MASK = v128_set32( 0xffffffff, 0, 0xffffffff, 0xffffffff );
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
for ( i=0; i<10; i++ )
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
memset(state->buffer, 0, sizeof state->buffer );
return 0;
}
@@ -268,7 +265,7 @@ int update_luffa( hashState_luffa *state, const void *data,
// remaining data bytes
casti_v128( state->buffer, 0 ) = v128_bswap32( cast_v128( data ) );
// padding of partial block
casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 );
casti_v128( state->buffer, 1 ) = v128_set32( 0, 0, 0, 0x80000000 );
}
return 0;
@@ -327,7 +324,6 @@ int update_and_final_luffa( hashState_luffa *state, void* output,
return 0;
}
int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
const void* data, size_t inlen )
{
@@ -336,13 +332,13 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
state->hashbitlen = hashbitlen;
#if !defined(__SSE4_1__)
/* set the lower 32 bits to '1' */
MASK= v128_set64( 0, 0x00000000ffffffff );
MASK= v128_set32( 0xffffffff, 0, 0xffffffff, 0xffffffff );
#endif
/* set the 32-bit round constant values to the 128-bit data field */
for ( i=0; i<32; i++ )
CNS128[i] = v128_load( (v128_t*)&CNS_INIT[i*4] );
CNS128[i] = casti_v128( CNS_INIT, i );
for ( i=0; i<10; i++ )
state->chainv[i] = v128_load( (v128_t*)&IV[i*4] );
state->chainv[i] = casti_v128( IV, i );
memset(state->buffer, 0, sizeof state->buffer );
// update
@@ -376,16 +372,15 @@ int luffa_full( hashState_luffa *state, void* output, int hashbitlen,
return 0;
}
/***************************************************/
/* Round function */
/* state: hash context */
static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 )
{
v128_t t0, t1;
v128_t *chainv = state->chainv;
v128_t x0, x1, x2, x3, x4, x5, x6, x7;
v128u32_t t0, t1;
v128u32_t *chainv = state->chainv;
v128u32_t x0, x1, x2, x3, x4, x5, x6, x7;
t0 = v128_xor3( chainv[0], chainv[2], chainv[4] );
t1 = v128_xor3( chainv[1], chainv[3], chainv[5] );
@@ -472,7 +467,7 @@ static void rnd512( hashState_luffa *state, v128_t msg1, v128_t msg0 )
chainv[5] = v128_rol32( chainv[5], 2 );
chainv[7] = v128_rol32( chainv[7], 3 );
chainv[9] = v128_rol32( chainv[9], 4 );
NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );

View File

@@ -11,7 +11,7 @@
#endif
#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#if !defined(__AES__) // && !defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#endif
@@ -19,7 +19,7 @@
#define ALLIUM_16WAY 1
#elif defined(__AVX2__)
#define ALLIUM_8WAY 1
#elif #defined(__SSE2__) || defined(__ARM_NEON)
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define ALLIUM_4WAY 1
#endif
@@ -30,7 +30,7 @@ typedef union {
cube_4way_2buf_context cube;
skein256_8way_context skein;
#if defined(__VAES__)
groestl256_4way_context groestl;
groestl256_4way_context groestl;
#else
hashState_groestl256 groestl;
#endif
@@ -465,12 +465,12 @@ typedef union
{
keccak256_2x64_context keccak;
cubehashParam cube;
#if defined(__x86_64__)
//#if defined(__x86_64__)
skein256_2x64_context skein;
#else
sph_skein512_context skein;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
//#else
// sph_skein512_context skein;
//#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_groestl256 groestl;
#else
sph_groestl256_context groestl;
@@ -516,7 +516,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
#if defined(__x86_64__)
//#if defined(__x86_64__)
intrlv_2x64( vhashA, hash0, hash1, 256 );
skein256_2x64_init( &ctx.skein );
skein256_2x64_update( &ctx.skein, vhashA, 32 );
@@ -527,6 +527,7 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
skein256_2x64_update( &ctx.skein, vhashA, 32 );
skein256_2x64_close( &ctx.skein, vhashA );
dintrlv_2x64( hash2, hash3, vhashA, 256 );
/*
#else
sph_skein256_init( &ctx.skein );
sph_skein256( &ctx.skein, hash0, 32 );
@@ -541,8 +542,8 @@ static void allium_4way_hash( void *hash, const void *midstate_vars,
sph_skein256( &ctx.skein, hash3, 32 );
sph_skein256_close( &ctx.skein, hash3 );
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
*/
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl256_full( &ctx.groestl, hash0, hash0, 256 );
groestl256_full( &ctx.groestl, hash1, hash1, 256 );
groestl256_full( &ctx.groestl, hash2, hash2, 256 );

View File

@@ -2303,9 +2303,8 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[2] = _mm_blend_epi16( t0, t2, 0x0f );
XB[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON
/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2326,9 +2325,10 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[1] = v128_blendv( t1, t3, mask_3c );
XB[2] = v128_blendv( t2, t0, mask_f0 );
XB[3] = v128_blendv( t3, t1, mask_3c );
*/
#endif
/*
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3;
YA0 = v128_set32( xa[15], xa[10], xa[ 5], xa[ 0] );
@@ -2348,8 +2348,7 @@ static inline void salsa_simd128_shuffle_2buf( uint32_t *xa, uint32_t *xb )
XB[2] = YB2;
XA[3] = YA3;
XB[3] = YB3;
#endif
*/
}
static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
@@ -2357,8 +2356,8 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
v128_t *XA = (v128_t*)xa;
v128_t *XB = (v128_t*)xb;
#if defined(__SSE4_1__)
#if defined(__SSE4_1__)
v128_t t0 = _mm_blend_epi16( XA[0], XA[2], 0xf0 );
v128_t t1 = _mm_blend_epi16( XA[0], XA[2], 0x0f );
@@ -2377,9 +2376,8 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
XB[2] = _mm_blend_epi16( t1, t3, 0xcc );
XB[3] = _mm_blend_epi16( t1, t3, 0x33 );
#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON
/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2389,19 +2387,21 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
v128_t t2 = v128_blendv( XA[1], XA[3], mask_3c );
v128_t t3 = v128_blendv( XA[3], XA[1], mask_3c );
XA[0] = v128_blendv( t0, t2, mask_cc );
XA[1] = v128_blendv( t1, t3, mask_cc );
XA[2] = v128_blendv( t2, t0, mask_cc );
XA[1] = v128_blendv( t2, t0, mask_cc );
XA[2] = v128_blendv( t1, t3, mask_cc );
XA[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XB[0], XB[2], mask_f0 );
t1 = v128_blendv( XB[1], XB[3], mask_3c );
t2 = v128_blendv( XB[2], XB[0], mask_f0 );
t1 = v128_blendv( XB[2], XB[0], mask_f0 );
t2 = v128_blendv( XB[1], XB[3], mask_3c );
t3 = v128_blendv( XB[3], XB[1], mask_3c );
XB[0] = v128_blendv( t0, t2, mask_cc );
XB[1] = v128_blendv( t1, t3, mask_cc );
XB[2] = v128_blendv( t2, t0, mask_cc );
XB[1] = v128_blendv( t2, t0, mask_cc );
XB[2] = v128_blendv( t1, t3, mask_cc );
XB[3] = v128_blendv( t3, t1, mask_cc );
*/
#endif
/*
v128_ovly ya[4], za[4], yb[4], zb[4];
ya[0].m128 = XA[0];
@@ -2457,9 +2457,7 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb )
XB[2] = zb[2].m128;
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
#endif
*/
}
static void salsa8_simd128_2buf( uint32_t * const ba, uint32_t * const bb,
@@ -2611,7 +2609,7 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
v128_t *XB = (v128_t*)xb;
v128_t *XC = (v128_t*)xc;
#if defined(__SSE4_1__)
#if defined(__SSE4_1__)
v128_t t0 = _mm_blend_epi16( XA[0], XA[1], 0xcc );
v128_t t1 = _mm_blend_epi16( XA[0], XA[1], 0x33 );
@@ -2638,9 +2636,8 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
XC[2] = _mm_blend_epi16( t0, t2, 0x0f );
XC[3] = _mm_blend_epi16( t1, t3, 0xc3 );
#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON
/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2650,28 +2647,29 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
v128_t t2 = v128_blendv( XA[2], XA[3], mask_cc );
v128_t t3 = v128_blendv( XA[3], XA[2], mask_cc );
XA[0] = v128_blendv( t0, t2, mask_f0 );
XA[1] = v128_blendv( t1, t3, mask_3c );
XA[2] = v128_blendv( t2, t0, mask_f0 );
XA[1] = v128_blendv( t2, t0, mask_f0 );
XA[2] = v128_blendv( t1, t3, mask_3c );
XA[3] = v128_blendv( t3, t1, mask_3c );
t0 = v128_blendv( XB[0], XB[1], mask_cc );
t1 = v128_blendv( XB[1], XB[0], mask_cc );
t2 = v128_blendv( XB[2], XB[3], mask_cc );
t3 = v128_blendv( XB[3], XB[2], mask_cc );
XB[0] = v128_blendv( t0, t2, mask_f0 );
XB[1] = v128_blendv( t1, t3, mask_3c );
XB[2] = v128_blendv( t2, t0, mask_f0 );
XB[1] = v128_blendv( t2, t0, mask_f0 );
XB[2] = v128_blendv( t1, t3, mask_3c );
XB[3] = v128_blendv( t3, t1, mask_3c );
t0 = v128_blendv( XC[0], XC[1], mask_cc );
t1 = v128_blendv( XC[1], XC[0], mask_cc );
t2 = v128_blendv( XC[2], XC[3], mask_cc );
t3 = v128_blendv( XC[3], XC[2], mask_cc );
XC[0] = v128_blendv( t0, t2, mask_f0 );
XC[1] = v128_blendv( t1, t3, mask_3c );
XC[2] = v128_blendv( t2, t0, mask_f0 );
XC[1] = v128_blendv( t2, t0, mask_f0 );
XC[2] = v128_blendv( t1, t3, mask_3c );
XC[3] = v128_blendv( t3, t1, mask_3c );
*/
#endif
/*
v128_t YA0, YA1, YA2, YA3, YB0, YB1, YB2, YB3, YC0, YC1, YC2, YC3;
YA0 = v128_set32( xa[15], xa[10], xa[ 5], xa[ 0] );
@@ -2699,9 +2697,7 @@ static inline void salsa_simd128_shuffle_3buf( uint32_t *xa, uint32_t *xb,
XA[3] = YA3;
XB[3] = YB3;
XC[3] = YC3;
#endif
*/
}
static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
@@ -2738,9 +2734,8 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
XC[2] = _mm_blend_epi16( t1, t3, 0xcc );
XC[3] = _mm_blend_epi16( t1, t3, 0x33 );
#elif defined(__SSE2__) || defined(__ARM_NEON)
#else // SSE2 or NEON
/*
const v128u64_t mask_cc = v128_set64(0xffffffff00000000, 0xffffffff00000000);
const v128u64_t mask_f0 = v128_set64(0xffffffffffffffff, 0);
const v128u64_t mask_3c = v128_set64(0x00000000ffffffff, 0xffffffff00000000);
@@ -2750,27 +2745,29 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
v128_t t2 = v128_blendv( XA[1], XA[3], mask_3c );
v128_t t3 = v128_blendv( XA[3], XA[1], mask_3c );
XA[0] = v128_blendv( t0, t2, mask_cc );
XA[1] = v128_blendv( t1, t3, mask_cc );
XA[2] = v128_blendv( t2, t0, mask_cc );
XA[1] = v128_blendv( t2, t0, mask_cc );
XA[2] = v128_blendv( t1, t3, mask_cc );
XA[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XB[0], XB[2], mask_f0 );
t1 = v128_blendv( XB[1], XB[3], mask_3c );
t2 = v128_blendv( XB[2], XB[0], mask_f0 );
t1 = v128_blendv( XB[2], XB[0], mask_f0 );
t2 = v128_blendv( XB[1], XB[3], mask_3c );
t3 = v128_blendv( XB[3], XB[1], mask_3c );
XB[0] = v128_blendv( t0, t2, mask_cc );
XB[1] = v128_blendv( t1, t3, mask_cc );
XB[2] = v128_blendv( t2, t0, mask_cc );
XB[1] = v128_blendv( t2, t0, mask_cc );
XB[2] = v128_blendv( t1, t3, mask_cc );
XB[3] = v128_blendv( t3, t1, mask_cc );
t0 = v128_blendv( XC[0], XC[2], mask_f0 );
t1 = v128_blendv( XC[1], XC[3], mask_3c );
t2 = v128_blendv( XC[2], XC[0], mask_f0 );
t1 = v128_blendv( XC[2], XC[0], mask_f0 );
t2 = v128_blendv( XC[1], XC[3], mask_3c );
t3 = v128_blendv( XC[3], XC[1], mask_3c );
XC[0] = v128_blendv( t0, t2, mask_cc );
XC[1] = v128_blendv( t1, t3, mask_cc );
XC[2] = v128_blendv( t2, t0, mask_cc );
XC[1] = v128_blendv( t2, t0, mask_cc );
XC[2] = v128_blendv( t1, t3, mask_cc );
XC[3] = v128_blendv( t3, t1, mask_cc );
*/
#endif
/*
v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4];
ya[0].m128 = XA[0];
@@ -2850,9 +2847,7 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb,
XA[3] = za[3].m128;
XB[3] = zb[3].m128;
XC[3] = zc[3].m128;
#endif
*/
}
// Triple buffered, 3x memory usage

View File

@@ -35,41 +35,47 @@
//#include <mm_malloc.h>
#include "malloc-huge.h"
static const uint32_t keypad[12] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
static const uint32_t innerpad[11] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
static const uint32_t outerpad[8] = {
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
static const uint32_t finalblk[16] = {
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
#define SCRYPT_THROUGHPUT 2
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SCRYPT_THROUGHPUT 4
#else
#define SCRYPT_THROUGHPUT 1
#endif
static const uint32_t sha256_initial_state[8] =
static const uint32_t sha256_initial_state[8] __attribute((aligned(32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SCRYPT_THROUGHPUT 16
#elif defined(__AVX2__)
#define SCRYPT_THROUGHPUT 8
#elif defined(__SHA__) // NEON?
#define SCRYPT_THROUGHPUT 2
#else
#define SCRYPT_THROUGHPUT 4
#endif
// static int scrypt_throughput = 0;
static int scratchbuf_size = 0;
static __thread uint32_t *scratchbuf = NULL;
#if (SCRYPT_THROUGHPUT == 1) || defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
static const uint32_t keypad[12] __attribute((aligned(16))) =
{
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
};
static const uint32_t innerpad[11] __attribute((aligned(16))) =
{
0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
};
static const uint32_t outerpad[8] __attribute((aligned(16))) =
{
0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
};
static const uint32_t finalblk[16] __attribute((aligned(16))) =
{
0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
};
// change this to a constant to be used directly as input state arg
// vectors still need an init function.
static inline void sha256_init_state( uint32_t *state )
@@ -155,7 +161,9 @@ static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
output[i] = bswap_32( ostate[i] );
}
#if defined(__SHA__)
#endif // throughput 1
//
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
static inline void HMAC_SHA256_80_init_SHA_2BUF( const uint32_t *key0,
const uint32_t *key1, uint32_t *tstate0, uint32_t *tstate1,
@@ -266,7 +274,11 @@ static inline void PBKDF2_SHA256_128_32_SHA_2BUF( uint32_t *tstate0,
#endif // SHA
static const uint32_t keypad_4way[4 * 12] = {
static const uint32_t keypad_4way[ 4*12 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -280,7 +292,8 @@ static const uint32_t keypad_4way[4 * 12] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000280, 0x00000280, 0x00000280, 0x00000280
};
static const uint32_t innerpad_4way[4 * 11] = {
static const uint32_t innerpad_4way[ 4*11 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -293,7 +306,8 @@ static const uint32_t innerpad_4way[4 * 11] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0
};
static const uint32_t outerpad_4way[4 * 8] = {
static const uint32_t outerpad_4way[ 4*8 ] __attribute((aligned(32))) =
{
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
@@ -1221,10 +1235,10 @@ static int scrypt_N_1_1_256_16way( const uint32_t *input, uint32_t *output,
#endif // AVX512
#if ( SCRYPT_THROUGHPUT == 2 ) && defined(__SHA__)
#if ( SCRYPT_THROUGHPUT == 2 ) && ( defined(__SHA__) || defined(__ARM_FEATURE_SHA2) )
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
uint32_t *midstate, int N, int thrid )
static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input,
uint32_t *output, uint32_t *midstate, int N, int thrid )
{
uint32_t _ALIGN(128) tstate[ 2*8 ];
uint32_t _ALIGN(128) ostate[ 2*8 ];
@@ -1241,13 +1255,13 @@ static int scrypt_N_1_1_256_sha_2buf( const uint32_t *input, uint32_t *output,
scrypt_core_simd128_2buf( W, scratchbuf, N );
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate, ostate+8, W, W+32,
output, output+8 );
PBKDF2_SHA256_128_32_SHA_2BUF( tstate, tstate+8, ostate,
ostate+8, W, W+32, output, output+8 );
return 1;
}
#endif
#endif // THROUGHPUT = 2 && SHA
#if ( SCRYPT_THROUGHPUT == 4 )
@@ -1267,13 +1281,10 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
HMAC_SHA256_80_init( input, tstate, ostate );
PBKDF2_SHA256_80_128( tstate, ostate, input, W );
HMAC_SHA256_80_init( input +20, tstate+ 8, ostate+ 8 );
PBKDF2_SHA256_80_128( tstate+ 8, ostate+ 8, input +20, W+32 );
HMAC_SHA256_80_init( input +40, tstate+16, ostate+16 );
PBKDF2_SHA256_80_128( tstate+16, ostate+16, input +40, W+64 );
HMAC_SHA256_80_init( input +60, tstate+24, ostate+24 );
PBKDF2_SHA256_80_128( tstate+24, ostate+24, input +60, W+96 );
@@ -1303,11 +1314,8 @@ static int scrypt_N_1_1_256_4way_sha( const uint32_t *input, uint32_t *output,
if ( work_restart[thrid].restart ) return 0;
PBKDF2_SHA256_128_32( tstate, ostate, W, output );
PBKDF2_SHA256_128_32( tstate+ 8, ostate+ 8, W+32, output+ 8 );
PBKDF2_SHA256_128_32( tstate+16, ostate+16, W+64, output+16 );
PBKDF2_SHA256_128_32( tstate+24, ostate+24, W+96, output+24 );
return 1;
@@ -1418,14 +1426,14 @@ extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
rc = scrypt_N_1_1_256_8way( data, hash, midstate, opt_param_n,
thr_id );
#elif ( SCRYPT_THROUGHPUT == 4 )
#if defined(__SHA__)
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
rc = scrypt_N_1_1_256_4way_sha( data, hash, midstate, opt_param_n,
thr_id );
#else
rc = scrypt_N_1_1_256_4way( data, hash, midstate, opt_param_n,
thr_id );
#endif
#elif ( SCRYPT_THROUGHPUT == 2 ) && defined(__SHA__)
#elif ( SCRYPT_THROUGHPUT == 2 ) && ( defined(__SHA__) || defined(__ARM_FEATURE_SHA2) )
rc = scrypt_N_1_1_256_sha_2buf( data, hash, midstate, opt_param_n,
thr_id );
#else
@@ -1472,10 +1480,10 @@ bool scrypt_miner_thread_init( int thr_id )
bool register_scrypt_algo( algo_gate_t* gate )
{
#if defined(__SHA__)
gate->optimizations = SSE2_OPT | SHA_OPT;
#if defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT;
#else
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT;
gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#endif
gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
gate->scanhash = (void*)&scanhash_scrypt;
@@ -1492,15 +1500,15 @@ bool register_scrypt_algo( algo_gate_t* gate )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 4 * 128; // 4 way
#elif defined(__SHA__) || defined(__ARM_FEATURE_SHA2)
// scrypt_throughput = 2;
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
#elif defined(__AVX2__)
// scrypt_throughput = 8;
if ( opt_param_n > 0x4000 )
scratchbuf_size = opt_param_n * 3 * 128; // 3 buf
else
scratchbuf_size = opt_param_n * 2 * 128; // 2 way
#elif defined(__SHA__)
// scrypt_throughput = 4;
scratchbuf_size = opt_param_n * 2 * 128; // 2 buf
#else
// scrypt_throughput = 4;
if ( opt_param_n > 0x4000 )
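As a worked example of the sizing above (illustration only): classic scrypt with N = 1024, r = 1, p = 1 needs N * 128 bytes of scratch per lane, so the 2-buffer SHA path allocates 1024 * 2 * 128 = 262144 bytes (256 KiB) per mining thread, and a 4-way configuration 1024 * 4 * 128 = 512 KiB.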

View File

@@ -1,681 +0,0 @@
/*
* Copyright 2011 ArtForz
* Copyright 2011-2013 pooler
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version. See COPYING for more details.
*/
#include "sha256d-4way.h"
#include <string.h>
#include <inttypes.h>
#if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
#define EXTERN_SHA256
#endif
static const uint32_t sha256_h[8] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};
static const uint32_t sha256_k[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
void sha256_init(uint32_t *state)
{
memcpy(state, sha256_h, 32);
}
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
do { \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1; \
} while (0)
/* Adjusted round function for rotating state */
#define RNDr(S, W, i) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + sha256_k[i])
#ifndef EXTERN_SHA256
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
{
uint32_t W[64];
uint32_t S[8];
uint32_t t0, t1;
int i;
/* 1. Prepare message schedule W. */
if (swap) {
for (i = 0; i < 16; i++)
W[i] = swab32(block[i]);
} else
memcpy(W, block, 64);
for (i = 16; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
RNDr(S, W, 0);
RNDr(S, W, 1);
RNDr(S, W, 2);
RNDr(S, W, 3);
RNDr(S, W, 4);
RNDr(S, W, 5);
RNDr(S, W, 6);
RNDr(S, W, 7);
RNDr(S, W, 8);
RNDr(S, W, 9);
RNDr(S, W, 10);
RNDr(S, W, 11);
RNDr(S, W, 12);
RNDr(S, W, 13);
RNDr(S, W, 14);
RNDr(S, W, 15);
RNDr(S, W, 16);
RNDr(S, W, 17);
RNDr(S, W, 18);
RNDr(S, W, 19);
RNDr(S, W, 20);
RNDr(S, W, 21);
RNDr(S, W, 22);
RNDr(S, W, 23);
RNDr(S, W, 24);
RNDr(S, W, 25);
RNDr(S, W, 26);
RNDr(S, W, 27);
RNDr(S, W, 28);
RNDr(S, W, 29);
RNDr(S, W, 30);
RNDr(S, W, 31);
RNDr(S, W, 32);
RNDr(S, W, 33);
RNDr(S, W, 34);
RNDr(S, W, 35);
RNDr(S, W, 36);
RNDr(S, W, 37);
RNDr(S, W, 38);
RNDr(S, W, 39);
RNDr(S, W, 40);
RNDr(S, W, 41);
RNDr(S, W, 42);
RNDr(S, W, 43);
RNDr(S, W, 44);
RNDr(S, W, 45);
RNDr(S, W, 46);
RNDr(S, W, 47);
RNDr(S, W, 48);
RNDr(S, W, 49);
RNDr(S, W, 50);
RNDr(S, W, 51);
RNDr(S, W, 52);
RNDr(S, W, 53);
RNDr(S, W, 54);
RNDr(S, W, 55);
RNDr(S, W, 56);
RNDr(S, W, 57);
RNDr(S, W, 58);
RNDr(S, W, 59);
RNDr(S, W, 60);
RNDr(S, W, 61);
RNDr(S, W, 62);
RNDr(S, W, 63);
/* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++)
state[i] += S[i];
}
#endif /* EXTERN_SHA256 */
static const uint32_t sha256d_hash1[16] = {
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
// this performs the entire hash all over again, why?
// because main function only does 56 rounds.
static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)
{
uint32_t S[16];
int i;
sha256_init(S);
sha256_transform(S, data, 0);
sha256_transform(S, data + 16, 0);
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(hash);
sha256_transform(hash, S, 0);
for (i = 0; i < 8; i++)
hash[i] = swab32(hash[i]);
}
/*
#if defined (__SHA__)
#include "algo/sha/sph_sha2.h"
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
sph_sha256_context ctx __attribute__ ((aligned (64)));
sph_sha256_init( &ctx );
sph_sha256( &ctx, data, len );
sph_sha256_close( &ctx, hash );
sph_sha256_init( &ctx );
sph_sha256( &ctx, hash, 32 );
sph_sha256_close( &ctx, hash );
}
#else
void sha256d(unsigned char *hash, const unsigned char *data, int len)
{
uint32_t S[16], T[16];
int i, r;
sha256_init(S);
for (r = len; r > -9; r -= 64) {
if (r < 64)
memset(T, 0, 64);
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
if (r >= 0 && r < 64)
((unsigned char *)T)[r] = 0x80;
for (i = 0; i < 16; i++)
T[i] = be32dec(T + i);
if (r < 56)
T[15] = 8 * len;
sha256_transform(S, T, 0);
}
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(T);
sha256_transform(T, S, 0);
for (i = 0; i < 8; i++)
be32enc((uint32_t *)hash + i, T[i]);
}
#endif
*/
static inline void sha256d_preextend(uint32_t *W)
{
W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0];
W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1];
W[18] = s1(W[16]) + W[11] + W[ 2];
W[19] = s1(W[17]) + W[12] + s0(W[ 4]);
W[20] = W[13] + s0(W[ 5]) + W[ 4];
W[21] = W[14] + s0(W[ 6]) + W[ 5];
W[22] = W[15] + s0(W[ 7]) + W[ 6];
W[23] = W[16] + s0(W[ 8]) + W[ 7];
W[24] = W[17] + s0(W[ 9]) + W[ 8];
W[25] = s0(W[10]) + W[ 9];
W[26] = s0(W[11]) + W[10];
W[27] = s0(W[12]) + W[11];
W[28] = s0(W[13]) + W[12];
W[29] = s0(W[14]) + W[13];
W[30] = s0(W[15]) + W[14];
W[31] = s0(W[16]) + W[15];
}
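For context on this deleted pooler code: only W[3] of the second 64-byte block carries the nonce, so any schedule term that does not involve W[3] can be computed once per work item. W[16] and W[17] above are fully precomputable; W[18] = s1(W[16]) + W[11] + s0(W[3]) + W[2] is precomputed without its s0(W[3]) term, which sha256d_ms() below adds back for each nonce.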
static inline void sha256d_prehash(uint32_t *S, const uint32_t *W)
{
uint32_t t0, t1;
RNDr(S, W, 0);
RNDr(S, W, 1);
RNDr(S, W, 2);
}
#ifdef EXTERN_SHA256
void sha256d_ms(uint32_t *hash, uint32_t *W,
const uint32_t *midstate, const uint32_t *prehash);
#else
static inline void sha256d_ms(uint32_t *hash, uint32_t *W,
const uint32_t *midstate, const uint32_t *prehash)
{
uint32_t S[64];
uint32_t t0, t1;
int i;
S[18] = W[18];
S[19] = W[19];
S[20] = W[20];
S[22] = W[22];
S[23] = W[23];
S[24] = W[24];
S[30] = W[30];
S[31] = W[31];
W[18] += s0(W[3]);
W[19] += W[3];
W[20] += s1(W[18]);
W[21] = s1(W[19]);
W[22] += s1(W[20]);
W[23] += s1(W[21]);
W[24] += s1(W[22]);
W[25] = s1(W[23]) + W[18];
W[26] = s1(W[24]) + W[19];
W[27] = s1(W[25]) + W[20];
W[28] = s1(W[26]) + W[21];
W[29] = s1(W[27]) + W[22];
W[30] += s1(W[28]) + W[23];
W[31] += s1(W[29]) + W[24];
for (i = 32; i < 64; i += 2) {
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
}
memcpy(S, prehash, 32);
RNDr(S, W, 3);
RNDr(S, W, 4);
RNDr(S, W, 5);
RNDr(S, W, 6);
RNDr(S, W, 7);
RNDr(S, W, 8);
RNDr(S, W, 9);
RNDr(S, W, 10);
RNDr(S, W, 11);
RNDr(S, W, 12);
RNDr(S, W, 13);
RNDr(S, W, 14);
RNDr(S, W, 15);
RNDr(S, W, 16);
RNDr(S, W, 17);
RNDr(S, W, 18);
RNDr(S, W, 19);
RNDr(S, W, 20);
RNDr(S, W, 21);
RNDr(S, W, 22);
RNDr(S, W, 23);
RNDr(S, W, 24);
RNDr(S, W, 25);
RNDr(S, W, 26);
RNDr(S, W, 27);
RNDr(S, W, 28);
RNDr(S, W, 29);
RNDr(S, W, 30);
RNDr(S, W, 31);
RNDr(S, W, 32);
RNDr(S, W, 33);
RNDr(S, W, 34);
RNDr(S, W, 35);
RNDr(S, W, 36);
RNDr(S, W, 37);
RNDr(S, W, 38);
RNDr(S, W, 39);
RNDr(S, W, 40);
RNDr(S, W, 41);
RNDr(S, W, 42);
RNDr(S, W, 43);
RNDr(S, W, 44);
RNDr(S, W, 45);
RNDr(S, W, 46);
RNDr(S, W, 47);
RNDr(S, W, 48);
RNDr(S, W, 49);
RNDr(S, W, 50);
RNDr(S, W, 51);
RNDr(S, W, 52);
RNDr(S, W, 53);
RNDr(S, W, 54);
RNDr(S, W, 55);
RNDr(S, W, 56);
RNDr(S, W, 57);
RNDr(S, W, 58);
RNDr(S, W, 59);
RNDr(S, W, 60);
RNDr(S, W, 61);
RNDr(S, W, 62);
RNDr(S, W, 63);
for (i = 0; i < 8; i++)
S[i] += midstate[i];
W[18] = S[18];
W[19] = S[19];
W[20] = S[20];
W[22] = S[22];
W[23] = S[23];
W[24] = S[24];
W[30] = S[30];
W[31] = S[31];
memcpy(S + 8, sha256d_hash1 + 8, 32);
S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0];
S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + S[ 1];
S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2];
S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3];
S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4];
S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5];
S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6];
S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7];
S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8];
S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9];
S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10];
S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11];
S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12];
S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13];
S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14];
S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15];
for (i = 32; i < 60; i += 2) {
S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16];
S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15];
}
S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44];
sha256_init(hash);
RNDr(hash, S, 0);
RNDr(hash, S, 1);
RNDr(hash, S, 2);
RNDr(hash, S, 3);
RNDr(hash, S, 4);
RNDr(hash, S, 5);
RNDr(hash, S, 6);
RNDr(hash, S, 7);
RNDr(hash, S, 8);
RNDr(hash, S, 9);
RNDr(hash, S, 10);
RNDr(hash, S, 11);
RNDr(hash, S, 12);
RNDr(hash, S, 13);
RNDr(hash, S, 14);
RNDr(hash, S, 15);
RNDr(hash, S, 16);
RNDr(hash, S, 17);
RNDr(hash, S, 18);
RNDr(hash, S, 19);
RNDr(hash, S, 20);
RNDr(hash, S, 21);
RNDr(hash, S, 22);
RNDr(hash, S, 23);
RNDr(hash, S, 24);
RNDr(hash, S, 25);
RNDr(hash, S, 26);
RNDr(hash, S, 27);
RNDr(hash, S, 28);
RNDr(hash, S, 29);
RNDr(hash, S, 30);
RNDr(hash, S, 31);
RNDr(hash, S, 32);
RNDr(hash, S, 33);
RNDr(hash, S, 34);
RNDr(hash, S, 35);
RNDr(hash, S, 36);
RNDr(hash, S, 37);
RNDr(hash, S, 38);
RNDr(hash, S, 39);
RNDr(hash, S, 40);
RNDr(hash, S, 41);
RNDr(hash, S, 42);
RNDr(hash, S, 43);
RNDr(hash, S, 44);
RNDr(hash, S, 45);
RNDr(hash, S, 46);
RNDr(hash, S, 47);
RNDr(hash, S, 48);
RNDr(hash, S, 49);
RNDr(hash, S, 50);
RNDr(hash, S, 51);
RNDr(hash, S, 52);
RNDr(hash, S, 53);
RNDr(hash, S, 54);
RNDr(hash, S, 55);
RNDr(hash, S, 56);
hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5])
+ S[57] + sha256_k[57];
hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4])
+ S[58] + sha256_k[58];
hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3])
+ S[59] + sha256_k[59];
hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2])
+ S[60] + sha256_k[60]
+ sha256_h[7];
}
#endif /* EXTERN_SHA256 */
#ifdef HAVE_SHA256_4WAY
void sha256d_ms_4way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_4way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) data[4 * 64];
uint32_t _ALIGN(32) hash[4 * 8];
uint32_t _ALIGN(32) midstate[4 * 8];
uint32_t _ALIGN(32) prehash[4 * 8];
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
int i, j;
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
for (i = 31; i >= 0; i--)
for (j = 0; j < 4; j++)
data[i * 4 + j] = data[i];
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
for (i = 7; i >= 0; i--) {
for (j = 0; j < 4; j++) {
midstate[i * 4 + j] = midstate[i];
prehash[i * 4 + j] = prehash[i];
}
}
do {
for (i = 0; i < 4; i++)
data[4 * 3 + i] = ++n;
sha256d_ms_4way(hash, data, midstate, prehash);
for (i = 0; i < 4; i++) {
if (swab32(hash[4 * 7 + i]) <= Htarg) {
pdata[19] = data[4 * 3 + i];
sha256d_80_swap(hash, pdata);
if ( fulltest( hash, ptarget ) && !opt_benchmark )
submit_solution( work, hash, mythr );
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif /* HAVE_SHA256_4WAY */
#ifdef HAVE_SHA256_8WAY
void sha256d_ms_8way(uint32_t *hash, uint32_t *data,
const uint32_t *midstate, const uint32_t *prehash);
static inline int scanhash_sha256d_8way_pooler( struct work *work,
uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) data[8 * 64];
uint32_t _ALIGN(32) hash[8 * 8];
uint32_t _ALIGN(32) midstate[8 * 8];
uint32_t _ALIGN(32) prehash[8 * 8];
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id;
int i, j;
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
for (i = 31; i >= 0; i--)
for (j = 0; j < 8; j++)
data[i * 8 + j] = data[i];
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
for (i = 7; i >= 0; i--) {
for (j = 0; j < 8; j++) {
midstate[i * 8 + j] = midstate[i];
prehash[i * 8 + j] = prehash[i];
}
}
do {
for (i = 0; i < 8; i++)
data[8 * 3 + i] = ++n;
sha256d_ms_8way(hash, data, midstate, prehash);
for (i = 0; i < 8; i++) {
if (swab32(hash[8 * 7 + i]) <= Htarg) {
pdata[19] = data[8 * 3 + i];
sha256d_80_swap(hash, pdata);
if ( fulltest( hash, ptarget ) && !opt_benchmark )
submit_solution( work, hash, mythr );
}
}
} while (n < max_nonce && !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
#endif /* HAVE_SHA256_8WAY */
int scanhash_sha256d_pooler( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t _ALIGN(128) data[64];
uint32_t _ALIGN(32) hash[8];
uint32_t _ALIGN(32) midstate[8];
uint32_t _ALIGN(32) prehash[8];
uint32_t n = pdata[19] - 1;
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
int thr_id = mythr->id; // thr_id arg is deprecated
#ifdef HAVE_SHA256_8WAY
if ( sha256_use_8way() )
return scanhash_sha256d_8way_pooler( work, max_nonce, hashes_done, mythr );
#endif
#ifdef HAVE_SHA256_4WAY
if ( sha256_use_4way() )
return scanhash_sha256d_4way_pooler( work, max_nonce, hashes_done, mythr );
#endif
memcpy(data, pdata + 16, 64);
sha256d_preextend(data);
sha256_init(midstate);
sha256_transform(midstate, pdata, 0);
memcpy(prehash, midstate, 32);
sha256d_prehash(prehash, pdata + 16);
do {
data[3] = ++n;
sha256d_ms(hash, data, midstate, prehash);
if (unlikely(swab32(hash[7]) <= Htarg))
{
pdata[19] = data[3];
sha256d_80_swap(hash, pdata);
if ( fulltest(hash, ptarget) && !opt_benchmark )
submit_solution( work, hash, mythr );
}
} while (likely(n < max_nonce && !work_restart[thr_id].restart));
*hashes_done = n - first_nonce + 1;
pdata[19] = n;
return 0;
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
//#elif defined(SHA256D_8WAY)
// gate->scanhash = (void*)&scanhash_sha256d_8way;
#else
gate->scanhash = (void*)&scanhash_sha256d_pooler;
// gate->scanhash = (void*)&scanhash_sha256d_4way;
#endif
// gate->hash = (void*)&sha256d;
return true;
};

View File

@@ -54,29 +54,29 @@ static const uint32_t K256[64] =
v128_xor( v128_xor( \
v128_ror32(x, 17), v128_ror32(x, 19) ), v128_sr32(x, 10) )
#define SHA2s_MEXP( a, b, c, d ) \
#define SHA256_4X32_MEXP( a, b, c, d ) \
v128_add4_32( SSG2_1( a ), b, SSG2_0( c ), d );
#define SHA256x4_MSG_EXPANSION( W ) \
W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \
W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] ); \
W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \
W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \
W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \
W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \
W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \
W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \
W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \
W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \
W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] ); \
W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] ); \
W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] ); \
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] ); \
W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] ); \
W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] );
#define SHA256_4X32_MSG_EXPANSION( W ) \
W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] ); \
W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] ); \
W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] ); \
W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] ); \
W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] ); \
W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] ); \
W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] ); \
W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] ); \
W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] ); \
W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] ); \
W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] ); \
W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] ); \
W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] ); \
W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] ); \
W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] ); \
W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
#define SHA256_4X32_ROUND(A, B, C, D, E, F, G, H, i, j) \
{ \
v128_t T1, T2; \
v128_t K = v128_32( K256[( (j)+(i) )] ); \
T1 = v128_add32( H, v128_add4_32( BSG2_1(E), CHs(E, F, G), \
@@ -85,31 +85,41 @@ do { \
Y_xor_Z = X_xor_Y; \
D = v128_add32( D, T1 ); \
H = v128_add32( T1, T2 ); \
} while (0)
}
#define SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
#define SHA256_4X32_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
v128_t T1 = v128_add4_32( H, BSG2_1(E), CHs(E, F, G), \
v128_32( K256[(i)+(j)] ) ); \
v128_t T2 = v128_add32( BSG2_0(A), MAJs(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = v128_add32( D, T1 ); \
H = v128_add32( T1, T2 ); \
}
#define SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, j ) \
{ \
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C ); \
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, j ); \
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, j ); \
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, j ); \
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, j ); \
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, j ); \
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, j ); \
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, j ); \
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, j ); \
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, j ); \
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, j ); \
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j ); \
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j ); \
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j ); \
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j ); \
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j ); \
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j ); \
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, j ); \
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, j ); \
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, j ); \
SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, j ); \
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, j ); \
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, j ); \
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, j ); \
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, j ); \
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, j ); \
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, j ); \
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 10, j ); \
SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 11, j ); \
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 12, j ); \
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 13, j ); \
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, j ); \
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, j ); \
}
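Each SHA256_4X32_ROUND above is the standard SHA-256 compression round computed in four 32-bit lanes at once, written in rotating-register form: T1 is added into D, T1+T2 becomes the new H, and the caller permutes which variables play A..H. MAJs reuses the previous round's A^B, carried in Y_xor_Z, so only one fresh XOR is needed per round. A scalar model of a single round under the same conventions (rotr32, sha256_round_model and the s[]/y_xor_z names are illustrative, not this file's API):

/* One scalar SHA-256 round in the style of SHA256_4X32_ROUND.
   s[0..7] = A..H for this round; *y_xor_z holds the cached B^C. */
static inline uint32_t rotr32( uint32_t x, int n )
{   return ( x >> n ) | ( x << ( 32 - n ) );   }

static void sha256_round_model( uint32_t s[8], uint32_t k, uint32_t w,
                                uint32_t *y_xor_z )
{
    uint32_t a = s[0], b = s[1], e = s[4], f = s[5], g = s[6];
    uint32_t bsg1 = rotr32( e, 6 ) ^ rotr32( e, 11 ) ^ rotr32( e, 25 );  /* BSG2_1 */
    uint32_t ch   = ( ( f ^ g ) & e ) ^ g;                               /* CHs    */
    uint32_t t1   = s[7] + bsg1 + ch + k + w;
    uint32_t bsg0 = rotr32( a, 2 ) ^ rotr32( a, 13 ) ^ rotr32( a, 22 );  /* BSG2_0 */
    uint32_t x_xor_y = a ^ b;
    uint32_t maj  = b ^ ( x_xor_y & *y_xor_z );                          /* MAJs   */
    uint32_t t2   = bsg0 + maj;
    *y_xor_z = x_xor_y;       /* becomes B^C after the register rotation */
    s[3] += t1;               /* D */
    s[7]  = t1 + t2;          /* H */
}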
// LE data, no need to byte swap
static inline void SHA256_4WAY_TRANSFORM( v128_t *out, v128_t *W,
static inline void SHA256_4X32_TRANSFORM( v128_t *out, v128_t *W,
const v128_t *in )
{
v128_t A, B, C, D, E, F, G, H;
@@ -123,13 +133,13 @@ static inline void SHA256_4WAY_TRANSFORM( v128_t *out, v128_t *W,
G = in[6];
H = in[7];
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
out[0] = v128_add32( in[0], A );
out[1] = v128_add32( in[1], B );
@@ -142,47 +152,37 @@ static inline void SHA256_4WAY_TRANSFORM( v128_t *out, v128_t *W,
}
// LE data, no need to byte swap
void sha256_4way_transform_le( v128_t *state_out, const v128_t *data,
void sha256_4x32_transform_le( v128_t *state_out, const v128_t *data,
const v128_t *state_in )
{
v128_t W[16];
v128_memcpy( W, data, 16 );
SHA256_4WAY_TRANSFORM( state_out, W, state_in );
SHA256_4X32_TRANSFORM( state_out, W, state_in );
}
// BE data, need to byte swap input data
void sha256_4way_transform_be( v128_t *state_out, const v128_t *data,
void sha256_4x32_transform_be( v128_t *state_out, const v128_t *data,
const v128_t *state_in )
{
v128_t W[16];
v128_block_bswap32( W, data );
v128_block_bswap32( W+8, data+8 );
SHA256_4WAY_TRANSFORM( state_out, W, state_in );
SHA256_4X32_TRANSFORM( state_out, W, state_in );
}
// prehash_3rounds & final_rounds are not working
void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X,
const v128_t *W, const v128_t *state_in )
void sha256_4x32_prehash_3rounds( v128_t *state_mid, v128_t *X,
const v128_t *W, const v128_t *state_in )
{
v128_t A, B, C, D, E, F, G, H;
v128_t A, B, C, D, E, F, G, H, T1;
// precalculate constant part msg expansion for second iteration.
X[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
X[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] );
X[ 2] = v128_add32( v128_add32( SSG2_1( X[ 0] ), W[11] ), W[ 2] );
X[ 3] = v128_add32( v128_add32( SSG2_1( X[ 1] ), W[12] ), SSG2_0( W[ 4] ) );
X[ 4] = v128_add32( v128_add32( W[13], SSG2_0( W[ 5] ) ), W[ 4] );
X[ 5] = v128_add32( v128_add32( W[14], SSG2_0( W[ 6] ) ), W[ 5] );
X[ 6] = v128_add32( v128_add32( W[15], SSG2_0( W[ 7] ) ), W[ 6] );
X[ 7] = v128_add32( v128_add32( X[ 0], SSG2_0( W[ 8] ) ), W[ 7] );
X[ 8] = v128_add32( v128_add32( X[ 1], SSG2_0( W[ 9] ) ), W[ 8] );
X[ 9] = v128_add32( SSG2_0( W[10] ), W[ 9] );
X[10] = v128_add32( SSG2_0( W[11] ), W[10] );
X[11] = v128_add32( SSG2_0( W[12] ), W[11] );
X[12] = v128_add32( SSG2_0( W[13] ), W[12] );
X[13] = v128_add32( SSG2_0( W[14] ), W[13] );
X[14] = v128_add32( SSG2_0( W[15] ), W[14] );
X[15] = v128_add32( SSG2_0( X[ 0] ), W[15] );
X[ 0] = v128_add32( SSG2_0( W[ 1] ), W[ 0] );
X[ 1] = v128_add32( v128_add32( SSG2_1( W[15] ), SSG2_0( W[ 2] ) ), W[ 1] );
X[ 2] = v128_add32( SSG2_1( X[ 0] ), W[ 2] );
X[ 3] = v128_add32( SSG2_1( X[ 1] ), SSG2_0( W[ 4] ) );
X[ 4] = SSG2_0( W[15] );
X[ 5] = v128_add32( SSG2_0( X[ 0] ), W[15] );
// W[0] for round 32
X[ 6] = v128_add32( SSG2_0( X[ 1] ), X[ 0] );
A = v128_load( state_in );
B = v128_load( state_in + 1 );
@@ -194,11 +194,16 @@ void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X,
H = v128_load( state_in + 7 );
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 0 );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 0 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 0 );
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 0 );
// round 3 part 1, avoid nonces W[3]
T1 = v128_add4_32( E, BSG2_1(B), CHs(B, C, D), v128_32( K256[3] ) );
A = v128_add32( A, T1 );
E = v128_add32( T1, v128_add32( BSG2_0(F), MAJs(F, G, H) ) );
v128_store( state_mid , A );
v128_store( state_mid + 1, B );
v128_store( state_mid + 2, C );
@@ -209,7 +214,7 @@ void sha256_4way_prehash_3rounds( v128_t *state_mid, v128_t *X,
v128_store( state_mid + 7, H );
}
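The prehash/final-rounds pair exploits the block-header layout: the nonce is pdata[19], i.e. W[3] of the second 64-byte block, so rounds 0-2, the nonce-free half of round 3 and every schedule term that never touches W[3] can be computed once per work item and reused for every nonce. A rough outline of the intended call pattern, mirroring the scanhash loops later in this change set (variable names are illustrative):

/* once per work item: everything independent of the nonce word W[3] */
sha256_4x32_prehash_3rounds( state_mid, mexp_pre, second_block, midstate );
do
{
    /* per nonce: only W[3] changes; finish round 3 and rounds 4..63 */
    sha256_4x32_final_rounds( out, second_block, midstate, state_mid, mexp_pre );
    /* ... second SHA-256 pass over 'out', then target check ... */
} while ( more_nonces );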
void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data,
void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const v128_t *state_mid, const v128_t *X )
{
v128_t A, B, C, D, E, F, G, H;
@@ -226,45 +231,64 @@ void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data,
G = v128_load( state_mid + 6 );
H = v128_load( state_mid + 7 );
v128_t X_xor_Y, Y_xor_Z = v128_xor( G, H );
v128_t X_xor_Y, Y_xor_Z = v128_xor( F, G );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 0 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 0 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 0 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
// round 3 part 2, add nonces
A = v128_add32( A, W[3] );
E = v128_add32( E, W[3] );
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 0 );
SHA256_4X32_ROUND_NOMSG( D, E, F, G, H, A, B, C, 5, 0 );
SHA256_4X32_ROUND_NOMSG( C, D, E, F, G, H, A, B, 6, 0 );
SHA256_4X32_ROUND_NOMSG( B, C, D, E, F, G, H, A, 7, 0 );
SHA256_4X32_ROUND_NOMSG( A, B, C, D, E, F, G, H, 8, 0 );
SHA256_4X32_ROUND_NOMSG( H, A, B, C, D, E, F, G, 9, 0 );
SHA256_4X32_ROUND_NOMSG( G, H, A, B, C, D, E, F, 10, 0 );
SHA256_4X32_ROUND_NOMSG( F, G, H, A, B, C, D, E, 11, 0 );
SHA256_4X32_ROUND_NOMSG( E, F, G, H, A, B, C, D, 12, 0 );
SHA256_4X32_ROUND_NOMSG( D, E, F, G, H, A, B, C, 13, 0 );
SHA256_4X32_ROUND_NOMSG( C, D, E, F, G, H, A, B, 14, 0 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 0 );
// update precalculated msg expansion with new nonce: W[3].
W[ 0] = X[ 0];
W[ 1] = X[ 1];
W[ 2] = v128_add32( X[ 2], SSG2_0( W[ 3] ) );
W[ 3] = v128_add32( X[ 3], W[ 3] );
W[ 4] = v128_add32( X[ 4], SSG2_1( W[ 2] ) );
W[ 5] = v128_add32( X[ 5], SSG2_1( W[ 3] ) );
W[ 6] = v128_add32( X[ 6], SSG2_1( W[ 4] ) );
W[ 7] = v128_add32( X[ 7], SSG2_1( W[ 5] ) );
W[ 8] = v128_add32( X[ 8], SSG2_1( W[ 6] ) );
W[ 9] = v128_add32( X[ 9], v128_add32( SSG2_1( W[ 7] ), W[ 2] ) );
W[10] = v128_add32( X[10], v128_add32( SSG2_1( W[ 8] ), W[ 3] ) );
W[11] = v128_add32( X[11], v128_add32( SSG2_1( W[ 9] ), W[ 4] ) );
W[12] = v128_add32( X[12], v128_add32( SSG2_1( W[10] ), W[ 5] ) );
W[13] = v128_add32( X[13], v128_add32( SSG2_1( W[11] ), W[ 6] ) );
W[14] = v128_add32( X[14], v128_add32( SSG2_1( W[12] ), W[ 7] ) );
W[15] = v128_add32( X[15], v128_add32( SSG2_1( W[13] ), W[ 8] ) );
W[ 4] = v128_add32( W[ 4], SSG2_1( W[ 2] ) );
W[ 5] = SSG2_1( W[ 3] );
W[ 6] = v128_add32( W[15], SSG2_1( W[ 4] ) );
W[ 7] = v128_add32( X[ 0], SSG2_1( W[ 5] ) );
W[ 8] = v128_add32( X[ 1], SSG2_1( W[ 6] ) );
W[ 9] = v128_add32( SSG2_1( W[ 7] ), W[ 2] );
W[10] = v128_add32( SSG2_1( W[ 8] ), W[ 3] );
W[11] = v128_add32( SSG2_1( W[ 9] ), W[ 4] );
W[12] = v128_add32( SSG2_1( W[10] ), W[ 5] );
W[13] = v128_add32( SSG2_1( W[11] ), W[ 6] );
W[14] = v128_add32( X[ 4], v128_add32( SSG2_1( W[12] ), W[ 7] ) );
W[15] = v128_add32( X[ 5], v128_add32( SSG2_1( W[13] ), W[ 8] ) );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
W[ 0] = v128_add32( X[ 6], v128_add32( SSG2_1( W[14] ), W[ 9] ) );
W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );
W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 48 );
A = v128_add32( A, v128_load( state_in ) );
B = v128_add32( B, v128_load( state_in + 1 ) );
@@ -285,10 +309,11 @@ void sha256_4way_final_rounds( v128_t *state_out, const v128_t *data,
v128_store( state_out + 7, H );
}
# if 0

// Working correctly but still slower
int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data,
int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t *state_in, const uint32_t *target )
{
v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
@@ -308,38 +333,38 @@ int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data,
const v128_t IV7 = H;
const v128_t IV6 = G;
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256x4_MSG_EXPANSION( W );
SHA256x4_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
SHA256_4X32_MSG_EXPANSION( W );
SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
W[ 0] = SHA2s_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA2s_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA2s_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA2s_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA2s_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA2s_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA2s_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA2s_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA2s_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA2s_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA2s_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA2s_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA2s_MEXP( W[10], W[ 5], W[13], W[12] );
W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );
v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 0, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 1, 48 );
SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 2, 48 );
SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 3, 48 );
SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 4, 48 );
SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 5, 48 );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 6, 48 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 7, 48 );
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 0, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 1, 48 );
SHA256_4X32_ROUND( G, H, A, B, C, D, E, F, 2, 48 );
SHA256_4X32_ROUND( F, G, H, A, B, C, D, E, 3, 48 );
SHA256_4X32_ROUND( E, F, G, H, A, B, C, D, 4, 48 );
SHA256_4X32_ROUND( D, E, F, G, H, A, B, C, 5, 48 );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 6, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 7, 48 );
SHA256_4X32_ROUND( A, B, C, D, E, F, G, H, 8, 48 );
SHA256_4X32_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
T0 = v128_add32( v128_32( K256[58] ),
v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
@@ -368,7 +393,7 @@ int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data,
F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
// round 61 part 1
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = v128_add32( v128_32( K256[61] ),
v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = v128_add32( G, T0 );
@@ -401,11 +426,11 @@ int sha256_4way_transform_le_short( v128_t *state_out, const v128_t *data,
C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
// rounds 62 & 63
W[14] = SHA2s_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA2s_MEXP( W[13], W[ 8], W[ 0], W[15] );
W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 48 );
SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 48 );
SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 );
SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 );
state_out[0] = v128_add32( state_in[0], A );
state_out[1] = v128_add32( state_in[1], B );
@@ -420,7 +445,7 @@ return 1;
#endif
void sha256_4way_init( sha256_4way_context *sc )
void sha256_4x32_init( sha256_4x32_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = v128_32( sha256_iv[0] );
@@ -433,7 +458,7 @@ void sha256_4way_init( sha256_4way_context *sc )
sc->val[7] = v128_32( sha256_iv[7] );
}
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
void sha256_4x32_update( sha256_4x32_context *sc, const void *data, size_t len )
{
v128_t *vdata = (v128_t*)data;
size_t ptr;
@@ -454,7 +479,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
len -= clen;
if ( ptr == buf_size )
{
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
sha256_4x32_transform_be( sc->val, sc->buf, sc->val );
ptr = 0;
}
clow = sc->count_low;
@@ -465,7 +490,7 @@ void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
}
}
void sha256_4way_close( sha256_4way_context *sc, void *dst )
void sha256_4x32_close( sha256_4x32_context *sc, void *dst )
{
unsigned ptr;
uint32_t low, high;
@@ -479,7 +504,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
if ( ptr > pad )
{
v128_memset_zero( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
sha256_4x32_transform_be( sc->val, sc->buf, sc->val );
v128_memset_zero( sc->buf, pad >> 2 );
}
else
@@ -491,17 +516,17 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
sc->buf[ pad >> 2 ] = v128_32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
sha256_4x32_transform_be( sc->val, sc->buf, sc->val );
v128_block_bswap32( dst, sc->val );
}
void sha256_4way_full( void *dst, const void *data, size_t len )
void sha256_4x32_full( void *dst, const void *data, size_t len )
{
sha256_4way_context ctx;
sha256_4way_init( &ctx );
sha256_4way_update( &ctx, data, len );
sha256_4way_close( &ctx, dst );
sha256_4x32_context ctx;
sha256_4x32_init( &ctx );
sha256_4x32_update( &ctx, data, len );
sha256_4x32_close( &ctx, dst );
}
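A minimal usage sketch of the renamed one-shot wrapper; the 4x32 word-interleaved input layout and the per-lane byte length are assumptions based on the update/close code above, not a documented contract:

v128_t vdata[20];   /* four 80-byte messages, 32-bit word interleaved */
v128_t vhash[8];    /* four interleaved 256-bit digests */
/* ... fill vdata with interleaved message words ... */
sha256_4x32_full( vhash, vdata, 80 );   /* length in bytes per lane */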
#if defined(__AVX2__)


@@ -1200,7 +1200,7 @@ void sha256_neon_sha_transform_le( uint32_t *state_out, const void *input,
MSG2_Y = vsha256su1q_u32( MSG2_Y, MSG0_Y, MSG1_Y ); \
/* Rounds 44-47 */ \
MSG3_X = vsha256su0q_u32( MSG3_X, MSG0_X ); \
MSG3_Y = vsha256su0q_u32( MSG3_X, MSG0_Y ); \
MSG3_Y = vsha256su0q_u32( MSG3_Y, MSG0_Y ); \
TMP2_X = STATE0_X; \
TMP2_Y = STATE0_Y; \
TMP0_X = vaddq_u32( MSG0_X, casti_v128( K256, 12 ) ); \


@@ -97,6 +97,14 @@ void sha256_neon_x2sha_final_rounds( uint32_t *state_out_X,
#define sha256_prehash_3rounds sha256_neon_sha_prehash_3rounds
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
// generic API
#define sha256_transform_le sha256_neon_sha_transform_le
#define sha256_transform_be sha256_neon_sha_transform_be
#define sha256_2x_transform_le sha256_neon_x2sha_transform_le
#define sha256_2x_transform_be sha256_neon_x2sha_transform_be
#define sha256_prehash_3rounds sha256_neon_sha_prehash_3rounds
#define sha256_2x_final_rounds sha256_neon_x2sha_final_rounds
#else
// without HW acceleration...
#include "sph_sha2.h"


@@ -1,9 +1,9 @@
#include "sha256d-4way.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "sha256-hash.h"
#include "sha256d.h"
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
@@ -360,15 +360,17 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256d_4x32( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t istate[8] __attribute__ ((aligned (32)));
v128_t mstate[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t iv[8] __attribute__ ((aligned (32)));
v128_t mhash1[8] __attribute__ ((aligned (32)));
v128_t mhash2[8] __attribute__ ((aligned (32)));
v128_t mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lhash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -376,17 +378,14 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = v128_set32( n+ 3, n+ 2, n+1, n );
vdata[i] = v128_32( pdata[i] );
vdata[16+3] = v128_set32( n+3, n+2, n+1, n );
vdata[16+4] = last_byte;
v128_memset_zero( vdata+16 + 5, 10 );
vdata[16+15] = v128_32( 80*8 );
@@ -396,36 +395,38 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
block[15] = v128_32( 32*8 );
// initialize state
istate[0] = v128_32( sha256_iv[0] );
istate[1] = v128_32( sha256_iv[1] );
istate[2] = v128_32( sha256_iv[2] );
istate[3] = v128_32( sha256_iv[3] );
istate[4] = v128_32( sha256_iv[4] );
istate[5] = v128_32( sha256_iv[5] );
istate[6] = v128_32( sha256_iv[6] );
istate[7] = v128_32( sha256_iv[7] );
iv[0] = v128_32( sha256_iv[0] );
iv[1] = v128_32( sha256_iv[1] );
iv[2] = v128_32( sha256_iv[2] );
iv[3] = v128_32( sha256_iv[3] );
iv[4] = v128_32( sha256_iv[4] );
iv[5] = v128_32( sha256_iv[5] );
iv[6] = v128_32( sha256_iv[6] );
iv[7] = v128_32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( mstate, vdata, istate );
sha256_4x32_transform_le( mhash1, vdata, iv );
sha256_4x32_prehash_3rounds( mhash2, mexp_pre, vdata + 16, mhash1 );
do
{
sha256_4way_transform_le( block, vdata+16, mstate );
sha256_4way_transform_le( hash32, block, istate );
v128_block_bswap32( hash32, hash32 );
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
sha256_4x32_transform_le( hash32, block, iv );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
if ( unlikely( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
extr_lane_4x32( lhash, hash32, lane, 256 );
casti_v128( lhash, 0 ) = v128_bswap32( casti_v128( lhash, 0 ) );
casti_v128( lhash, 1 ) = v128_bswap32( casti_v128( lhash, 1 ) );
if ( likely( valid_hash( lhash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lhash, mythr );
}
}
}
*noncev = v128_add32( *noncev, four );
vdata[16+3] = v128_add32( vdata[16+3], four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;


@@ -1,3 +1,4 @@
#include "sha256-hash.h"
#include "sha256d.h"
void sha256d( void *hash, const void *data, int len )
@@ -5,4 +6,24 @@ void sha256d( void *hash, const void *data, int len )
sha256_full( hash, data, len );
sha256_full( hash, hash, 32 );
}
bool register_sha256d_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
#if defined(SHA256D_16WAY)
gate->scanhash = (void*)&scanhash_sha256d_16way;
#elif defined(SHA256D_SHA)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_sha;
#elif defined(SHA256D_NEON_SHA2)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256d_neon_sha2;
#elif defined(SHA256D_8WAY)
gate->scanhash = (void*)&scanhash_sha256d_8way;
#elif defined(SHA256D_4WAY)
gate->scanhash = (void*)&scanhash_sha256d_4x32;
#else
gate->hash = (void*)&sha256d;
#endif
return true;
};


@@ -1,7 +1,58 @@
#ifndef __SHA256D_4WAY_H__
#define __SHA256D_4WAY_H__ 1
#include <stdint.h>
#include "algo-gate-api.h"
#include <string.h>
#include <inttypes.h>
#include "sha256-hash.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256D_16WAY 1
#elif defined(__SHA__)
#define SHA256D_SHA 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256D_NEON_SHA2 1
#elif defined(__AVX2__)
#define SHA256D_8WAY 1
#else
#define SHA256D_4WAY 1
#endif
bool register_sha256d_algo( algo_gate_t* gate );
#if defined(SHA256D_16WAY)
int scanhash_sha256d_16way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_8WAY)
int scanhash_sha256d_8way( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_4WAY)
int scanhash_sha256d_4x32( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_SHA)
int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
#if defined(SHA256D_NEON_SHA2)
int scanhash_sha256d_neon_sha2( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr );
#endif
void sha256d( void *hash, const void *data, int len );
bool register_sha256d_algo( algo_gate_t* gate );
#endif
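For reference, the cascade of feature tests above selects exactly one implementation per build; combined with register_sha256d_algo in sha256d.c this resolves roughly as follows (a summary of the visible #if chain, not an exhaustive guarantee):

/* AVX-512 (F, VL, DQ, BW)       -> scanhash_sha256d_16way                  */
/* x86 SHA extensions (__SHA__)  -> scanhash_sha256d_sha                    */
/* ARM NEON + SHA2 extension     -> scanhash_sha256d_neon_sha2              */
/* AVX2                          -> scanhash_sha256d_8way                   */
/* otherwise                     -> scanhash_sha256d_4x32 (SSE2/NEON lanes) */
/* the gate also keeps a scalar sha256d() fallback branch                   */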


@@ -7,15 +7,15 @@
#include "sph_sha2.h"
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA256DT_16X64 1
#define SHA256DT_16X32 1
#elif defined(__x86_64__) && defined(__SHA__)
#define SHA256DT_X86_SHA256 1
#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
#define SHA256DT_NEON_SHA256 1
#elif defined(__AVX2__)
#define SHA256DT_8X64 1
#define SHA256DT_8X32 1
#elif defined (__SSE2__) || defined(__ARM_NEON)
#define SHA256DT_4X64 1
#define SHA256DT_4X32 1
#endif
// else ref, should never happen
@@ -183,9 +183,9 @@ int scanhash_sha256dt_neon_x2sha( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA256DT_16X64)
#elif defined(SHA256DT_16X32)
int scanhash_sha256dt_16x64( struct work *work, const uint32_t max_nonce,
int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m512i block[16] __attribute__ ((aligned (128)));
@@ -275,9 +275,9 @@ int scanhash_sha256dt_16x64( struct work *work, const uint32_t max_nonce,
return 0;
}
#elif defined(SHA256DT_8X64)
#elif defined(SHA256DT_8X32)
int scanhash_sha256dt_8x64( struct work *work, const uint32_t max_nonce,
int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
__m256i vdata[32] __attribute__ ((aligned (64)));
@@ -355,16 +355,18 @@ int scanhash_sha256dt_8x64( struct work *work, const uint32_t max_nonce,
return 0;
}
#elif defined(SHA256DT_4X64)
#elif defined(SHA256DT_4X32)
int scanhash_sha256dt_4x64( struct work *work, const uint32_t max_nonce,
int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t iv[8] __attribute__ ((aligned (32)));
v128_t mhash[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t iv[8] __attribute__ ((aligned (32)));
v128_t mhash1[8] __attribute__ ((aligned (32)));
v128_t mhash2[8] __attribute__ ((aligned (32)));
v128_t mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lhash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
@@ -373,26 +375,22 @@ int scanhash_sha256dt_4x64( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = v128_set32( n+ 3, n+ 2, n+1, n );
vdata[i] = v128_32( pdata[i] );
vdata[16+3] = v128_set32( n+3, n+2, n+1, n );
vdata[16+4] = last_byte;
v128_memset_zero( vdata+16 + 5, 10 );
v128_memset_zero( vdata+16 + 5, 9 );
vdata[16+15] = v128_32( 0x480 );
block[ 8] = last_byte;
v128_memset_zero( block + 9, 6 );
v128_memset_zero( block + 9, 5 );
block[15] = v128_32( 0x300 );
// initialize state
iv[0] = v128_32( sha256dt_iv[0] );
iv[1] = v128_32( sha256dt_iv[1] );
iv[2] = v128_32( sha256dt_iv[2] );
@@ -402,62 +400,15 @@ int scanhash_sha256dt_4x64( struct work *work, const uint32_t max_nonce,
iv[6] = v128_32( sha256dt_iv[6] );
iv[7] = v128_32( sha256dt_iv[7] );
// hash first 64 bytes of data
sha256_4x32_transform_le( mhash, vdata, iv );
/*
uint32_t m1 [8] __attribute__ ((aligned (32)));
uint32_t h1 [8] __attribute__ ((aligned (32)));
uint32_t b1 [16] __attribute__ ((aligned (32)));
uint32_t e16 [16] __attribute__ ((aligned (32)));
uint32_t *m4 = (uint32_t*)&midstate;
uint32_t *h4 = (uint32_t*)hash32;
sha256_transform_le( m1, pdata, sha256dt_iv );
memcpy( e16, pdata + 16, 12 );
e16[3] = n;
e16[4] = 0x80000000;
memset( &e16[5], 0, 40 );
e16[15] = 0x480; // funky bit count
b1[8] = 0x80000000;
memset( &b1[9], 0, 24 );
b1[9] = b1[10] = b1[11] = b1[12] = b1[13] = b1[14] = 0;
b1[15] = 0x300; // bit count
*/
sha256_4x32_transform_le( mhash1, vdata, iv );
sha256_4x32_prehash_3rounds( mhash2, mexp_pre, vdata + 16, mhash1 );
do
{
sha256_4x32_transform_le( block, vdata+16, mhash );
//sha256_transform_le( b1, e16, m1 );
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
// sha256_4x32_transform_le( block, vdata+16, mhash1 );
sha256_4x32_transform_le( hash32, block, iv );
/*
sha256_transform_le( h1, b1, sha256dt_iv );
printf("final hash1: %08x %08x %08x %08x %08x %08x %08x %08x\n",
h1[0],h1[1],h1[2],h1[3],h1[4],h1[5],h1[6],h1[7]);
printf("final hash4: %08x %08x %08x %08x %08x %08x %08x %08x\n",
h4[0],h4[4],h4[8],h4[12],h4[16],h4[20],h4[24],h4[28]);
casti_v128( h1,0 ) = v128_bswap32( casti_v128( h1,0 ) );
casti_v128( h1,1 ) = v128_bswap32( casti_v128( h1,1 ) );
*/
// v128_block_bswap32( hash32, hash32 );
/*
printf("bswap hash1: %08x %08x %08x %08x %08x %08x %08x %08x\n",
h1[0],h1[1],h1[2],h1[3],h1[4],h1[5],h1[6],h1[7]);
printf("bswap hash4: %08x %08x %08x %08x %08x %08x %08x %08x\n",
h4[0],h4[4],h4[8],h4[12],h4[16],h4[20],h4[24],h4[28]);
exit(0);
*/
for ( int lane = 0; lane < 4; lane++ )
{
if ( unlikely( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 ) )
@@ -472,7 +423,7 @@ exit(0);
}
}
}
*noncev = v128_add32( *noncev, four );
vdata[16+3] = v128_add32( vdata[16+3], four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
@@ -485,10 +436,10 @@ exit(0);
int scanhash_sha256dt_ref( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint32_t block1a[16] __attribute__ ((aligned (32)));
uint32_t block2a[16] __attribute__ ((aligned (32)));
uint32_t hasha[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t block1[16] __attribute__ ((aligned (32)));
uint32_t block2[16] __attribute__ ((aligned (32)));
uint32_t hash32[8] __attribute__ ((aligned (32)));
uint32_t mstate[8] __attribute__ ((aligned (32)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
@@ -497,37 +448,40 @@ int scanhash_sha256dt_ref( struct work *work, uint32_t max_nonce,
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
memset( block1, 0, 64 );
memset( block2, 0, 64 );
// hash first 64 byte block of data
sha256_transform_le( mstate, pdata, sha256dt_iv );
// fill & pad second bock without nonce
memcpy( block1a, pdata + 16, 12 );
block1a[ 3] = 0;
block1a[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
block1a[15] = 0x480; // funky bit count
memcpy( block1, pdata + 16, 12 );
block1[ 3] = n;
block1[ 4] = 0x80000000;
memset( block1 + 5, 0, 40 );
block1[15] = 0x480; // funky bit count
// Pad third block
block2a[ 8] = 0x80000000;
memset( block2a + 9, 0, 24 );
block2a[15] = 0x300; // bit count
block2[ 8] = 0x80000000;
memset( block2 + 9, 0, 24 );
block2[15] = 0x300; // bit count
do
{
// Insert nonce for second block
block1a[3] = n;
sha256_transform_le( block2a, block1a, mstate );
block1[3] = n;
sha256_transform_le( block2, block1, mstate );
sha256_transform_le( hasha, block2a, sha256dt_iv );
sha256_transform_le( hash32, block2, sha256dt_iv );
if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
if ( unlikely( bswap_32( hash32[7] ) <= ptarget[7] ) )
{
casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
casti_v128( hash32, 0 ) = v128_bswap32( casti_v128( hash32, 0 ) );
casti_v128( hash32, 1 ) = v128_bswap32( casti_v128( hash32, 1 ) );
if ( likely( valid_hash( hash32, ptarget ) && !bench ) )
{
pdata[19] = n;
submit_solution( work, hasha, mythr );
submit_solution( work, hash32, mythr );
}
}
n += 1;
@@ -543,18 +497,18 @@ int scanhash_sha256dt_ref( struct work *work, uint32_t max_nonce,
bool register_sha256dt_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT;
#if defined(SHA256DT_16X64)
gate->scanhash = (void*)&scanhash_sha256dt_16x64;
#if defined(SHA256DT_16X32)
gate->scanhash = (void*)&scanhash_sha256dt_16x32;
#elif defined(SHA256DT_X86_SHA256)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha;
#elif defined(SHA256DT_NEON_SHA256)
gate->optimizations = SHA_OPT;
gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha;
#elif defined(SHA256DT_8X64)
gate->scanhash = (void*)&scanhash_sha256dt_8x64;
#elif defined(SHA256DT_4X64)
gate->scanhash = (void*)&scanhash_sha256dt_4x64;
#elif defined(SHA256DT_8X32)
gate->scanhash = (void*)&scanhash_sha256dt_8x32;
#elif defined(SHA256DT_4X32)
gate->scanhash = (void*)&scanhash_sha256dt_4x32;
#else
gate->scanhash = (void*)&scanhash_sha256dt_ref;
#endif


@@ -372,14 +372,14 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t istate[8] __attribute__ ((aligned (32)));
v128_t mstate[8] __attribute__ ((aligned (32)));
// v128_t mstate2[8] __attribute__ ((aligned (32)));
// v128_t mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
v128_t vdata[32] __attribute__ ((aligned (64)));
v128_t block[16] __attribute__ ((aligned (32)));
v128_t hash32[8] __attribute__ ((aligned (32)));
v128_t iv[8] __attribute__ ((aligned (32)));
v128_t mhash1[8] __attribute__ ((aligned (32)));
v128_t mhash2[8] __attribute__ ((aligned (32)));
v128_t mexp_pre[8] __attribute__ ((aligned (32)));
uint32_t lhash[8] __attribute__ ((aligned (32)));
uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
uint32_t *pdata = work->data;
const uint32_t *ptarget = work->target;
@@ -387,62 +387,57 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128_t *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128_t last_byte = v128_32( 0x80000000 );
const v128_t four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = v128_32( pdata[i] );
*noncev = v128_set32( n+ 3, n+ 2, n+1, n );
vdata[i] = v128_32( pdata[i] );
vdata[16+3] = v128_set32( n+3, n+2, n+1, n );
vdata[16+4] = last_byte;
v128_memset_zero( vdata+16 + 5, 10 );
vdata[16+15] = v128_32( 80*8 ); // bit count
vdata[16+15] = v128_32( 80*8 );
block[ 8] = last_byte;
v128_memset_zero( block + 9, 6 );
block[15] = v128_32( 32*8 ); // bit count
block[15] = v128_32( 32*8 );
// initialize state
istate[0] = v128_32( sha256_iv[0] );
istate[1] = v128_32( sha256_iv[1] );
istate[2] = v128_32( sha256_iv[2] );
istate[3] = v128_32( sha256_iv[3] );
istate[4] = v128_32( sha256_iv[4] );
istate[5] = v128_32( sha256_iv[5] );
istate[6] = v128_32( sha256_iv[6] );
istate[7] = v128_32( sha256_iv[7] );
iv[0] = v128_32( sha256_iv[0] );
iv[1] = v128_32( sha256_iv[1] );
iv[2] = v128_32( sha256_iv[2] );
iv[3] = v128_32( sha256_iv[3] );
iv[4] = v128_32( sha256_iv[4] );
iv[5] = v128_32( sha256_iv[5] );
iv[6] = v128_32( sha256_iv[6] );
iv[7] = v128_32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( mstate, vdata, istate );
// sha256_4way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
sha256_4x32_transform_le( mhash1, vdata, iv );
sha256_4x32_prehash_3rounds( mhash2, mexp_pre, vdata + 16, mhash1 );
do
{
// sha256_4way_final_rounds( block, vdata+16, mstate1, mstate2,
// mexp_pre );
sha256_4way_transform_le( block, vdata+16, mstate );
sha256_4way_transform_le( block, block, istate );
sha256_4way_transform_le( hash32, block, istate );
sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
sha256_4way_transform_le( block, block, iv );
sha256_4way_transform_le( hash32, block, iv );
v128_block_bswap32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
for ( int lane = 0; lane < 4; lane++ )
{
if ( unlikely( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
extr_lane_4x32( lhash, hash32, lane, 256 );
casti_v128( lhash, 0 ) = v128_bswap32( casti_v128( lhash, 0 ) );
casti_v128( lhash, 1 ) = v128_bswap32( casti_v128( lhash, 1 ) );
if ( likely( valid_hash( lhash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
submit_solution( work, lhash, mythr );
}
}
*noncev = v128_add32( *noncev, four );
n += 4;
}
vdata[16+3] = v128_add32( vdata[16+3], four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;


@@ -692,7 +692,7 @@ do { \
_mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z )
#define MAJ(X, Y, Z) \
_mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
_mm256_xor_si256( Y, _mm256_and_si256( (X_xor_Y = _mm256_xor_si256( X, Y )), \
Y_xor_Z ) )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
@@ -873,26 +873,26 @@ void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data,
// SHA512 2 way 64 SSE2 or NEON
#define BSG5_0_2x64( x ) v128_xor3( v128_ror64( x, 28 ), \
v128_ror64( x, 34 ), \
v128_ror64( x, 39 ) )
v128_ror64( x, 34 ), \
v128_ror64( x, 39 ) )
#define BSG5_1_2x64( x ) v128_xor3( v128_ror64( x, 14 ), \
v128_ror64( x, 18 ), \
v128_ror64( x, 41 ) )
v128_ror64( x, 18 ), \
v128_ror64( x, 41 ) )
#define SSG5_0_2x64( x ) v128_xor3( v128_ror64( x, 1 ), \
v128_ror64( x, 8 ), \
v128_sr64( x, 7 ) )
v128_ror64( x, 8 ), \
v128_sr64( x, 7 ) )
#define SSG5_1_2x64( x ) v128_xor3( v128_ror64( x, 19 ), \
v128_ror64( x, 61 ), \
v128_sr64( x, 6 ) )
v128_ror64( x, 61 ), \
v128_sr64( x, 6 ) )
#define CH_2x64(X, Y, Z) \
v128_xor( v128_and( v128_xor( Y, Z ), X ), Z )
#define MAJ_2x64(X, Y, Z) \
v128_xor( Y, v128_and( X_xor_Y = v128_xor( X, Y ), Y_xor_Z ) )
v128_xor( Y, v128_and( (X_xor_Y = v128_xor( X, Y ) ), Y_xor_Z ) )
#define SHA3_2x64_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
@@ -917,34 +917,20 @@ sha512_2x64_round( sha512_2x64_context *ctx, v128u64_t *in, v128u64_t r[8] )
v128u64_t W[80];
v128_block_bswap64( W , in );
v128_block_bswap64( (&W[8]), (&in[8]) );
v128_block_bswap64( W+8, in+8 );
for ( i = 16; i < 80; i++ )
W[i] = v128_add4_64( SSG5_0_2x64( W[i-15] ), SSG5_1_2x64( W[i-2] ),
W[ i- 7 ], W[ i-16 ] );
if ( ctx->initialized )
{
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
}
else
{
A = v128_64( 0x6A09E667F3BCC908 );
B = v128_64( 0xBB67AE8584CAA73B );
C = v128_64( 0x3C6EF372FE94F82B );
D = v128_64( 0xA54FF53A5F1D36F1 );
E = v128_64( 0x510E527FADE682D1 );
F = v128_64( 0x9B05688C2B3E6C1F );
G = v128_64( 0x1F83D9ABFB41BD6B );
H = v128_64( 0x5BE0CD19137E2179 );
}
A = r[0];
B = r[1];
C = r[2];
D = r[3];
E = r[4];
F = r[5];
G = r[6];
H = r[7];
Y_xor_Z = v128_xor( B, C );
@@ -960,35 +946,28 @@ sha512_2x64_round( sha512_2x64_context *ctx, v128u64_t *in, v128u64_t r[8] )
SHA3_2x64_STEP( B, C, D, E, F, G, H, A, i + 7 );
}
if ( ctx->initialized )
{
r[0] = v128_add64( r[0], A );
r[1] = v128_add64( r[1], B );
r[2] = v128_add64( r[2], C );
r[3] = v128_add64( r[3], D );
r[4] = v128_add64( r[4], E );
r[5] = v128_add64( r[5], F );
r[6] = v128_add64( r[6], G );
r[7] = v128_add64( r[7], H );
}
else
{
ctx->initialized = true;
r[0] = v128_add64( A, v128_64( 0x6A09E667F3BCC908 ) );
r[1] = v128_add64( B, v128_64( 0xBB67AE8584CAA73B ) );
r[2] = v128_add64( C, v128_64( 0x3C6EF372FE94F82B ) );
r[3] = v128_add64( D, v128_64( 0xA54FF53A5F1D36F1 ) );
r[4] = v128_add64( E, v128_64( 0x510E527FADE682D1 ) );
r[5] = v128_add64( F, v128_64( 0x9B05688C2B3E6C1F ) );
r[6] = v128_add64( G, v128_64( 0x1F83D9ABFB41BD6B ) );
r[7] = v128_add64( H, v128_64( 0x5BE0CD19137E2179 ) );
}
r[0] = v128_add64( r[0], A );
r[1] = v128_add64( r[1], B );
r[2] = v128_add64( r[2], C );
r[3] = v128_add64( r[3], D );
r[4] = v128_add64( r[4], E );
r[5] = v128_add64( r[5], F );
r[6] = v128_add64( r[6], G );
r[7] = v128_add64( r[7], H );
}
void sha512_2x64_init( sha512_2x64_context *sc )
{
sc->initialized = false;
sc->val[0] = v128_64( 0x6A09E667F3BCC908 );
sc->val[1] = v128_64( 0xBB67AE8584CAA73B );
sc->val[2] = v128_64( 0x3C6EF372FE94F82B );
sc->val[3] = v128_64( 0xA54FF53A5F1D36F1 );
sc->val[4] = v128_64( 0x510E527FADE682D1 );
sc->val[5] = v128_64( 0x9B05688C2B3E6C1F );
sc->val[6] = v128_64( 0x1F83D9ABFB41BD6B );
sc->val[7] = v128_64( 0x5BE0CD19137E2179 );
sc->count = 0;
sc->initialized = true;
}
void sha512_2x64_update( sha512_2x64_context *sc, const void *data, size_t len )
@@ -1036,7 +1015,7 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst )
v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
sha512_2x64_round( sc, sc->buf, sc->val );
v128_block_bswap64( castp_v128u64( dst ), sc->val );


@@ -5,9 +5,11 @@
#include <stdint.h>
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#define SHA512256D_8WAY 1
#define SHA512256D_8WAY 1
#elif defined(__AVX2__)
#define SHA512256D_4WAY 1
#define SHA512256D_4WAY 1
#elif defined(__SSE2__) || defined(__ARM_NEON)
#define SHA512256D_2WAY 1
#endif
#if defined(SHA512256D_8WAY)
@@ -145,6 +147,74 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
return 0;
}
#elif defined(SHA512256D_2WAY)
static void sha512256d_2x64_init( sha512_2x64_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = v128_64( 0x22312194FC2BF72C );
ctx->val[1] = v128_64( 0x9F555FA3C84C64C2 );
ctx->val[2] = v128_64( 0x2393B86B6F53B151 );
ctx->val[3] = v128_64( 0x963877195940EABD );
ctx->val[4] = v128_64( 0x96283EE2A88EFFE3 );
ctx->val[5] = v128_64( 0xBE5E1E2553863992 );
ctx->val[6] = v128_64( 0x2B0199FC2C85B8AA );
ctx->val[7] = v128_64( 0x0EB72DDC81C52CA2 );
}
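The eight constants above are the SHA-512/256 initial hash values from FIPS 180-4, so this init makes the 2-lane path compute SHA-512/256 directly; the scan loop below then applies that function twice. In scalar outline (sha512_256 is a hypothetical helper, not this file's API):

uint8_t h1[32], h2[32];
sha512_256( h1, header, 80 );   /* first pass over the 80-byte block header */
sha512_256( h2, h1, 32 );       /* second pass over the 256-bit digest      */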
int scanhash_sha512256d_2x64( struct work *work, uint32_t max_nonce,
uint64_t *hashes_done, struct thr_info *mythr )
{
uint64_t hash[8*2] __attribute__ ((aligned (64)));
uint32_t vdata[20*2] __attribute__ ((aligned (64)));
sha512_2x64_context ctx;
uint32_t lane_hash[8] __attribute__ ((aligned (32)));
uint64_t *hash_q3 = &(hash[3*2]);
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint64_t targ_q3 = ((uint64_t*)ptarget)[3];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 4;
uint32_t n = first_nonce;
v128u64_t *noncev = (v128u64_t*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const v128u64_t two = v128_64( 0x0000000200000000 );
v128_bswap32_intrlv80_2x64( vdata, pdata );
*noncev = v128_add32( v128_set32( 1, 0, 0, 0 ), *noncev );
// *noncev = v128_intrlv_blend_32( v128_set32( n+1, 0, n, 0 ), *noncev );
do
{
sha512256d_2x64_init( &ctx );
sha512_2x64_update( &ctx, vdata, 80 );
sha512_2x64_close( &ctx, hash );
sha512256d_2x64_init( &ctx );
sha512_2x64_update( &ctx, hash, 32 );
sha512_2x64_close( &ctx, hash );
for ( int lane = 0; lane < 2; lane++ )
if ( hash_q3[ lane ] <= targ_q3 )
{
extr_lane_2x64( lane_hash, hash, lane, 256 );
if ( valid_hash( lane_hash, ptarget ) && !bench )
{
pdata[19] = bswap_32( n + lane );
submit_solution( work, lane_hash, mythr );
}
}
*noncev = v128_add32( *noncev, two );
n += 2;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );
pdata[19] = n;
*hashes_done = n - first_nonce;
return 0;
}
#else
#include "sph_sha2.h"
@@ -214,6 +284,8 @@ bool register_sha512256d_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_sha512256d_8way;
#elif defined(SHA512256D_4WAY)
gate->scanhash = (void*)&scanhash_sha512256d_4way;
#elif defined(SHA512256D_2WAY)
gate->scanhash = (void*)&scanhash_sha512256d_2x64;
#else
gate->scanhash = (void*)&scanhash_sha512256d;
#endif

File diff suppressed because it is too large.


@@ -2,23 +2,68 @@
#define SIMD_HASH_2WAY_H__ 1
#include "simd-compat.h"
#include "simd-utils.h"
#if defined(__SSE2__) || defined (__ARM_NEON)
typedef struct
{
uint32_t A[32];
uint8_t buffer[128];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_context __attribute__((aligned(64)));
// datalen is bytes
int simd512_ctx( simd512_context *ctx, void *hashval, const void *data,
int datalen );
int simd512( void *hashval, const void *data, int datalen );
#endif
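A minimal usage sketch of the new one-lane wrapper declared above; only the prototype and the "datalen is bytes" comment are assumed, the buffer names are illustrative:

uint8_t msg[64] = { 0 };
uint8_t digest[64];                        /* SIMD-512 yields a 512-bit digest */
simd512( digest, msg, (int)sizeof(msg) );  /* datalen is bytes */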
#if defined(__AVX2__)
#include "simd-utils.h"
typedef struct
{
uint32_t A[ 32*2 ];
uint8_t buffer[ 128*2 ];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd512_2way_context __attribute__((aligned(128)));
#define simd_2way_context simd512_2way_context
// databitlen is bits
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
int simd512_2way_ctx( simd512_2way_context *state, void *hashval,
const void *data, int datalen );
#define simd512_2way_full simd512_2way_ctx
int simd512_2way( void *hashval, const void *data, int datalen );
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
typedef struct {
typedef struct
{
uint32_t A[ 32*4 ];
uint8_t buffer[ 128*4 ];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_4way_context __attribute__((aligned(128)));
} simd512_4way_context __attribute__((aligned(128)));
#define simd_4way_context simd512_4way_context
int simd_4way_init( simd_4way_context *state, int hashbitlen );
int simd_4way_update( simd_4way_context *state, const void *data,
@@ -26,29 +71,12 @@ int simd_4way_update( simd_4way_context *state, const void *data,
int simd_4way_close( simd_4way_context *state, void *hashval );
int simd_4way_update_close( simd_4way_context *state, void *hashval,
const void *data, int databitlen );
int simd512_4way_full( simd_4way_context *state, void *hashval,
int simd512_4way_ctx( simd_4way_context *state, void *hashval,
const void *data, int datalen );
#define simd512_4way_full simd512_4way_ctx
int simd512_4way( void *hashval, const void *data, int datalen );
#endif
typedef struct {
uint32_t A[ 32*2 ];
uint8_t buffer[ 128*2 ];
uint64_t count;
unsigned int hashbitlen;
unsigned int blocksize;
unsigned int n_feistels;
} simd_2way_context __attribute__((aligned(128)));
int simd_2way_init( simd_2way_context *state, int hashbitlen );
int simd_2way_update( simd_2way_context *state, const void *data,
int databitlen );
int simd_2way_close( simd_2way_context *state, void *hashval );
int simd_2way_update_close( simd_2way_context *state, void *hashval,
const void *data, int databitlen );
int simd512_2way_full( simd_2way_context *state, void *hashval,
const void *data, int datalen );
#endif
#endif


@@ -14,20 +14,19 @@
#include "algo/cubehash/cubehash_sse2.h"
#if defined(__aarch64__)
#include "algo/simd/sph_simd.h"
#include "algo/luffa/sph_luffa.h"
#endif
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/sha/sph_sha2.h"
#include "algo/yespower/yespower.h"
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
//#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#include "algo/echo/aes_ni/hash_api.h"
#include "algo/groestl/aes_ni/hash-groestl.h"
#else
//#else
#include "algo/echo/sph_echo.h"
#include "algo/groestl/sph_groestl.h"
#endif
//#endif
#if defined(__AES__)
#include "algo/fugue/fugue-aesni.h"
#else
@@ -48,7 +47,7 @@ typedef struct TortureGarden TortureGarden;
// Graph of hash algos plus SPH contexts
struct TortureGarden
{
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_echo echo;
hashState_groestl groestl;
#else
@@ -67,11 +66,7 @@ struct TortureGarden
sph_keccak512_context keccak;
cubehashParam cube;
shavite512_context shavite;
#if defined(__aarch64__)
sph_luffa512_context luffa;
#else
hashState_luffa luffa;
#endif
#if defined(__aarch64__)
sph_simd512_context simd;
#else
@@ -112,7 +107,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
cubehashUpdateDigest( &garden->cube, hash, input, 64 );
break;
case 3:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
echo_full( &garden->echo, hash, 512, input, 64 );
#else
sph_echo512_init( &garden->echo );
@@ -128,7 +123,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
#endif
break;
case 5:
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &garden->groestl, hash, input, 512 );
#else
sph_groestl512_init( &garden->groestl) ;
@@ -157,13 +152,7 @@ static int get_hash( void *output, const void *input, TortureGarden *garden,
sph_keccak512_close( &garden->keccak, hash );
break;
case 10:
#if defined(__aarch64__)
sph_luffa512_init( &garden->luffa );
sph_luffa512( &garden->luffa, input, 64 );
sph_luffa512_close( &garden->luffa, hash );
#else
luffa_full( &garden->luffa, hash, 512, input, 64 );
#endif
break;
case 11:
sph_shabal512_init( &garden->shabal );


@@ -929,43 +929,31 @@ int scanhash_x17_4x64( struct work *work, uint32_t max_nonce,
#elif defined(X17_2X64)
// Need sph in some cases
//#include "algo/blake/sph_blake.h"
#include "algo/bmw/sph_bmw.h"
#include "algo/jh/sph_jh.h"
//#include "algo/keccak/sph_keccak.h"
#include "algo/skein/sph_skein.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/luffa/luffa_for_sse2.h"
//#include "algo/cubehash/sph_cubehash.h"
#include "algo/cubehash/cubehash_sse2.h"
#include "algo/shavite/sph_shavite.h"
#include "algo/simd/sph_simd.h"
#include "algo/simd/nist.h"
#include "algo/hamsi/sph_hamsi.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/whirlpool/sph_whirlpool.h"
#include "algo/haval/sph-haval.h"
#include "algo/sha/sph_sha2.h"
#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
//#if !( defined(__AES__) || defined(__ARM_FEATURE_AES) )
#include "algo/groestl/sph_groestl.h"
#include "algo/echo/sph_echo.h"
#endif
//#endif
#include "algo/fugue/sph_fugue.h"
union _x17_context_overlay
{
// blake512_2x64_context blake;
blake512_context blake;
#if defined(__x86_64__)
blake512_2x64_context blake;
bmw512_2x64_context bmw;
#else
sph_bmw512_context bmw;
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_groestl groestl;
hashState_echo echo;
#else
sph_groestl512_context groestl;
#endif
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
hashState_echo echo;
#else
sph_echo512_context echo;
#endif
#if defined(__AES__)
@@ -973,26 +961,14 @@ union _x17_context_overlay
#else
sph_fugue512_context fugue;
#endif
#if defined(__x86_64__)
jh512_2x64_context jh;
#else
sph_jh512_context jh;
#endif
keccak512_2x64_context keccak;
#if defined(__x86_64__)
skein512_2x64_context skein;
#else
sph_skein512_context skein;
#endif
#if defined(__x86_64__)
hashState_luffa luffa;
#else
sph_luffa512_context luffa;
#endif
cubehashParam cube;
sph_shavite512_context shavite;
#if defined(__x86_64__)
hashState_sd simd;
simd512_context simd;
#else
sph_simd512_context simd;
#endif
@@ -1003,11 +979,7 @@ union _x17_context_overlay
#endif
sph_shabal512_context shabal;
sph_whirlpool_context whirlpool;
#if defined(__x86_64__)
sha512_2x64_context sha;
#else
sph_sha512_context sha;
#endif
sph_haval256_5_context haval;
};
typedef union _x17_context_overlay x17_context_overlay;
@@ -1019,30 +991,16 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
uint8_t hash1[64] __attribute__((aligned(64)));
x17_context_overlay ctx;
// intrlv_2x64( vhash, input, input+80, 640 );
// blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
// dintrlv_2x64( hash0, hash1, vhash, 512 );
blake512_full( &ctx.blake, hash0, input, 80 );
blake512_full( &ctx.blake, hash1, input+80, 80 );
intrlv_2x64( vhash, input, input+80, 640 );
#if defined(__x86_64__)
intrlv_2x64( vhash, hash0, hash1, 512 );
blake512_2x64_full( &ctx.blake, vhash, vhash, 80 );
bmw512_2x64_init( &ctx.bmw );
bmw512_2x64_update( &ctx.bmw, vhash, 64 );
bmw512_2x64_close( &ctx.bmw, vhash );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hash0, 64 );
sph_bmw512_close( &ctx.bmw, hash0 );
sph_bmw512_init( &ctx.bmw );
sph_bmw512( &ctx.bmw, hash1, 64 );
sph_bmw512_close( &ctx.bmw, hash1 );
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
groestl512_full( &ctx.groestl, hash0, hash0, 512 );
groestl512_full( &ctx.groestl, hash1, hash1, 512 );
#else
@@ -1054,47 +1012,16 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_groestl512_close( &ctx.groestl, hash1 );
#endif
#if defined(__x86_64__)
intrlv_2x64( vhash, hash0, hash1, 512 );
skein512_2x64_full( &ctx.skein, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hash0, 64 );
sph_skein512_close( &ctx.skein, hash0);
sph_skein512_init( &ctx.skein );
sph_skein512( &ctx.skein, hash1, 64 );
sph_skein512_close( &ctx.skein, hash1 );
#endif
#if defined(__x86_64__)
intrlv_2x64( vhash, hash0, hash1, 512);
jh512_2x64_ctx( &ctx.jh, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
sph_jh512_init( &ctx.jh );
sph_jh512( &ctx.jh, hash0, 64 );
sph_jh512_close( &ctx.jh, hash0 );
sph_jh512_init( &ctx.jh);
sph_jh512( &ctx.jh, hash1, 64 );
sph_jh512_close( &ctx.jh, hash1 );
#endif
intrlv_2x64( vhash, hash0, hash1, 512);
keccak512_2x64_ctx( &ctx.keccak, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#if defined(__x86_64__)
luffa_full( &ctx.luffa, hash0, 512, hash0, 64 );
luffa_full( &ctx.luffa, hash1, 512, hash1, 64 );
#else
sph_luffa512_init( &ctx.luffa );
sph_luffa512( &ctx.luffa, hash0, 64 );
sph_luffa512_close( &ctx.luffa, hash0 );
sph_luffa512_init( &ctx.luffa );
sph_luffa512( &ctx.luffa, hash1, 64 );
sph_luffa512_close( &ctx.luffa, hash1 );
#endif
cubehash_full( &ctx.cube, hash0, 512, hash0, 64 );
cubehash_full( &ctx.cube, hash1, 512, hash1, 64 );
@@ -1107,8 +1034,8 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_shavite512_close( &ctx.shavite, hash1 );
#if defined(__x86_64__)
simd_full( &ctx.simd, hash0, hash0, 512 );
simd_full( &ctx.simd, hash1, hash1, 512 );
simd512_ctx( &ctx.simd, hash0, hash0, 64 );
simd512_ctx( &ctx.simd, hash1, hash1, 64 );
#else
sph_simd512_init( &ctx.simd );
sph_simd512( &ctx.simd, hash0, 64 );
@@ -1118,7 +1045,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_simd512_close( &ctx.simd, hash1 );
#endif
#if defined(__AES__) || defined(__ARM_FEATURE_AES)
#if defined(__AES__) // || defined(__ARM_FEATURE_AES)
echo_full( &ctx.echo, hash0, 512, hash0, 64 );
echo_full( &ctx.echo, hash1, 512, hash1, 64 );
#else
@@ -1130,7 +1057,7 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_echo512_close( &ctx.echo, hash1 );
#endif
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
#if defined(__SSE4_2__) // || defined(__ARM_NEON)
intrlv_2x64( vhash, hash0, hash1, 512 );
hamsi512_2x64_ctx( &ctx.hamsi, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
@@ -1165,18 +1092,9 @@ int x17_2x64_hash( void *output, const void *input, int thr_id )
sph_whirlpool( &ctx.whirlpool, hash1, 64 );
sph_whirlpool_close( &ctx.whirlpool, hash1 );
#if defined(__x86_64__)
intrlv_2x64( vhash, hash0, hash1, 512 );
sha512_2x64_ctx( &ctx.sha, vhash, vhash, 64 );
dintrlv_2x64( hash0, hash1, vhash, 512 );
#else
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hash0, 64 );
sph_sha512_close( &ctx.sha, hash0 );
sph_sha512_init( &ctx.sha );
sph_sha512( &ctx.sha, hash1, 64 );
sph_sha512_close( &ctx.sha, hash1 );
#endif
sph_haval256_5_init( &ctx.haval );
sph_haval256_5( &ctx.haval, hash0, 64 );


@@ -210,7 +210,7 @@ int scanhash_x22i( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x22i_hash( hash64, edata, thr_id ) );
if ( x22i_hash( hash64, edata, thr_id ) )
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );


@@ -245,7 +245,7 @@ int scanhash_x25x( struct work *work, uint32_t max_nonce,
do
{
edata[19] = n;
if ( x25x_hash( hash64, edata, thr_id ) );
if ( x25x_hash( hash64, edata, thr_id ) )
if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
{
pdata[19] = bswap_32( n );


@@ -53,7 +53,6 @@
#include <stdlib.h>
#include <string.h>
#include "algo/sha/hmac-sha256-hash.h"
#include "algo/sha/hmac-sha256-hash-4way.h"
#include "yespower.h"
#include "yespower-platform.c"

configure (vendored)

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.5.
# Generated by GNU Autoconf 2.71 for cpuminer-opt 23.7.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.5'
PACKAGE_STRING='cpuminer-opt 23.5'
PACKAGE_VERSION='23.7'
PACKAGE_STRING='cpuminer-opt 23.7'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1432,7 +1432,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.7:";;
esac
cat <<\_ACEOF
@@ -1538,7 +1538,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.5
cpuminer-opt configure 23.7
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.5, which was
It was created by cpuminer-opt $as_me 23.7, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.5'
VERSION='23.7'
printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.5, which was
This file was extended by cpuminer-opt $as_me 23.7, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
cpuminer-opt config.status 23.5
cpuminer-opt config.status 23.7
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [23.5])
AC_INIT([cpuminer-opt], [23.7])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 23.5.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 23.7.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='23.5'
PACKAGE_STRING='cpuminer-opt 23.5'
PACKAGE_VERSION='23.7'
PACKAGE_STRING='cpuminer-opt 23.7'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 23.5 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 23.7 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1404,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 23.5:";;
short | recursive ) echo "Configuration of cpuminer-opt 23.7:";;
esac
cat <<\_ACEOF
@@ -1509,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 23.5
cpuminer-opt configure 23.7
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 23.5, which was
It was created by cpuminer-opt $as_me 23.7, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2993,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='23.5'
VERSION='23.7'
cat >>confdefs.h <<_ACEOF
@@ -6718,7 +6718,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 23.5, which was
This file was extended by cpuminer-opt $as_me 23.7, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6784,7 +6784,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 23.5
cpuminer-opt config.status 23.7
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"

View File

@@ -2321,6 +2321,12 @@ static void *miner_thread( void *userdata )
gettimeofday( (struct timeval *) &tv_start, NULL );
// Scan for nonce
// nonce_found = scanhash_sha256dt_ref( &work, max_nonce, &hashes_done,
// mythr );
// nonce_found = scanhash_sha256dt_4x32( &work, max_nonce, &hashes_done,
// mythr );
nonce_found = algo_gate.scanhash( &work, max_nonce, &hashes_done,
mythr );
@@ -3677,58 +3683,17 @@ static int thread_create(struct thr_info *thr, void* func)
void get_defconfig_path(char *out, size_t bufsize, char *argv0);
#include "simd-utils.h"
#include "algo/hamsi/hamsi-hash-4way.h"
#include "algo/hamsi/sph_hamsi.h"
int main(int argc, char *argv[])
{
struct thr_info *thr;
long flags;
int i, err;
/*
#include "simd-utils.h"
printf("bswap32: %08x, bswap64: %016lx\n", bswap_32( 0x03020100 ), bswap_64( 0x0706050403020100 ) );
printf("ror32: %08x, ror64: %016lx\n", ror32( 0x03020100, 8 ), ror64( 0x0706050403020100, 8 ) );
exit(0);
uint64x2_t a64 = v128_set64( 0x5555555555555555, 0xcccccccccccccccc ) ;
uint64x2_t c64 = v128_set64( 0xffffffffffffffff, 0x0000000000000000 ) ;
uint64x2_t mask = v128_set64( 0x0f0f0f0ff0f0f0f0, 0xf0f0f0f00f0f0f0f ) ;
uint32x4_t a32 = v128_set32( 0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100 );
uint16x8_t a16 = v128_set16( 0x0f0e, 0x00d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 );
uint8x16_t a8 = v128_set8( 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00 );
a64 = v128_bswap64( a32 );
a32 = v128_bswap32( a32 );
a16 = v128_bswap16( a16 );
uint64_t *b64 = (uint64_t*)&a64;
uint32_t *b32 = (uint32_t*)&a32;
uint16_t *b16 = (uint16_t*)&a16;
//a32 = v128_ror32( a32, 4 );
printf("64: %016lx, %016lx\n", b64[1], b64[0] );
printf("32: %08x %08x %08x %08x\n", b32[3], b32[2], b32[1], b32[0] );
printf("16: %04x %04x %04x %04x %04x %04x %04x %04x\n", b16[7], b16[6], b16[5], b16[4], b16[3], b16[2], b16[1], b16[0] );
//a32 = v128_ror32( a32, 28 );
//printf("32: %08x %08x %08x %08x\n", b32[3], b32[2], b32[1], b32[0] );
//a32 = v128_rol32( a32, 4 );
//printf("32: %08x %08x %08x %08x\n", b32[3], b32[2], b32[1], b32[0] );
//a32 = v128_rol32( a32, 28 );
//printf("32: %08x %08x %08x %08x\n", b32[3], b32[2], b32[1], b32[0] );
exit(0);
*/
pthread_mutex_init(&applog_lock, NULL);
show_credits();
@@ -3864,6 +3829,9 @@ exit(0);
return 1;
}
if ( is_root() )
applog( LOG_NOTICE, "Running cpuminer as Superuser is discouraged.");
#ifndef WIN32
if (opt_background)
{
@@ -4087,7 +4055,7 @@ exit(0);
applog( LOG_INFO, "%d of %d miner threads started using '%s' algorithm",
opt_n_threads, num_cpus, algo_names[opt_algo] );
/* main loop - simply wait for workio thread to exit */
/* main loop - simply wait for workio thread to exit */
pthread_join( thr_info[work_thr_id].pth, NULL );
applog( LOG_WARNING, "workio thread dead, exiting." );
return 0;

13
miner.h
View File

@@ -20,7 +20,7 @@
#define USER_AGENT_OS
#endif
#define USER_AGENT PACKAGE_NAME "-" PACKAGE_VERSION "-" USER_AGENT_ARCH "-" USER_AGENT_OS
#define USER_AGENT PACKAGE_NAME "-" PACKAGE_VERSION "-" USER_AGENT_ARCH USER_AGENT_OS
//#define MAX_CPUS 128
@@ -46,7 +46,7 @@
#include <stdbool.h>
#include <inttypes.h>
#include <sys/time.h>
#include <unistd.h>
#include <pthread.h>
#include <jansson.h>
#include <curl/curl.h>
@@ -76,6 +76,15 @@
#endif
static inline bool is_root()
{
#if defined(WIN32)
return false;
#else
return !getuid();
#endif
}
/*
#ifndef min
#define min(a,b) (a>b ? (b) :(a))

View File

@@ -1509,24 +1509,35 @@ static inline void v128_bswap32_intrlv80_2x64( void *d, const void *src )
#elif defined(__ARM_NEON)
casti_v128u64( d,0 ) = vdupq_laneq_u64( s0, 0 );
casti_v128u64( d,1 ) = vdupq_laneq_u64( s0, 1 );
casti_v128u64( d,0 ) = vdupq_laneq_u64( (uint64x2_t)s0, 0 );
casti_v128u64( d,1 ) = vdupq_laneq_u64( (uint64x2_t)s0, 1 );
casti_v128u64( d,2 ) = vdupq_laneq_u64( s1, 0 );
casti_v128u64( d,3 ) = vdupq_laneq_u64( s1, 1 );
casti_v128u64( d,2 ) = vdupq_laneq_u64( (uint64x2_t)s1, 0 );
casti_v128u64( d,3 ) = vdupq_laneq_u64( (uint64x2_t)s1, 1 );
casti_v128u64( d,4 ) = vdupq_laneq_u64( s2, 0 );
casti_v128u64( d,5 ) = vdupq_laneq_u64( s2, 1 );
casti_v128u64( d,4 ) = vdupq_laneq_u64( (uint64x2_t)s2, 0 );
casti_v128u64( d,5 ) = vdupq_laneq_u64( (uint64x2_t)s2, 1 );
casti_v128u64( d,6 ) = vdupq_laneq_u64( s3, 0 );
casti_v128u64( d,7 ) = vdupq_laneq_u64( s3, 1 );
casti_v128u64( d,6 ) = vdupq_laneq_u64( (uint64x2_t)s3, 0 );
casti_v128u64( d,7 ) = vdupq_laneq_u64( (uint64x2_t)s3, 1 );
casti_v128u64( d,8 ) = vdupq_laneq_u64( s4, 0 );
casti_v128u64( d,9 ) = vdupq_laneq_u64( s4, 1 );
casti_v128u64( d,8 ) = vdupq_laneq_u64( (uint64x2_t)s4, 0 );
casti_v128u64( d,9 ) = vdupq_laneq_u64( (uint64x2_t)s4, 1 );
#endif
}
static inline void extr_lane_2x64( void *dst, const void *src,
const int lane, const int bit_len )
{
uint64_t *d = (uint64_t*)dst;
const uint64_t *s = (const uint64_t*)src;
d[ 0] = s[ lane ]; d[ 1] = s[ lane+ 2 ];
d[ 2] = s[ lane+ 4 ]; d[ 3] = s[ lane+ 6 ];
if ( bit_len <= 256 ) return;
d[ 4] = s[ lane+ 8 ]; d[ 5] = s[ lane+10 ];
d[ 6] = s[ lane+12 ]; d[ 7] = s[ lane+14 ];
}
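
extr_lane_2x64 added above pulls one hash lane back out of 2x64-bit interleaved data, where word i of lane l sits at index 2*i + l. A standalone sketch of that layout using a local copy of the indexing (hypothetical buffers, not the miner's own code):

   #include <stdint.h>
   #include <stdio.h>

   // Local mirror of the 2x64 extraction shown above, for a 256-bit result.
   static void extr_lane_2x64_sketch( uint64_t *d, const uint64_t *s, int lane )
   {
      d[0] = s[ lane     ];  d[1] = s[ lane + 2 ];
      d[2] = s[ lane + 4 ];  d[3] = s[ lane + 6 ];
   }

   int main(void)
   {
      uint64_t h0[4] = { 0, 1, 2, 3 };       // lane 0 hash words
      uint64_t h1[4] = { 10, 11, 12, 13 };   // lane 1 hash words
      uint64_t v[8], out[4];

      // 2x64 interleave: word i of lane l lives at v[ 2*i + l ].
      for ( int i = 0; i < 4; i++ ) { v[2*i] = h0[i]; v[2*i+1] = h1[i]; }

      extr_lane_2x64_sketch( out, v, 1 );
      for ( int i = 0; i < 4; i++ )
         printf( "out[%d] = %llu\n", i, (unsigned long long)out[i] );   // 10 11 12 13
      return 0;
   }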
// 4x64 (AVX2)

View File

@@ -152,16 +152,6 @@
#define v128_unpacklo8 _mm_unpacklo_epi8
#define v128_unpackhi8 _mm_unpackhi_epi8
// New shorter agnostic name
#define v128_ziplo64 _mm_unpacklo_epi64
#define v128_ziphi64 _mm_unpackhi_epi64
#define v128_ziplo32 _mm_unpacklo_epi32
#define v128_ziphi32 _mm_unpackhi_epi32
#define v128_ziplo16 _mm_unpacklo_epi16
#define v128_ziphi16 _mm_unpackhi_epi16
#define v128_ziplo8 _mm_unpacklo_epi8
#define v128_ziphi8 _mm_unpackhi_epi8
// AES
#define v128_aesenc _mm_aesenc_si128
#define v128_aesenclast _mm_aesenclast_si128
@@ -171,7 +161,8 @@
// Used instead of casting.
typedef union
{
__m128i m128;
v128_t v128;
__m128i m128;
uint32_t u32[4];
} __attribute__ ((aligned (16))) m128_ovly;
#define v128_ovly m128_ovly
@@ -218,19 +209,41 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
return a;
}
// Emulate broadcast & insert instructions not available in SSE2
// FYI only, not used anywhere
//#define mm128_bcast_m64( v ) _mm_shuffle_epi32( v, 0x44 )
//#define mm128_bcast_m32( v ) _mm_shuffle_epi32( v, 0x00 )
// broadcast lane 0 to all lanes
#define v128_bcast64(v) _mm_shuffle_epi32( v, 0x44 )
#define v128_bcast32(v) _mm_shuffle_epi32( v, 0x00 )
#if defined(__AVX2__)
#define v128_bcast16(v) _mm_broadcastw_epi16(v)
#else
#define v128_bcast16(v) \
v128_bcast32( v128_or( v128_sl32( v, 16 ), v ) )
#endif
// broadcast lane l to all lanes
#define v128_replane64( v, l ) \
( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x44 ) \
: _mm_shuffle_epi32( v, 0xee )
#define v128_replane32( v, l ) \
( (l) == 0 ) ? _mm_shuffle_epi32( v, 0x00 ) \
: ( (l) == 1 ) ? _mm_shuffle_epi32( v, 0x55 ) \
: ( (l) == 2 ) ? _mm_shuffle_epi32( v, 0xaa ) \
: _mm_shuffle_epi32( v, 0xff )
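
For the broadcast and replicate-lane macros above, the _mm_shuffle_epi32 control byte packs four 2-bit source indices, so 0x44 selects dwords {1,0,1,0}, i.e. 64-bit lane 0 twice. A small standalone check of those control values (SSE2 assumed, values are illustrative):

   #include <emmintrin.h>
   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
      __m128i v = _mm_set_epi32( 7, 6, 5, 4 );        // dwords: [3]=7 [2]=6 [1]=5 [0]=4

      __m128i b64 = _mm_shuffle_epi32( v, 0x44 );     // broadcast 64-bit lane 0
      __m128i b32 = _mm_shuffle_epi32( v, 0x00 );     // broadcast 32-bit lane 0
      __m128i r1  = _mm_shuffle_epi32( v, 0x55 );     // replicate 32-bit lane 1

      uint32_t o[4];
      _mm_storeu_si128( (__m128i*)o, b64 );
      printf( "bcast64:      %u %u %u %u\n", o[3], o[2], o[1], o[0] );   // 5 4 5 4
      _mm_storeu_si128( (__m128i*)o, b32 );
      printf( "bcast32:      %u %u %u %u\n", o[3], o[2], o[1], o[0] );   // 4 4 4 4
      _mm_storeu_si128( (__m128i*)o, r1 );
      printf( "replane32(1): %u %u %u %u\n", o[3], o[2], o[1], o[0] );   // 5 5 5 5
      return 0;
   }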
// Pseudo constants
#define v128_zero _mm_setzero_si128()
#define m128_zero v128_zero
#define m128_zero _mm_setzero_si128()
#if defined(__SSE4_1__)
// Bitwise AND, return 1 if result is all bits clear.
#define v128_and_eq0 _mm_testz_si128
#define v128_and_eq0 _mm_testz_si128
static inline int v128_cmpeq0( v128_t v )
{ return v128_and_eq0( v, v ); }
@@ -341,9 +354,12 @@ static inline __m128i v128_neg1_fn()
*/
#define mm128_mask_32( v, m ) mm128_xim_32( v, v, m )
// Zero 32 bit elements when corresponding bit in 4 bit mask is set.
static inline __m128i mm128_mask_32( const __m128i v, const int m )
{ return mm128_xim_32( v, v, m ); }
//static inline __m128i mm128_mask_32( const __m128i v, const int m )
//{ return mm128_xim_32( v, v, m ); }
#define v128_mask32 mm128_mask_32
// Copy element l0 of v0 to element l1 of dest and copy the remaining elements from v1.
#define v128_movlane32( v1, l1, v0, l0 ) \
@@ -483,10 +499,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Bit rotations
// Neon has fast xor-ror, useful for big blake, if it actually works.
#define v128_xror64( v1, v0, c ) v128_ror64( v128_xor( v1, v0 ) c )
// Slow bit rotation, used as last resort
#define mm128_ror_64_sse2( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
@@ -645,32 +657,55 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
#define v128_shuffle2_64( v1, v2, c ) \
_mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ), \
_mm_castsi128_pd( v2 ), c ) );
#define mm128_shuffle2_64 v128_shuffle2_64
#define mm128_shuffle2_32( v1, v2, c ) \
#define v128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
#define mm128_shuffle2_32 v128_shuffle2_32
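
v128_shuffle2_64 above always takes the result's low half from v1 and its high half from v2, using _mm_shuffle_pd as a combined shuffle and blend. A standalone SSE2 sketch of that behaviour with illustrative constants:

   #include <emmintrin.h>
   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
      __m128i v1 = _mm_set_epi64x( 0x1111, 0x1010 );   // high, low
      __m128i v2 = _mm_set_epi64x( 0x2222, 0x2020 );

      // Control bit 0 selects which qword of v1 becomes the low half,
      // control bit 1 selects which qword of v2 becomes the high half.
      __m128i r = _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( v1 ),
                                                    _mm_castsi128_pd( v2 ), 2 ) );
      uint64_t o[2];
      _mm_storeu_si128( (__m128i*)o, r );
      printf( "low %llx high %llx\n",
              (unsigned long long)o[0], (unsigned long long)o[1] );   // low 1010, high 2222
      return 0;
   }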
// Rotate vector elements across all lanes
#define mm128_swap_64( v ) _mm_shuffle_epi32( v, 0x4e )
#define v128_swap64 mm128_swap_64
#define mm128_shuflr_64 mm128_swap_64
#define mm128_shufll_64 mm128_swap_64
#define v128_shuffle16( v, c ) \
   _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, c ), c )
// Don't use as an alias for byte sized bit rotation
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define v128_shuflr32 mm128_shuflr_32
// reverse elements in vector
#define v128_swap64(v) _mm_shuffle_epi32( v, 0x4e ) // grandfathered
#define v128_rev64(v) _mm_shuffle_epi32( v, 0x4e ) // preferred
#define v128_rev32(v) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev16(v) v128_rev64( v128_shuffle16( v, 0x1b ) )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
#define v128_shufll32 mm128_shufll_32
// rotate vector elements
#define v128_shuflr32(v) _mm_shuffle_epi32( v, 0x39 )
#define v128_shufll32(v) _mm_shuffle_epi32( v, 0x93 )
#define v128_swap64_32( v ) v128_ror64( v, 32 )
#define v128_shuflr16(v) v128_shuffle16( v, 0x39 )
#define v128_shufll16(v) v128_shuffle16( v, 0x93 )
#define mm128_rev_32( v ) _mm_shuffle_epi32( v, 0x1b )
#define v128_rev32 mm128_rev_32
// Some sub-vector shuffles are identical to bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage of these versions
// is context sensitive.
// reverse elements in vector lanes
#define v128_qrev32(v) v128_ror64( v, 32 )
#define v128_swap64_32(v) v128_ror64( v, 32 ) // grandfathered
#define v128_qrev16(v) \
   _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, 0x1b ), 0x1b )
#define v128_lrev16(v) v128_ror32( v, 16 )
// alias bswap
#define v128_qrev8(v) _mm_shuffle_epi8( v, v128_8( 0,1,2,3,4,5,6,7 ) )
#define v128_lrev8(v) _mm_shuffle_epi8( v, v128_8( 4,5,6,7, 0,1,2,3 ) )
#define v128_wrev8(v) _mm_shuffle_epi8( v, v128_8( 6,7, 4,5, 2,3, 0,1 ) )
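
As the comment above notes, some sub-vector reversals coincide with bit rotations: reversing the two 32-bit halves of each 64-bit lane is the same permutation as rotating that lane by 32. A quick standalone SSE2 check (illustrative values):

   #include <emmintrin.h>
   #include <stdio.h>

   int main(void)
   {
      __m128i v = _mm_set_epi32( 0x44444444, 0x33333333, 0x22222222, 0x11111111 );

      // Rotate each 64-bit lane by 32 bits...
      __m128i ror = _mm_or_si128( _mm_srli_epi64( v, 32 ), _mm_slli_epi64( v, 32 ) );
      // ...which is the same permutation as swapping the 32-bit halves of each lane.
      __m128i rev = _mm_shuffle_epi32( v, 0xb1 );   // within-lane dword swap

      printf( "equal: %d\n",
              _mm_movemask_epi8( _mm_cmpeq_epi32( ror, rev ) ) == 0xffff );   // 1
      return 0;
   }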
// reverse bits, can it be done?
//#define v128_bitrev8( v ) vrbitq_u8
/* Not used
#if defined(__SSSE3__)
@@ -682,7 +717,6 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
#endif
*/
//
// Endian byte swap.
#if defined(__SSSE3__)
@@ -798,8 +832,7 @@ static inline __m128i mm128_bswap_16( __m128i v )
return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}
#define mm128_bswap_128( v ) \
mm128_swap_64( mm128_bswap_64( v ) )
#define mm128_bswap_128( v ) v128_qrev32( v128_bswap64( v ) )
static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s )
{
@@ -846,7 +879,7 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
d[7] = mm128_bswap_32( s[7] );
}
#define mm128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
#define v128_block_bswap32_256 mm128_block_bswap_32
static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
{
@@ -907,7 +940,7 @@ static inline void mm128_block_bswap32_512( __m128i *d, const __m128i *s )
#else
#define v128_blendv( v1, v0, mask ) \
v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
v128_or( v128_andnot( mask, v0 ), v128_and( mask, v1 ) )
#endif

View File

@@ -375,16 +375,27 @@ static inline __m256i mm256_not( const __m256i v )
// Cross lane shuffles
//
// Rotate elements across all lanes.
#define mm256_shuffle_16( v, c ) \
   _mm256_shufflehi_epi16( _mm256_shufflelo_epi16( v, c ), c )
// Swap 128 bit elements in 256 bit vector.
#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
#define mm256_shuflr_128 mm256_swap_128
#define mm256_shufll_128 mm256_swap_128
#define mm256_rev_128( v ) _mm256_permute4x64_epi64( v, 0x4e )
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Reverse 64 bit elements
#define mm256_rev_64( v ) _mm256_permute4x64_epi64( v, 0x1b )
#define mm256_rev_32( v ) \
   _mm256_permutevar8x32_epi32( v, \
                  _mm256_set_epi32( 0, 1, 2, 3, 4, 5, 6, 7 ) )
#define mm256_rev_16( v ) \
   _mm256_permute4x64_epi64( mm256_shuffle_16( v, 0x1b ), 0x1b )
/* Not used
// Rotate 256 bit vector by one 32 bit element.
@@ -423,12 +434,16 @@ static inline __m256i mm256_shufll_32( const __m256i v )
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
_mm256_castsi256_ps( v2 ), c ) );
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64
#define mm256_swap128_64(v) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_rev128_64(v) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_rev128_32(v) _mm256_shuffle_epi32( v, 0x1b )
#define mm256_rev128_16(v) mm256_shuffle_16( v, 0x1b )
#define mm256_shuflr128_32( v ) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_shufll128_32( v ) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_shuflr128_32(v) _mm256_shuffle_epi32( v, 0x39 )
#define mm256_shufll128_32(v) _mm256_shuffle_epi32( v, 0x93 )
#define mm256_shuflr128_16(v) _mm256_alignr_epi8( v, v, 2 )
#define mm256_shufll128_16(v) _mm256_alignr_epi8( v, v, 14 )
/* Not used
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
@@ -436,7 +451,19 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
*/
// Same as bit rotation but logically used as byte/word rotation.
#define mm256_swap64_32( v ) mm256_ror_64( v, 32 )
#define mm256_swap64_32( v ) mm256_ror_64( v, 32 ) // grandfathered
#define mm256_rev64_32( v ) mm256_ror_64( v, 32 )
#define mm256_shuflr64_16(v) _mm256_ror_epi64( v, 16 )
#define mm256_shufll64_16(v) _mm256_rol_epi64( v, 16 )
#define mm256_shuflr64_8(v) _mm256_ror_epi64( v, 8 )
#define mm256_shufll64_8(v) _mm256_rol_epi64( v, 8 )
#define mm256_rev32_16( v ) mm256_ror_32( v, 16 )
#define mm256_shuflr32_8(v) _mm256_ror_epi32( v, 8 )
#define mm256_shufll32_8(v) _mm256_rol_epi32( v, 8 )
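
mm256_swap64_32 / mm256_rev64_32 above treat a 32-bit rotation of each 64-bit element as a half-swap, which plain AVX2 can also express as an in-lane dword shuffle. A standalone comparison of the two forms (AVX2 assumed, values are illustrative):

   #include <immintrin.h>
   #include <stdio.h>

   int main(void)
   {
      __m256i v = _mm256_set_epi32( 8, 7, 6, 5, 4, 3, 2, 1 );

      // Rotate each 64-bit element by 32 bits with shifts...
      __m256i ror = _mm256_or_si256( _mm256_srli_epi64( v, 32 ),
                                     _mm256_slli_epi64( v, 32 ) );
      // ...or swap the dword pairs of each qword with a shuffle.
      __m256i shf = _mm256_shuffle_epi32( v, 0xb1 );

      printf( "equal: %d\n",
              _mm256_movemask_epi8( _mm256_cmpeq_epi32( ror, shf ) ) == -1 );   // 1
      return 0;
   }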
// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \

View File

@@ -15,11 +15,11 @@
// vxarq_u64( v1, v0, n ) ror( xor( v1, v0 ), n )
// vraxlq_u64( v1, v0 ) xor( rol( v1, 1 ), rol( v0, 1 ) )
// vbcaxq( v2, v1, v0 ) xor( v2, and( v1, not(v0) ) )
// vsraq_n( v1, v0, n ) add( v1, sr( v0, n ) )
//
// might not work, not tried yet:
// Doesn't work on RPi but works on OPi:
//
// vornq( v1, v0 ) or( v1, not( v0 ) )
// vsraq_n( v1, v0, n ) add( v1, sr( v0, n ) )
#define v128_t uint32x4_t // default,
#define v128u64_t uint64x2_t
@@ -31,6 +31,15 @@
#define v128_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u64_load( p ) vld1q_u64( (uint64_t*)(p) )
#define v128u64_store( p, v ) vst1q_u64( (uint64_t*)(p), v )
#define v128u32_load( p ) vld1q_u32( (uint32_t*)(p) )
#define v128u32_store( p, v ) vst1q_u32( (uint32_t*)(p), v )
#define v128u16_load( p ) vld1q_u16( (uint16_t*)(p) )
#define v128u16_store( p, v ) vst1q_u16( (uint16_t*)(p), v )
#define v128u8_load( p ) vld1q_u8( (uint8_t*)(p) )
#define v128u8_store( p, v ) vst1q_u8( (uint8_t*)(p), v )
// load & set1 combined
#define v128_load1_64(p) vld1q_dup_u64( (uint64_t*)(p) )
#define v128_load1_32(p) vld1q_dup_u32( (uint32_t*)(p) )
@@ -74,6 +83,9 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_cmpeq0 vceqzq_u64
// Not yet needed
//#define v128_cmpeq1
#define v128_cmpgt64 vcgtq_u64
#define v128_cmpgt32 vcgtq_u32
#define v128_cmpgt16 vcgtq_u16
@@ -95,31 +107,55 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_sr16 vshrq_n_u16
#define v128_sr8 vshrq_n_u8
// Maybe signed shift will work.
// Unit tested, working.
#define v128_sra64 vshrq_n_s64
#define v128_sra32 vshrq_n_s32
#define v128_sra16 vshrq_n_s16
// logic
// unary logic
#define v128_not vmvnq_u32
// binary logic
#define v128_or vorrq_u32
#define v128_and vandq_u32
#define v128_not vmvnq_u32
#define v128_xor veorq_u32
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32(v1), v0 )
#define v128_xnor( a, b ) v128_not( v128_xor( a, b ) )
#define v128_ornot vornq_u32
// ternary logic, veorq_u32 not defined
// ~v1 & v0
#define v128_andnot( v1, v0 ) vandq_u32( vmvnq_u32( v1 ), v0 )
// ~( a ^ b ), same as (~a) ^ b
#define v128_xnor( v1, v0 ) v128_not( v128_xor( v1, v0 ) )
// ~v1 | v0, x86_64 convention, first arg is not'ed
#define v128_ornot( v1, v0 ) vornq_u32( v0, v1 )
// ternary logic
// v2 ^ v1 ^ v0
// veor3q_u32 not defined
//#define v128_xor3 veor3q_u32
#define v128_xor3( v2, v1, v0 ) veorq_u32( v2, veorq_u32( v1, v0 ) )
#define v128_nor vornq_u32
// v2 & v1 & v0
#define v128_and3( v2, v1, v0 ) v128_and( v2, v128_and( v1, v0 ) )
// v2 | v1 | v0
#define v128_or3( v2, v1, v0 ) v128_or( v2, v128_or( v1, v0 ) )
// a ^ ( ~b & c )
#define v128_xorandnot( v2, v1, v0 ) v128_xor( v2, v128_andnot( v1, v0 ) )
#define v128_and3( a, b, c ) v128_and( a, v128_and( b, c ) )
#define v128_or3( a, b, c ) v128_or( a, v128_or( b, c ) )
#define v128_xorand( a, b, c ) v128_xor( a, v128_and( b, c ) )
#define v128_andxor( a, b, c ) v128_and( a, v128_xor( b, c ))
#define v128_xoror( a, b, c ) v128_xor( a, v128_or( b, c ) )
#define v128_orand( a, b, c ) v128_or( a, v128_and( b, c ) )
// a ^ ( b & c )
#define v128_xorand( v2, v1, v0 ) v128_xor( v2, v128_and( v1, v0 ) )
// a & ( b ^ c )
#define v128_andxor( v2, v1, v0 ) v128_and( v2, v128_xor( v1, v0 ) )
// a ^ ( b | c )
#define v128_xoror( v2, v1, v0 ) v128_xor( v2, v128_or( v1, v0 ) )
// v2 | ( v1 & v0 )
#define v128_orand( v2, v1, v0 ) v128_or( v2, v128_and( v1, v0 ) )
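
The composite logic macros above simply nest the two-input NEON ops; v128_xorandnot( v2, v1, v0 ), for example, computes v2 ^ ( ~v1 & v0 ). A standalone AArch64 NEON sketch of that expansion with illustrative constants:

   #include <arm_neon.h>
   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
      uint32x4_t v2 = vdupq_n_u32( 0xf0f0f0f0 );
      uint32x4_t v1 = vdupq_n_u32( 0x0000ffff );
      uint32x4_t v0 = vdupq_n_u32( 0x33333333 );

      // v2 ^ ( ~v1 & v0 )
      uint32x4_t r = veorq_u32( v2, vandq_u32( vmvnq_u32( v1 ), v0 ) );

      uint32_t o[4];
      vst1q_u32( o, r );
      printf( "%08x\n", o[0] );   // c3c3f0f0
      return 0;
   }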
// shift 2 concatenated vectors right.
#define v128_alignr64( v1, v0, c ) vextq_u64( v0, v1, c )
@@ -127,24 +163,15 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
#define v128_alignr8( v1, v0, c ) vextq_u8( v0, v1, c )
// Interleave high or low halves of 2 vectors.
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v0, v1 )
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v0, v1 )
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v0, v1 )
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v0, v1 )
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v0, v1 )
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v0, v1 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v0, v1 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v0, v1 )
#define v128_unpacklo64( v1, v0 ) vzip1q_u64( v1, v0 )
#define v128_unpackhi64( v1, v0 ) vzip2q_u64( v1, v0 )
#define v128_unpacklo32( v1, v0 ) vzip1q_u32( v1, v0 )
#define v128_unpackhi32( v1, v0 ) vzip2q_u32( v1, v0 )
#define v128_unpacklo16( v1, v0 ) vzip1q_u16( v1, v0 )
#define v128_unpackhi16( v1, v0 ) vzip2q_u16( v1, v0 )
#define v128_unpacklo8( v1, v0 ) vzip1q_u8( v1, v0 )
#define v128_unpackhi8( v1, v0 ) vzip2q_u8( v1, v0 )
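
The vzip argument swap above makes v128_unpacklo64( v1, v0 ) return { v1 lane 0, v0 lane 0 }, matching the x86 _mm_unpacklo_epi64 convention where the first argument supplies the lower element. A standalone AArch64 check (illustrative values):

   #include <arm_neon.h>
   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
      const uint64_t a1[2] = { 0x11, 0x12 };
      const uint64_t a0[2] = { 0x01, 0x02 };
      uint64x2_t v1 = vld1q_u64( a1 );
      uint64x2_t v0 = vld1q_u64( a0 );

      // Matches _mm_unpacklo_epi64( v1, v0 ): lane 0 from v1, lane 1 from v0.
      uint64x2_t lo = vzip1q_u64( v1, v0 );
      uint64x2_t hi = vzip2q_u64( v1, v0 );

      printf( "lo: %llx %llx\n", (unsigned long long)vgetq_lane_u64( lo, 0 ),
                                 (unsigned long long)vgetq_lane_u64( lo, 1 ) );   // 11 1
      printf( "hi: %llx %llx\n", (unsigned long long)vgetq_lane_u64( hi, 0 ),
                                 (unsigned long long)vgetq_lane_u64( hi, 1 ) );   // 12 2
      return 0;
   }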
// Shorter agnostic names for unpack using NEON-like syntax
#define v128_ziplo64 vzip1q_u64
#define v128_ziphi64 vzip2q_u64
#define v128_ziplo32 vzip1q_u32
#define v128_ziphi32 vzip2q_u32
#define v128_ziplo16 vzip1q_u16
#define v128_ziphi16 vzip2q_u16
#define v128_ziplo8 vzip1q_u8
#define v128_ziphi8 vzip2q_u8
// AES
// consistent with Intel AES, break up for optimizing
@@ -156,10 +183,22 @@ static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
typedef union
{
uint32x4_t v128;
uint32x4_t m128;
uint32_t u32[4];
uint32_t u32[4];
} __attribute__ ((aligned (16))) v128_ovly;
// Broadcast lane 0 to all lanes
#define v128_bcast64(v) vdupq_laneq_u64( v, 0 )
#define v128_bcast32(v) vdupq_laneq_u32( v, 0 )
#define v128_bcast16(v) vdupq_laneq_u16( v, 0 )
// Replicate (broadcast) lane l to all lanes
#define v128_replane64( v, l ) vdupq_laneq_u64( v, l )
#define v128_replane32( v, l ) vdupq_laneq_u32( v, l )
#define v128_replane16( v, l ) vdupq_laneq_u16( v, l )
// pointer indexing
#define casti_v128( p, i ) (((uint32x4_t*)(p))[i])
#define cast_v128( p ) (*((uint32x4_t*)(p)))
@@ -253,12 +292,13 @@ typedef union
#define v128_negate16 vnegq_s16
#define v128_negate8 vnegq_s8
// Nothing else seems to work
static inline void v128_memset_zero( void *dst, const int n )
{
for( int i = 0; i < n; i++ )
((uint32x4_t*)dst)[n] = (uint32x4_t)(uint128_t)0;
memset( dst, 0, n*16 );
}
static inline void v128_memset( void *dst, const void *src, const int n )
{
for( int i = 0; i < n; i++ )
@@ -271,51 +311,40 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
((uint32x4_t*)dst)[i] = ((const uint32x4_t*)src)[i];
}
// how to build a bitmask from vector elements?
// how to build a bitmask from vector elements? Efficiently???
#define v128_movmask32
#define v128_movmask64
// Bit rotation
//TODO, maybe, Optimize 64 bit rotations
// Fall back for odd bit rotations
static inline uint64x2_t v128_ror64( uint64x2_t v, int c )
{ return vsriq_n_u64( vshlq_n_u64( v, 64-c ), v, c ); }
static inline uint64x2_t v128_rol64( uint64x2_t v, int c )
{ return vsriq_n_u64( vshlq_n_u64( v, c ), v, 64-c ); }
static inline uint32x4_t v128_ror32( uint32x4_t v, int c )
{ return vsriq_n_u32( vshlq_n_u32( v, 32-c ), v, c ); }
static inline uint32x4_t v128_rol32( uint32x4_t v, int c )
{ return vsriq_n_u32( vshlq_n_u32( v, c ), v, 32-c ); }
static inline uint16x8_t v128_ror16( uint16x8_t v, int c )
{ return vsriq_n_u16( vshlq_n_u16( v, 16-c ), v, c ); }
static inline uint16x8_t v128_rol16( uint16x8_t v, int c )
{ return vsriq_n_u16( vshlq_n_u16( v, c ), v, 16-c ); }
static inline uint8x16_t v128_ror8( uint8x16_t v, int c )
{ return vsriq_n_u8( vshlq_n_u8( v, 8-c ), v, c ); }
static inline uint8x16_t v128_rol8( uint16x8_t v, int c )
{ return vsriq_n_u8( vshlq_n_u8( v, c ), v, 8-c ); }
/*
// Optimized for half element rotations (swap)
#define v128_ror64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( v ) : v128_ror64_neon( v, c )
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
: vsriq_n_u64( vshlq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
#define v128_rol64( v, c ) \
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( v ) : v128_rol64_neon( v, c )
( (c) == 32 ) ? (uint64x2_t)vrev64q_u32( ((uint64x2_t)v) ) \
: vsliq_n_u64( vshrq_n_u64( ((uint64x2_t)v), 64-c ), ((uint64x2_t)v), c )
#define v128_ror32( v, c ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( v ) : v128_ror32_neon( v, c )
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
: vsriq_n_u32( vshlq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
#define v128_rol32( v, c ) \
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( v ) : v128_rol32_neon( v, c )
*/
( (c) == 16 ) ? (uint32x4_t)vrev32q_u16( ((uint32x4_t)v) ) \
: vsliq_n_u32( vshrq_n_u32( ((uint32x4_t)v), 32-c ), ((uint32x4_t)v), c )
#define v128_ror16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
: vsriq_n_u16( vshlq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
#define v128_rol16( v, c ) \
( (c) == 8 ) ? (uint16x8_t)vrev16q_u8( ((uint16x8_t)v) ) \
: vsliq_n_u16( vshrq_n_u16( ((uint16x8_t)v), 16-c ), ((uint16x8_t)v), c )
#define v128_ror8( v, c ) \
vsriq_n_u8( vshlq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c )
#define v128_rol8( v, c ) \
vsliq_n_u8( vshrq_n_u8( ((uint8x16_t)v), 8-c ), ((uint8x16_t)v), c )
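
The rotate macros above special-case the half-width counts with a vrev element reverse and otherwise build the rotation from a shift plus shift-right-and-insert. A standalone AArch64 sketch comparing both paths for a 64-bit rotate by 32 (illustrative value):

   #include <arm_neon.h>
   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
      const uint64_t in[2] = { 0x1122334455667788ULL, 0x99aabbccddeeff00ULL };
      uint64x2_t v = vld1q_u64( in );

      // Generic path: shift left, then shift-right-and-insert the rotated-out bits.
      uint64x2_t generic = vsriq_n_u64( vshlq_n_u64( v, 32 ), v, 32 );
      // Special case for c == 32: reverse the 32-bit halves of each 64-bit lane.
      uint64x2_t special =
         vreinterpretq_u64_u32( vrev64q_u32( vreinterpretq_u32_u64( v ) ) );

      printf( "generic: %016llx, special: %016llx\n",
              (unsigned long long)vgetq_lane_u64( generic, 0 ),
              (unsigned long long)vgetq_lane_u64( special, 0 ) );   // both 5566778811223344
      return 0;
   }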
#define v128_2ror64( v1, v0, c ) \
{ \
@@ -343,7 +372,7 @@ static inline uint8x16_t v128_rol8( uint16x8_t v, int c )
uint32x4_t t1 = vshrq_n_u32( v1, c ); \
v0 = vsliq_n_u32( v0, 32-(c) ); \
v1 = vsliq_n_u32( v1, 32-(c) ); \
v0 = vorrq_u32( v0, t0 ); \
v0 = vorrq_u32( v0, t0 ); \
v1 = vorrq_u32( v1, t1 ); \
}
@@ -357,16 +386,6 @@ static inline uint8x16_t v128_rol8( uint16x8_t v, int c )
v1 = vorrq_u32( v1, t1 ); \
}
// vector rotation , size?
static inline uint32x4_t v128_swap64( uint32x4_t v )
{ return vextq_u64( v, v, 1 ); }
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{ return vextq_u32( v, v, 1 ); }
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
// Cross lane shuffles, no programmable shuffle in NEON
// vector mask, use as last resort. prefer rev, alignr, etc
@@ -395,29 +414,54 @@ static inline uint32x4_t v128_shufll32( uint32x4_t v )
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 1] ], \
((uint8_t*)&v)[ ((uint8_t*)(&vmask))[ 0] ] )
#define v128_swap64_32( v ) vrev64q_u32( v )
#define v128_v128_shuflr64_16( v ) v128_ror_64( v, 16 )
#define v128_v128_shufll64_16( v ) v128_rol_64( v, 16 )
// sub-vector shuffles sometimes mirror bit rotation. Shuffle is faster.
// Bit rotation already promotes faster widths. Usage is context sensitive.
// preferred.
// Don't use as an alias for byte sized bit rotation
#define v128_swap32_16( v ) vrev64q_u16( v )
#define v128_v128_shuflr32_8( v ) v128_ror_32( v, 8 )
#define v128_v128_shufll32_8( v ) v128_rol_32( v, 8 )
// reverse elements in vector lanes
#define v128_qrev32 vrev64q_u32
#define v128_swap64_32 vrev64q_u32 // grandfathered
// reverse elements
#define v128_rev32( v ) vrev64q_u32( v )
#define v128_rev16( v ) vrev64q_u16( v )
#define v128_rev8( v ) vrev64q_u8( v )
#define v128_qrev16 vrev64q_u16
#define v128_lrev16 vrev32q_u16
// reverse bits, nothing like it in x86_64
#define v128_bitrev8( v ) vrbitq_u8
// aka bswap
#define v128_qrev8 vrev64q_u8
#define v128_lrev8 vrev32q_u8
#define v128_wrev8 vrev16q_u8
// full vector rotation
// reverse elements in vector
static inline uint64x2_t v128_rev64( uint64x2_t v )
{ return vextq_u64( v, v, 1 ); }
#define v128_swap64 v128_rev64 // grandfathered
#define v128_rev32(v) v128_rev64( v128_qrev32( v ) )
#define v128_rev16(v) v128_rev64( v128_qrev16( v ) )
// shuffle-rotate vector elements
static inline uint32x4_t v128_shuflr32( uint32x4_t v )
{ return vextq_u32( v, v, 1 ); }
static inline uint32x4_t v128_shufll32( uint32x4_t v )
{ return vextq_u32( v, v, 3 ); }
static inline uint16x8_t v128_shuflr16( uint16x8_t v )
{ return vextq_u16( v, v, 1 ); }
static inline uint16x8_t v128_shufll16( uint16x8_t v )
{ return vextq_u16( v, v, 7 ); }
// reverse bits in bytes, nothing like it in x86_64
#define v128_bitrev8 vrbitq_u8
// reverse byte order
#define v128_bswap16 vrev16q_u8
#define v128_bswap32 vrev32q_u8
#define v128_bswap64 vrev64q_u8
#define v128_bswap128(v) v128_swap64( v128_bswap64(v) )
#define v128_bswap256(p) v128_bswap128( (p)[0], (p)[1] )
#define v128_bswap16(v) (uint16x8_t)vrev16q_u8( (uint8x16_t)(v) )
#define v128_bswap32(v) (uint32x4_t)vrev32q_u8( (uint8x16_t)(v) )
#define v128_bswap64(v) (uint64x2_t)vrev64q_u8( (uint8x16_t)(v) )
#define v128_bswap128(v) (uint32x4_t)v128_swap64( v128_bswap64(v) )
#define v128_bswap256(p) v128_bswap128( (p)[0], (p)[1] )
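
The byte-swap macros above reinterpret the vector as bytes and apply vrev*q_u8, so v128_bswap32 is a byte reverse within each 32-bit element. A standalone AArch64 check with an illustrative value:

   #include <arm_neon.h>
   #include <stdio.h>
   #include <stdint.h>

   int main(void)
   {
      const uint32_t in[4] = { 0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 };
      uint32x4_t v = vld1q_u32( in );

      // Byte-reverse each 32-bit element: the NEON equivalent of a per-word bswap.
      uint32x4_t sw = vreinterpretq_u32_u8( vrev32q_u8( vreinterpretq_u8_u32( v ) ) );

      uint32_t out[4];
      vst1q_u32( out, sw );
      printf( "%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3] );
      // 44332211 88776655 ccbbaa99 00ffeedd
      return 0;
   }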
// Useful for x86_64 but does nothing for ARM
#define v128_block_bswap32( dst, src ) \