// convert blake256 32 bit to use 64 bit with serial vectoring
//
// cut calls to GS in half
//
// combine V
// v0 = {V0,V1}
// v1 = {V2,V3}
// v2 = {V4,V5}
// v3 = {V6,V7}
// v4 = {V8,V9}
// v5 = {VA,VB}
// v6 = {VC,VD}
// v7 = {VE,VF}
//
// v6x = {VD,VC}   swap(VC,VD) swap(v6)
// v7x = {VF,VE}   swap(VE,VF) swap(v7)
//
// V0 = v1v0
// V1 = v3v2
// V2 = v5v4
// V3 = v7v6
// V4 = v9v8
// V5 = vbva
// V6 = vdvc
// V7 = vfve
//
// The rotate in ROUND is to effect the straddle and unstraddle for the
// third and fourth iterations of GS.
// It concatenates 2 contiguous 256 bit vectors and extracts the middle
// 256 bits. After the transform they must be restored with only the
// chosen bits modified in the original 2 vectors.
// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
// 256 bit vector, and saves the untouched bits temporarily in arg0, the
// "high" 256 bit vector. Simply reverse the process to restore the data
// to its original positions.

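// The sketch below is only an illustrative assumption of how that 1x128 bit
// rotate of a concatenated 512 bit pair could be built from AVX2 permutes;
// it is not the simd-utils definition, and the helper name is hypothetical.
// hi/lo correspond to arg0/arg1 described above; <immintrin.h> is assumed.
static inline void ror1x128_512_sketch( __m256i *hi, __m256i *lo )
{
   // new lo: low half = old high half of lo, high half = old low half of hi
   // new hi: low half = old high half of hi, high half = old low half of lo
   __m256i t = _mm256_permute2x128_si256( *lo, *hi, 0x21 );
   *hi       = _mm256_permute2x128_si256( *hi, *lo, 0x21 );
   *lo       = t;
}
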
// Use standard 4way when AVX2 is not available; use x2 mode with AVX2.
//
// Data is organised the same as 32 bit 4 way, in effect serial vectoring
// on top of parallel vectoring. Same data in the same place, just taking
// two chunks at a time (see the layout sketch after these notes).
//
// Transparent to user, x2 mode used when AVX2 detected.
// Use existing 4way context but revert to scalar types.
// Same interleave function (128 bit) or x2 with 256 bit?
// User transparency would have to apply to interleave as well.
//
// Use common 4way update and close.

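// Hedged illustration (not code from this file) of the shared data layout the
// notes above rely on: 4way interleaving stores word i of each of the four
// input streams as one 128 bit group, so x2 mode simply loads two adjacent
// groups as a single __m256i. The function name is hypothetical; <stdint.h>
// and <stddef.h> are assumed.
static inline void interleave_4x32_sketch( uint32_t *dst, const uint32_t *s0,
                                           const uint32_t *s1, const uint32_t *s2,
                                           const uint32_t *s3, size_t nwords )
{
   for ( size_t i = 0; i < nwords; i++ )
   {
      dst[ 4*i + 0 ] = s0[ i ];   // lane 0
      dst[ 4*i + 1 ] = s1[ i ];   // lane 1
      dst[ 4*i + 2 ] = s2[ i ];   // lane 2
      dst[ 4*i + 3 ] = s3[ i ];   // lane 3
   }
   // 4way view: 128 bit vector i holds word i of all four lanes.
   // x2 view:   256 bit vector i holds words 2i and 2i+1 of all four lanes.
}
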
/*
typedef struct {
   unsigned char buf[64<<2];
   uint32_t H[8<<2];
   uint32_t S[4<<2];
   size_t ptr;
   uint32_t T0, T1;
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
} blakex2_4way_small_context __attribute__ ((aligned (64)));
*/

static void
blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
                     const uint32_t *salt, int rounds )
{
   casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
   casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
   casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
   casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
   casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
   casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
   casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
   casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );

   casti_m128i( ctx->S, 0 ) = m128_zero;
   casti_m128i( ctx->S, 1 ) = m128_zero;
   casti_m128i( ctx->S, 2 ) = m128_zero;
   casti_m128i( ctx->S, 3 ) = m128_zero;
/*
   sc->S[0] = _mm_set1_epi32( salt[0] );
   sc->S[1] = _mm_set1_epi32( salt[1] );
   sc->S[2] = _mm_set1_epi32( salt[2] );
   sc->S[3] = _mm_set1_epi32( salt[3] );
*/
   ctx->T0 = ctx->T1 = 0;
   ctx->ptr = 0;
   ctx->rounds = rounds;
}

static void
blake32x2_4way( blake_4way_small_context *ctx, const void *data, size_t len )
{
   __m256i *buf = (__m256i*)ctx->buf;
   size_t bptr = ctx->ptr << 2;   // absolute bytes, all 4 lanes
   size_t vptr = ctx->ptr >> 3;   // 256 bit vector index
   size_t blen = len << 2;
//   unsigned char *buf = ctx->buf;
//   size_t ptr  = ctx->ptr<<4;   // repurposed
   DECL_STATE32x2_4WAY

//   buf = sc->buf;
//   ptr = sc->ptr;

// adjust len for use with ptr, clen, all absolute bytes.
//   int blen = len<<2;

   if ( blen < (sizeof ctx->buf) - bptr )
   {
      memcpy( buf + vptr, data, blen );
      bptr += blen;
      ctx->ptr = bptr >> 2;
      return;
   }

   READ_STATE32x2_4WAY( ctx );
   while ( blen > 0 )
   {
      size_t clen;

      clen = ( sizeof ctx->buf ) - bptr;
      if ( clen > blen )
         clen = blen;
      memcpy( buf + vptr, data, clen );
      bptr += clen;
      vptr = bptr >> 5;
      data = (const unsigned char *)data + clen;
      blen -= clen;
      if ( bptr == sizeof ctx->buf )
      {
         if ( ( T0 = T0 + 512 ) < 512 )   // not needed, will never rollover
            T1 += 1;
         COMPRESS32x2_4WAY( ctx->rounds );
         bptr = 0;
         vptr = 0;
      }
   }
   WRITE_STATE32x2_4WAY( ctx );
   ctx->ptr = bptr >> 2;
}

static void
blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
{
   __m256i buf[8] __attribute__ ((aligned (64)));
   size_t   ptr     = ctx->ptr;
   size_t   vptr    = ctx->ptr >> 3;   // 256 bit vector index
   unsigned bit_len = ( (unsigned)ptr << 3 );   // one lane
   uint32_t th      = ctx->T1;
   uint32_t tl      = ctx->T0 + bit_len;

   if ( ptr == 0 )
   {
      ctx->T0 = 0xFFFFFE00UL;
      ctx->T1 = 0xFFFFFFFFUL;
   }
   else if ( ctx->T0 == 0 )
   {
      ctx->T0 = 0xFFFFFE00UL + bit_len;
      ctx->T1 -= 1;
   }
   else
      ctx->T0 -= 512 - bit_len;

   // memset doesn't do ints
   buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );

   if ( vptr < 5 )
   {
      memset_zero_256( buf + vptr + 1, 6 - vptr );
      buf[ 6 ] = _mm256_or_si256( buf[ 6 ], _mm256_set_epi32(
                    0x01000000UL, 0x01000000UL, 0x01000000UL, 0x01000000UL,
                    0, 0, 0, 0 ) );
      buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
                                                   th, th, th, th ) );
      blake32x2_4way( ctx, buf + vptr, 64 - ptr );
   }
   else
   {
      memset_zero_256( buf + vptr + 1, 7 - vptr );
      blake32x2_4way( ctx, buf + vptr, 64 - ptr );
      ctx->T0 = 0xFFFFFE00UL;
      ctx->T1 = 0xFFFFFFFFUL;
      memset_zero_256( buf, 6 );   // zero words 0..11 of the final block
      buf[ 6 ] = _mm256_set_epi32(
                    0x01000000UL, 0x01000000UL, 0x01000000UL, 0x01000000UL,
                    0, 0, 0, 0 );
      buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
                                                   th, th, th, th ) );
      blake32x2_4way( ctx, buf, 64 );
   }

   casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
   casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
   casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
   casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
}

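// A hedged end-to-end sketch of how the x2 wrappers above are meant to be
// driven; shown only to document the intended call sequence, not an existing
// API. IV256 is assumed to be the usual Blake-256 IV table defined elsewhere
// in this file, the salt argument is currently ignored by
// blake32x2_4way_init so NULL is passed, and len is bytes per lane, matching
// the close routine's own calls.
static void blake256x2_4way_hash_sketch( void *vhash, const void *vdata,
                                         size_t len )
{
   blake_4way_small_context ctx;
   blake32x2_4way_init( &ctx, IV256, NULL, 14 );   // 14 rounds for blake256
   blake32x2_4way( &ctx, vdata, len );             // interleaved 4 lane input
   blake32x2_4way_close( &ctx, vhash );            // 4 interleaved 32 byte digests
}
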
#define DECL_STATE32x2_4WAY \
   __m256i H0, H1, H2, H3; \
   __m256i S0, S1; \
   uint32_t T0, T1;

#define READ_STATE32x2_4WAY(state) do \
{ \
   H0 = casti_m256i( state->H, 0 ); \
   H1 = casti_m256i( state->H, 1 ); \
   H2 = casti_m256i( state->H, 2 ); \
   H3 = casti_m256i( state->H, 3 ); \
   S0 = casti_m256i( state->S, 0 ); \
   S1 = casti_m256i( state->S, 1 ); \
   T0 = state->T0; \
   T1 = state->T1; \
} while (0)

#define WRITE_STATE32x2_4WAY(state) do { \
   casti_m256i( state->H, 0 ) = H0; \
   casti_m256i( state->H, 1 ) = H1; \
   casti_m256i( state->H, 2 ) = H2; \
   casti_m256i( state->H, 3 ) = H3; \
   casti_m256i( state->S, 0 ) = S0; \
   casti_m256i( state->S, 1 ) = S1; \
   state->T0 = T0; \
   state->T1 = T1; \
} while (0)

#define GSx2_4WAY( m0, m1, m2, m3, c0, c1, c2, c3, a, b, c, d ) do \
{ \
   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
          _mm256_set_epi32( c1, c3, c1, c3, c1, c3, c1, c3 ), \
          _mm256_set_epi32( m0, m2, m0, m2, m0, m2, m0, m2 ) ), b ), a ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
          _mm256_set_epi32( c0, c2, c0, c2, c0, c2, c0, c2 ), \
          _mm256_set_epi32( m1, m3, m1, m3, m1, m3, m1, m3 ) ), b ), a ); \
   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
   c = _mm256_add_epi32( c, d ); \
   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
} while (0)

#define ROUND_Sx2_4WAY(r) do \
{ \
   GSx2_4WAY( Mx(r, 0), Mx(r, 1), Mx(r, 2), Mx(r, 3), \
              CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
   GSx2_4WAY( Mx(r, 4), Mx(r, 5), Mx(r, 6), Mx(r, 7), \
              CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
   mm256_ror1x128_512( V3, V2 ); \
   mm256_ror1x128_512( V6, V7 ); \
   GSx2_4WAY( Mx(r, 8), Mx(r, 9), Mx(r, A), Mx(r, B), \
              CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
   GSx2_4WAY( Mx(r, C), Mx(r, D), Mx(r, E), Mx(r, F), \
              CSx(r, C), CSx(r, D), CSx(r, E), CSx(r, F), V1, V3, V4, V6 ); \
   mm256_rol1x128_512( V2, V3 ); \
   mm256_rol1x128_512( V7, V6 ); \
} while (0)

#define COMPRESS32x2_4WAY( rounds ) do \
{ \
   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
   unsigned r; \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
   V3 = H3; \
   V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
                                                CS0, CS0, CS0, CS0 ) ); \
   V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
                                                CS2, CS2, CS2, CS2 ) ); \
   V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
                          _mm256_set_epi32( CS5, CS5, CS5, CS5, \
                                            CS4, CS4, CS4, CS4 ) ); \
   V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
                          _mm256_set_epi32( CS7, CS7, CS7, CS7, \
                                            CS6, CS6, CS6, CS6 ) ); \
   M0 = mm256_bswap_32( buf[ 0] ); \
   M1 = mm256_bswap_32( buf[ 1] ); \
   M2 = mm256_bswap_32( buf[ 2] ); \
   M3 = mm256_bswap_32( buf[ 3] ); \
   M4 = mm256_bswap_32( buf[ 4] ); \
   M5 = mm256_bswap_32( buf[ 5] ); \
   M6 = mm256_bswap_32( buf[ 6] ); \
   M7 = mm256_bswap_32( buf[ 7] ); \
   ROUND_Sx2_4WAY(0); \
   ROUND_Sx2_4WAY(1); \
   ROUND_Sx2_4WAY(2); \
   ROUND_Sx2_4WAY(3); \
   ROUND_Sx2_4WAY(4); \
   ROUND_Sx2_4WAY(5); \
   ROUND_Sx2_4WAY(6); \
   ROUND_Sx2_4WAY(7); \
   if (rounds == 14) \
   { \
      ROUND_Sx2_4WAY(8); \
      ROUND_Sx2_4WAY(9); \
      ROUND_Sx2_4WAY(0); \
      ROUND_Sx2_4WAY(1); \
      ROUND_Sx2_4WAY(2); \
      ROUND_Sx2_4WAY(3); \
   } \
   /* packed V4..V7 hold the original V8..VF, packed S0/S1 hold S0..S3 */ \
   H0 = _mm256_xor_si256( _mm256_xor_si256( \
                 _mm256_xor_si256( V4, V0 ), S0 ), H0 ); \
   H1 = _mm256_xor_si256( _mm256_xor_si256( \
                 _mm256_xor_si256( V5, V1 ), S1 ), H1 ); \
   H2 = _mm256_xor_si256( _mm256_xor_si256( \
                 _mm256_xor_si256( V6, V2 ), S0 ), H2 ); \
   H3 = _mm256_xor_si256( _mm256_xor_si256( \
                 _mm256_xor_si256( V7, V3 ), S1 ), H3 ); \
} while (0)