mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.7.5
This commit is contained in:
@@ -536,22 +536,22 @@ do { \
|
||||
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
|
||||
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
M[0x0] = mm_byteswap_epi32( *(buf + 0) ); \
|
||||
M[0x1] = mm_byteswap_epi32( *(buf + 1) ); \
|
||||
M[0x2] = mm_byteswap_epi32( *(buf + 2) ); \
|
||||
M[0x3] = mm_byteswap_epi32( *(buf + 3) ); \
|
||||
M[0x4] = mm_byteswap_epi32( *(buf + 4) ); \
|
||||
M[0x5] = mm_byteswap_epi32( *(buf + 5) ); \
|
||||
M[0x6] = mm_byteswap_epi32( *(buf + 6) ); \
|
||||
M[0x7] = mm_byteswap_epi32( *(buf + 7) ); \
|
||||
M[0x8] = mm_byteswap_epi32( *(buf + 8) ); \
|
||||
M[0x9] = mm_byteswap_epi32( *(buf + 9) ); \
|
||||
M[0xA] = mm_byteswap_epi32( *(buf + 10) ); \
|
||||
M[0xB] = mm_byteswap_epi32( *(buf + 11) ); \
|
||||
M[0xC] = mm_byteswap_epi32( *(buf + 12) ); \
|
||||
M[0xD] = mm_byteswap_epi32( *(buf + 13) ); \
|
||||
M[0xE] = mm_byteswap_epi32( *(buf + 14) ); \
|
||||
M[0xF] = mm_byteswap_epi32( *(buf + 15) ); \
|
||||
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
|
||||
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
|
||||
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
|
||||
M[0x3] = mm_byteswap_32( *(buf + 3) ); \
|
||||
M[0x4] = mm_byteswap_32( *(buf + 4) ); \
|
||||
M[0x5] = mm_byteswap_32( *(buf + 5) ); \
|
||||
M[0x6] = mm_byteswap_32( *(buf + 6) ); \
|
||||
M[0x7] = mm_byteswap_32( *(buf + 7) ); \
|
||||
M[0x8] = mm_byteswap_32( *(buf + 8) ); \
|
||||
M[0x9] = mm_byteswap_32( *(buf + 9) ); \
|
||||
M[0xA] = mm_byteswap_32( *(buf + 10) ); \
|
||||
M[0xB] = mm_byteswap_32( *(buf + 11) ); \
|
||||
M[0xC] = mm_byteswap_32( *(buf + 12) ); \
|
||||
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
|
||||
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
|
||||
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
|
||||
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
|
||||
ROUND_S_4WAY(r); \
|
||||
H0 = _mm_xor_si128( _mm_xor_si128( \
|
||||
@@ -601,22 +601,22 @@ do { \
|
||||
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
|
||||
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
|
||||
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
|
||||
M0 = mm_byteswap_epi32( * buf ); \
|
||||
M1 = mm_byteswap_epi32( *(buf+1) ); \
|
||||
M2 = mm_byteswap_epi32( *(buf+2) ); \
|
||||
M3 = mm_byteswap_epi32( *(buf+3) ); \
|
||||
M4 = mm_byteswap_epi32( *(buf+4) ); \
|
||||
M5 = mm_byteswap_epi32( *(buf+5) ); \
|
||||
M6 = mm_byteswap_epi32( *(buf+6) ); \
|
||||
M7 = mm_byteswap_epi32( *(buf+7) ); \
|
||||
M8 = mm_byteswap_epi32( *(buf+8) ); \
|
||||
M9 = mm_byteswap_epi32( *(buf+9) ); \
|
||||
MA = mm_byteswap_epi32( *(buf+10) ); \
|
||||
MB = mm_byteswap_epi32( *(buf+11) ); \
|
||||
MC = mm_byteswap_epi32( *(buf+12) ); \
|
||||
MD = mm_byteswap_epi32( *(buf+13) ); \
|
||||
ME = mm_byteswap_epi32( *(buf+14) ); \
|
||||
MF = mm_byteswap_epi32( *(buf+15) ); \
|
||||
M0 = mm_byteswap_32( * buf ); \
|
||||
M1 = mm_byteswap_32( *(buf+1) ); \
|
||||
M2 = mm_byteswap_32( *(buf+2) ); \
|
||||
M3 = mm_byteswap_32( *(buf+3) ); \
|
||||
M4 = mm_byteswap_32( *(buf+4) ); \
|
||||
M5 = mm_byteswap_32( *(buf+5) ); \
|
||||
M6 = mm_byteswap_32( *(buf+6) ); \
|
||||
M7 = mm_byteswap_32( *(buf+7) ); \
|
||||
M8 = mm_byteswap_32( *(buf+8) ); \
|
||||
M9 = mm_byteswap_32( *(buf+9) ); \
|
||||
MA = mm_byteswap_32( *(buf+10) ); \
|
||||
MB = mm_byteswap_32( *(buf+11) ); \
|
||||
MC = mm_byteswap_32( *(buf+12) ); \
|
||||
MD = mm_byteswap_32( *(buf+13) ); \
|
||||
ME = mm_byteswap_32( *(buf+14) ); \
|
||||
MF = mm_byteswap_32( *(buf+15) ); \
|
||||
ROUND_S_4WAY(0); \
|
||||
ROUND_S_4WAY(1); \
|
||||
ROUND_S_4WAY(2); \
|
||||
@@ -722,22 +722,22 @@ do { \
|
||||
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
|
||||
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
|
||||
M[0x0] = mm256_byteswap_epi64( *(buf+0) ); \
|
||||
M[0x1] = mm256_byteswap_epi64( *(buf+1) ); \
|
||||
M[0x2] = mm256_byteswap_epi64( *(buf+2) ); \
|
||||
M[0x3] = mm256_byteswap_epi64( *(buf+3) ); \
|
||||
M[0x4] = mm256_byteswap_epi64( *(buf+4) ); \
|
||||
M[0x5] = mm256_byteswap_epi64( *(buf+5) ); \
|
||||
M[0x6] = mm256_byteswap_epi64( *(buf+6) ); \
|
||||
M[0x7] = mm256_byteswap_epi64( *(buf+7) ); \
|
||||
M[0x8] = mm256_byteswap_epi64( *(buf+8) ); \
|
||||
M[0x9] = mm256_byteswap_epi64( *(buf+9) ); \
|
||||
M[0xA] = mm256_byteswap_epi64( *(buf+10) ); \
|
||||
M[0xB] = mm256_byteswap_epi64( *(buf+11) ); \
|
||||
M[0xC] = mm256_byteswap_epi64( *(buf+12) ); \
|
||||
M[0xD] = mm256_byteswap_epi64( *(buf+13) ); \
|
||||
M[0xE] = mm256_byteswap_epi64( *(buf+14) ); \
|
||||
M[0xF] = mm256_byteswap_epi64( *(buf+15) ); \
|
||||
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
|
||||
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
|
||||
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
|
||||
M[0x3] = mm256_byteswap_64( *(buf+3) ); \
|
||||
M[0x4] = mm256_byteswap_64( *(buf+4) ); \
|
||||
M[0x5] = mm256_byteswap_64( *(buf+5) ); \
|
||||
M[0x6] = mm256_byteswap_64( *(buf+6) ); \
|
||||
M[0x7] = mm256_byteswap_64( *(buf+7) ); \
|
||||
M[0x8] = mm256_byteswap_64( *(buf+8) ); \
|
||||
M[0x9] = mm256_byteswap_64( *(buf+9) ); \
|
||||
M[0xA] = mm256_byteswap_64( *(buf+10) ); \
|
||||
M[0xB] = mm256_byteswap_64( *(buf+11) ); \
|
||||
M[0xC] = mm256_byteswap_64( *(buf+12) ); \
|
||||
M[0xD] = mm256_byteswap_64( *(buf+13) ); \
|
||||
M[0xE] = mm256_byteswap_64( *(buf+14) ); \
|
||||
M[0xF] = mm256_byteswap_64( *(buf+15) ); \
|
||||
for (r = 0; r < 16; r ++) \
|
||||
ROUND_B_4WAY(r); \
|
||||
H0 = _mm256_xor_si256( _mm256_xor_si256( \
|
||||
@@ -787,22 +787,22 @@ do { \
|
||||
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
|
||||
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
|
||||
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
|
||||
M0 = mm256_byteswap_epi64( *(buf + 0) ); \
|
||||
M1 = mm256_byteswap_epi64( *(buf + 1) ); \
|
||||
M2 = mm256_byteswap_epi64( *(buf + 2) ); \
|
||||
M3 = mm256_byteswap_epi64( *(buf + 3) ); \
|
||||
M4 = mm256_byteswap_epi64( *(buf + 4) ); \
|
||||
M5 = mm256_byteswap_epi64( *(buf + 5) ); \
|
||||
M6 = mm256_byteswap_epi64( *(buf + 6) ); \
|
||||
M7 = mm256_byteswap_epi64( *(buf + 7) ); \
|
||||
M8 = mm256_byteswap_epi64( *(buf + 8) ); \
|
||||
M9 = mm256_byteswap_epi64( *(buf + 9) ); \
|
||||
MA = mm256_byteswap_epi64( *(buf + 10) ); \
|
||||
MB = mm256_byteswap_epi64( *(buf + 11) ); \
|
||||
MC = mm256_byteswap_epi64( *(buf + 12) ); \
|
||||
MD = mm256_byteswap_epi64( *(buf + 13) ); \
|
||||
ME = mm256_byteswap_epi64( *(buf + 14) ); \
|
||||
MF = mm256_byteswap_epi64( *(buf + 15) ); \
|
||||
M0 = mm256_byteswap_64( *(buf + 0) ); \
|
||||
M1 = mm256_byteswap_64( *(buf + 1) ); \
|
||||
M2 = mm256_byteswap_64( *(buf + 2) ); \
|
||||
M3 = mm256_byteswap_64( *(buf + 3) ); \
|
||||
M4 = mm256_byteswap_64( *(buf + 4) ); \
|
||||
M5 = mm256_byteswap_64( *(buf + 5) ); \
|
||||
M6 = mm256_byteswap_64( *(buf + 6) ); \
|
||||
M7 = mm256_byteswap_64( *(buf + 7) ); \
|
||||
M8 = mm256_byteswap_64( *(buf + 8) ); \
|
||||
M9 = mm256_byteswap_64( *(buf + 9) ); \
|
||||
MA = mm256_byteswap_64( *(buf + 10) ); \
|
||||
MB = mm256_byteswap_64( *(buf + 11) ); \
|
||||
MC = mm256_byteswap_64( *(buf + 12) ); \
|
||||
MD = mm256_byteswap_64( *(buf + 13) ); \
|
||||
ME = mm256_byteswap_64( *(buf + 14) ); \
|
||||
MF = mm256_byteswap_64( *(buf + 15) ); \
|
||||
ROUND_B_4WAY(0); \
|
||||
ROUND_B_4WAY(1); \
|
||||
ROUND_B_4WAY(2); \
|
||||
@@ -870,7 +870,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
|
||||
|
||||
if ( len < buf_size - ptr )
|
||||
{
|
||||
memcpy_m128i( buf + (ptr>>2), vdata, len>>2 );
|
||||
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
@@ -884,7 +884,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
|
||||
clen = buf_size - ptr;
|
||||
if (clen > len)
|
||||
clen = len;
|
||||
memcpy_m128i( buf + (ptr>>2), vdata, clen>>2 );
|
||||
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
|
||||
ptr += clen;
|
||||
vdata += (clen>>2);
|
||||
len -= clen;
|
||||
@@ -936,32 +936,32 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
|
||||
|
||||
if ( ptr <= 48 )
|
||||
{
|
||||
memset_zero_m128i( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
|
||||
if (out_size_w32 == 8)
|
||||
u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
|
||||
_mm_set_epi32( 0x010000000, 0x01000000,
|
||||
0x010000000, 0x01000000 ) );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
|
||||
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_m128i( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
|
||||
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
|
||||
sc->T0 = SPH_C32(0xFFFFFE00);
|
||||
sc->T1 = SPH_C32(0xFFFFFFFF);
|
||||
memset_zero_m128i( u.buf, 56>>2 );
|
||||
memset_zero_128( u.buf, 56>>2 );
|
||||
if (out_size_w32 == 8)
|
||||
u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
|
||||
0x010000000, 0x01000000 );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
|
||||
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
|
||||
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
|
||||
blake32_4way( sc, u.buf, 64 );
|
||||
}
|
||||
out = (__m128i*)dst;
|
||||
for ( k = 0; k < out_size_w32; k++ )
|
||||
out[k] = mm_byteswap_epi32( sc->H[k] );
|
||||
out[k] = mm_byteswap_32( sc->H[k] );
|
||||
}
|
||||
|
||||
#if defined (__AVX2__)
|
||||
@@ -995,7 +995,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
ptr = sc->ptr;
|
||||
if ( len < (buf_size - ptr) )
|
||||
{
|
||||
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
|
||||
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
|
||||
ptr += len;
|
||||
sc->ptr = ptr;
|
||||
return;
|
||||
@@ -1009,7 +1009,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
|
||||
clen = buf_size - ptr;
|
||||
if ( clen > len )
|
||||
clen = len;
|
||||
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
|
||||
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
|
||||
ptr += clen;
|
||||
vdata = vdata + (clen>>3);
|
||||
len -= clen;
|
||||
@@ -1062,44 +1062,44 @@ blake64_4way_close( blake_4way_big_context *sc,
|
||||
}
|
||||
if ( ptr <= 104 )
|
||||
{
|
||||
memset_zero_m256i( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
|
||||
_mm256_set_epi64x( 0x0100000000000000,
|
||||
0x0100000000000000,
|
||||
0x0100000000000000,
|
||||
0x0100000000000000 ) );
|
||||
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
|
||||
*(u.buf+(112>>3)) = mm256_byteswap_64(
|
||||
_mm256_set_epi64x( th, th, th, th ) );
|
||||
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
|
||||
*(u.buf+(120>>3)) = mm256_byteswap_64(
|
||||
_mm256_set_epi64x( tl, tl, tl, tl ) );
|
||||
|
||||
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
|
||||
}
|
||||
else
|
||||
{
|
||||
memset_zero_m256i( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
|
||||
|
||||
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
|
||||
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
|
||||
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
|
||||
memset_zero_m256i( u.buf, 112>>3 );
|
||||
memset_zero_256( u.buf, 112>>3 );
|
||||
if ( out_size_w64 == 8 )
|
||||
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
|
||||
0x0100000000000000,
|
||||
0x0100000000000000,
|
||||
0x0100000000000000 );
|
||||
|
||||
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
|
||||
*(u.buf+(112>>3)) = mm256_byteswap_64(
|
||||
_mm256_set_epi64x( th, th, th, th ) );
|
||||
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
|
||||
*(u.buf+(120>>3)) = mm256_byteswap_64(
|
||||
_mm256_set_epi64x( tl, tl, tl, tl ) );
|
||||
|
||||
blake64_4way( sc, u.buf, 128 );
|
||||
}
|
||||
out = (__m256i*)dst;
|
||||
for ( k = 0; k < out_size_w64; k++ )
|
||||
out[k] = mm256_byteswap_epi64( sc->H[k] );
|
||||
out[k] = mm256_byteswap_64( sc->H[k] );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user