This commit is contained in:
Jay D Dee
2017-12-08 15:39:28 -05:00
parent 4b57ac0eb9
commit af1c940919
53 changed files with 1324 additions and 4790 deletions

View File

@@ -536,22 +536,22 @@ do { \
, _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
_mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
M[0x0] = mm_byteswap_epi32( *(buf + 0) ); \
M[0x1] = mm_byteswap_epi32( *(buf + 1) ); \
M[0x2] = mm_byteswap_epi32( *(buf + 2) ); \
M[0x3] = mm_byteswap_epi32( *(buf + 3) ); \
M[0x4] = mm_byteswap_epi32( *(buf + 4) ); \
M[0x5] = mm_byteswap_epi32( *(buf + 5) ); \
M[0x6] = mm_byteswap_epi32( *(buf + 6) ); \
M[0x7] = mm_byteswap_epi32( *(buf + 7) ); \
M[0x8] = mm_byteswap_epi32( *(buf + 8) ); \
M[0x9] = mm_byteswap_epi32( *(buf + 9) ); \
M[0xA] = mm_byteswap_epi32( *(buf + 10) ); \
M[0xB] = mm_byteswap_epi32( *(buf + 11) ); \
M[0xC] = mm_byteswap_epi32( *(buf + 12) ); \
M[0xD] = mm_byteswap_epi32( *(buf + 13) ); \
M[0xE] = mm_byteswap_epi32( *(buf + 14) ); \
M[0xF] = mm_byteswap_epi32( *(buf + 15) ); \
M[0x0] = mm_byteswap_32( *(buf + 0) ); \
M[0x1] = mm_byteswap_32( *(buf + 1) ); \
M[0x2] = mm_byteswap_32( *(buf + 2) ); \
M[0x3] = mm_byteswap_32( *(buf + 3) ); \
M[0x4] = mm_byteswap_32( *(buf + 4) ); \
M[0x5] = mm_byteswap_32( *(buf + 5) ); \
M[0x6] = mm_byteswap_32( *(buf + 6) ); \
M[0x7] = mm_byteswap_32( *(buf + 7) ); \
M[0x8] = mm_byteswap_32( *(buf + 8) ); \
M[0x9] = mm_byteswap_32( *(buf + 9) ); \
M[0xA] = mm_byteswap_32( *(buf + 10) ); \
M[0xB] = mm_byteswap_32( *(buf + 11) ); \
M[0xC] = mm_byteswap_32( *(buf + 12) ); \
M[0xD] = mm_byteswap_32( *(buf + 13) ); \
M[0xE] = mm_byteswap_32( *(buf + 14) ); \
M[0xF] = mm_byteswap_32( *(buf + 15) ); \
for (r = 0; r < BLAKE32_ROUNDS; r ++) \
ROUND_S_4WAY(r); \
H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -601,22 +601,22 @@ do { \
_mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
_mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
M0 = mm_byteswap_epi32( * buf ); \
M1 = mm_byteswap_epi32( *(buf+1) ); \
M2 = mm_byteswap_epi32( *(buf+2) ); \
M3 = mm_byteswap_epi32( *(buf+3) ); \
M4 = mm_byteswap_epi32( *(buf+4) ); \
M5 = mm_byteswap_epi32( *(buf+5) ); \
M6 = mm_byteswap_epi32( *(buf+6) ); \
M7 = mm_byteswap_epi32( *(buf+7) ); \
M8 = mm_byteswap_epi32( *(buf+8) ); \
M9 = mm_byteswap_epi32( *(buf+9) ); \
MA = mm_byteswap_epi32( *(buf+10) ); \
MB = mm_byteswap_epi32( *(buf+11) ); \
MC = mm_byteswap_epi32( *(buf+12) ); \
MD = mm_byteswap_epi32( *(buf+13) ); \
ME = mm_byteswap_epi32( *(buf+14) ); \
MF = mm_byteswap_epi32( *(buf+15) ); \
M0 = mm_byteswap_32( * buf ); \
M1 = mm_byteswap_32( *(buf+1) ); \
M2 = mm_byteswap_32( *(buf+2) ); \
M3 = mm_byteswap_32( *(buf+3) ); \
M4 = mm_byteswap_32( *(buf+4) ); \
M5 = mm_byteswap_32( *(buf+5) ); \
M6 = mm_byteswap_32( *(buf+6) ); \
M7 = mm_byteswap_32( *(buf+7) ); \
M8 = mm_byteswap_32( *(buf+8) ); \
M9 = mm_byteswap_32( *(buf+9) ); \
MA = mm_byteswap_32( *(buf+10) ); \
MB = mm_byteswap_32( *(buf+11) ); \
MC = mm_byteswap_32( *(buf+12) ); \
MD = mm_byteswap_32( *(buf+13) ); \
ME = mm_byteswap_32( *(buf+14) ); \
MF = mm_byteswap_32( *(buf+15) ); \
ROUND_S_4WAY(0); \
ROUND_S_4WAY(1); \
ROUND_S_4WAY(2); \
@@ -722,22 +722,22 @@ do { \
_mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
_mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
M[0x0] = mm256_byteswap_epi64( *(buf+0) ); \
M[0x1] = mm256_byteswap_epi64( *(buf+1) ); \
M[0x2] = mm256_byteswap_epi64( *(buf+2) ); \
M[0x3] = mm256_byteswap_epi64( *(buf+3) ); \
M[0x4] = mm256_byteswap_epi64( *(buf+4) ); \
M[0x5] = mm256_byteswap_epi64( *(buf+5) ); \
M[0x6] = mm256_byteswap_epi64( *(buf+6) ); \
M[0x7] = mm256_byteswap_epi64( *(buf+7) ); \
M[0x8] = mm256_byteswap_epi64( *(buf+8) ); \
M[0x9] = mm256_byteswap_epi64( *(buf+9) ); \
M[0xA] = mm256_byteswap_epi64( *(buf+10) ); \
M[0xB] = mm256_byteswap_epi64( *(buf+11) ); \
M[0xC] = mm256_byteswap_epi64( *(buf+12) ); \
M[0xD] = mm256_byteswap_epi64( *(buf+13) ); \
M[0xE] = mm256_byteswap_epi64( *(buf+14) ); \
M[0xF] = mm256_byteswap_epi64( *(buf+15) ); \
M[0x0] = mm256_byteswap_64( *(buf+0) ); \
M[0x1] = mm256_byteswap_64( *(buf+1) ); \
M[0x2] = mm256_byteswap_64( *(buf+2) ); \
M[0x3] = mm256_byteswap_64( *(buf+3) ); \
M[0x4] = mm256_byteswap_64( *(buf+4) ); \
M[0x5] = mm256_byteswap_64( *(buf+5) ); \
M[0x6] = mm256_byteswap_64( *(buf+6) ); \
M[0x7] = mm256_byteswap_64( *(buf+7) ); \
M[0x8] = mm256_byteswap_64( *(buf+8) ); \
M[0x9] = mm256_byteswap_64( *(buf+9) ); \
M[0xA] = mm256_byteswap_64( *(buf+10) ); \
M[0xB] = mm256_byteswap_64( *(buf+11) ); \
M[0xC] = mm256_byteswap_64( *(buf+12) ); \
M[0xD] = mm256_byteswap_64( *(buf+13) ); \
M[0xE] = mm256_byteswap_64( *(buf+14) ); \
M[0xF] = mm256_byteswap_64( *(buf+15) ); \
for (r = 0; r < 16; r ++) \
ROUND_B_4WAY(r); \
H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -787,22 +787,22 @@ do { \
_mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
_mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
M0 = mm256_byteswap_epi64( *(buf + 0) ); \
M1 = mm256_byteswap_epi64( *(buf + 1) ); \
M2 = mm256_byteswap_epi64( *(buf + 2) ); \
M3 = mm256_byteswap_epi64( *(buf + 3) ); \
M4 = mm256_byteswap_epi64( *(buf + 4) ); \
M5 = mm256_byteswap_epi64( *(buf + 5) ); \
M6 = mm256_byteswap_epi64( *(buf + 6) ); \
M7 = mm256_byteswap_epi64( *(buf + 7) ); \
M8 = mm256_byteswap_epi64( *(buf + 8) ); \
M9 = mm256_byteswap_epi64( *(buf + 9) ); \
MA = mm256_byteswap_epi64( *(buf + 10) ); \
MB = mm256_byteswap_epi64( *(buf + 11) ); \
MC = mm256_byteswap_epi64( *(buf + 12) ); \
MD = mm256_byteswap_epi64( *(buf + 13) ); \
ME = mm256_byteswap_epi64( *(buf + 14) ); \
MF = mm256_byteswap_epi64( *(buf + 15) ); \
M0 = mm256_byteswap_64( *(buf + 0) ); \
M1 = mm256_byteswap_64( *(buf + 1) ); \
M2 = mm256_byteswap_64( *(buf + 2) ); \
M3 = mm256_byteswap_64( *(buf + 3) ); \
M4 = mm256_byteswap_64( *(buf + 4) ); \
M5 = mm256_byteswap_64( *(buf + 5) ); \
M6 = mm256_byteswap_64( *(buf + 6) ); \
M7 = mm256_byteswap_64( *(buf + 7) ); \
M8 = mm256_byteswap_64( *(buf + 8) ); \
M9 = mm256_byteswap_64( *(buf + 9) ); \
MA = mm256_byteswap_64( *(buf + 10) ); \
MB = mm256_byteswap_64( *(buf + 11) ); \
MC = mm256_byteswap_64( *(buf + 12) ); \
MD = mm256_byteswap_64( *(buf + 13) ); \
ME = mm256_byteswap_64( *(buf + 14) ); \
MF = mm256_byteswap_64( *(buf + 15) ); \
ROUND_B_4WAY(0); \
ROUND_B_4WAY(1); \
ROUND_B_4WAY(2); \
@@ -870,7 +870,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
if ( len < buf_size - ptr )
{
memcpy_m128i( buf + (ptr>>2), vdata, len>>2 );
memcpy_128( buf + (ptr>>2), vdata, len>>2 );
ptr += len;
sc->ptr = ptr;
return;
@@ -884,7 +884,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
clen = buf_size - ptr;
if (clen > len)
clen = len;
memcpy_m128i( buf + (ptr>>2), vdata, clen>>2 );
memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
ptr += clen;
vdata += (clen>>2);
len -= clen;
@@ -936,32 +936,32 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
if ( ptr <= 48 )
{
memset_zero_m128i( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
/* 8-word output only: OR the marker bit 0x01000000 into the 32-bit word at
   byte offset 52 of the padding block, in each of the four interleaved
   lanes.  Every lane must receive the same mask.  The previous literal
   0x010000000 has nine hex digits (== 0x10000000), so lanes 3 and 1 had a
   different bit set than lanes 2 and 0. */
if (out_size_w32 == 8)
   u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
                                _mm_set_epi32( 0x01000000, 0x01000000,
                                               0x01000000, 0x01000000 ) );
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
}
else
{
memset_zero_m128i( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
sc->T0 = SPH_C32(0xFFFFFE00);
sc->T1 = SPH_C32(0xFFFFFFFF);
memset_zero_m128i( u.buf, 56>>2 );
memset_zero_128( u.buf, 56>>2 );
/* 8-word output only: the buffer was just zeroed, so the marker word at
   byte offset 52 is assigned rather than ORed.  All four lanes must get
   the identical value 0x01000000.  The previous literal 0x010000000 has
   nine hex digits (== 0x10000000), which gave lanes 3 and 1 a different
   value than lanes 2 and 0. */
if (out_size_w32 == 8)
   u.buf[52>>2] = _mm_set_epi32( 0x01000000, 0x01000000,
                                 0x01000000, 0x01000000 );
*(u.buf+(56>>2)) = mm_byteswap_epi32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_epi32( _mm_set_epi32( tl, tl, tl, tl ) );
*(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
*(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
blake32_4way( sc, u.buf, 64 );
}
out = (__m128i*)dst;
for ( k = 0; k < out_size_w32; k++ )
out[k] = mm_byteswap_epi32( sc->H[k] );
out[k] = mm_byteswap_32( sc->H[k] );
}
#if defined (__AVX2__)
@@ -995,7 +995,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
ptr = sc->ptr;
if ( len < (buf_size - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
memcpy_256( buf + (ptr>>3), vdata, len>>3 );
ptr += len;
sc->ptr = ptr;
return;
@@ -1009,7 +1009,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
@@ -1062,44 +1062,44 @@ blake64_4way_close( blake_4way_big_context *sc,
}
if ( ptr <= 104 )
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
if ( out_size_w64 == 8 )
u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
_mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 ) );
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
*(u.buf+(120>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
}
else
{
memset_zero_m256i( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
memset_zero_m256i( u.buf, 112>>3 );
memset_zero_256( u.buf, 112>>3 );
if ( out_size_w64 == 8 )
u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
0x0100000000000000,
0x0100000000000000,
0x0100000000000000 );
*(u.buf+(112>>3)) = mm256_byteswap_epi64(
*(u.buf+(112>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( th, th, th, th ) );
*(u.buf+(120>>3)) = mm256_byteswap_epi64(
*(u.buf+(120>>3)) = mm256_byteswap_64(
_mm256_set_epi64x( tl, tl, tl, tl ) );
blake64_4way( sc, u.buf, 128 );
}
out = (__m256i*)dst;
for ( k = 0; k < out_size_w64; k++ )
out[k] = mm256_byteswap_epi64( sc->H[k] );
out[k] = mm256_byteswap_64( sc->H[k] );
}
#endif