mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.15.7
This commit is contained in:
@@ -66,6 +66,17 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
|
||||
a = _mm512_xor_si512(a,c0);\
|
||||
b = _mm512_xor_si512(b,c1);
|
||||
|
||||
// MULT24W( a0, a1 ): in-place update of a pair of __m512i values.
// The same operation is applied independently to each 128-bit lane
// (32-bit elements numbered 0..3 from the low end of the lane):
//   b  = a0 ^ { a1[0], a1[0], 0, a1[0] }
//          zero-masked shuffle: control 16 selects a1 elements 0,0,1,0 and
//          mask 0xbbbb (binary 1011 per lane) clears element 2.
//   a0 = ( b  shifted right 4 bytes ) | ( a1 shifted left 12 bytes )
//   a1 = ( a1 shifted right 4 bytes ) | ( b  shifted left 12 bytes )
// a0 is overwritten before a1, and both assignments read the original a1.
// Both arguments are expanded more than once and are assigned to, so they
// must be side-effect-free lvalues.
#define MULT24W( a0, a1 ) \
|
||||
do { \
|
||||
__m512i b = _mm512_xor_si512( a0, \
|
||||
_mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
|
||||
a0 = _mm512_or_si512( _mm512_bsrli_epi128( b, 4 ), \
|
||||
_mm512_bslli_epi128( a1,12 ) ); \
|
||||
a1 = _mm512_or_si512( _mm512_bsrli_epi128( a1, 4 ), \
|
||||
_mm512_bslli_epi128( b,12 ) ); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
#define MULT24W( a0, a1, mask ) \
|
||||
do { \
|
||||
__m512i b = _mm512_xor_si512( a0, \
|
||||
@@ -73,6 +84,7 @@ do { \
|
||||
a0 = _mm512_or_si512( _mm512_bsrli_epi128(b,4), _mm512_bslli_epi128(a1,12) );\
|
||||
a1 = _mm512_or_si512( _mm512_bsrli_epi128(a1,4), _mm512_bslli_epi128(b,12) );\
|
||||
} while(0)
|
||||
*/
|
||||
|
||||
// confirm pointer arithmetic
|
||||
// ok but use array indexes
|
||||
@@ -235,7 +247,6 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
__m512i msg0, msg1;
|
||||
__m512i tmp[2];
|
||||
__m512i x[8];
|
||||
const __m512i MASK = m512_const2_64( 0, 0x00000000ffffffff );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
@@ -249,7 +260,7 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
t0 = _mm512_xor_si512( t0, chainv[8] );
|
||||
t1 = _mm512_xor_si512( t1, chainv[9] );
|
||||
|
||||
MULT24W( t0, t1, MASK );
|
||||
MULT24W( t0, t1 );
|
||||
|
||||
msg0 = _mm512_shuffle_epi32( msg[0], 27 );
|
||||
msg1 = _mm512_shuffle_epi32( msg[1], 27 );
|
||||
@@ -268,68 +279,67 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
|
||||
MULT24W( chainv[0], chainv[1], MASK );
|
||||
MULT24W( chainv[0], chainv[1] );
|
||||
chainv[0] = _mm512_xor_si512( chainv[0], chainv[2] );
|
||||
chainv[1] = _mm512_xor_si512( chainv[1], chainv[3] );
|
||||
|
||||
MULT24W( chainv[2], chainv[3], MASK );
|
||||
MULT24W( chainv[2], chainv[3] );
|
||||
chainv[2] = _mm512_xor_si512(chainv[2], chainv[4]);
|
||||
chainv[3] = _mm512_xor_si512(chainv[3], chainv[5]);
|
||||
|
||||
MULT24W( chainv[4], chainv[5], MASK );
|
||||
MULT24W( chainv[4], chainv[5] );
|
||||
chainv[4] = _mm512_xor_si512(chainv[4], chainv[6]);
|
||||
chainv[5] = _mm512_xor_si512(chainv[5], chainv[7]);
|
||||
|
||||
MULT24W( chainv[6], chainv[7], MASK );
|
||||
MULT24W( chainv[6], chainv[7] );
|
||||
chainv[6] = _mm512_xor_si512(chainv[6], chainv[8]);
|
||||
chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);
|
||||
|
||||
MULT24W( chainv[8], chainv[9], MASK );
|
||||
MULT24W( chainv[8], chainv[9] );
|
||||
chainv[8] = _mm512_xor_si512( chainv[8], t0 );
|
||||
chainv[9] = _mm512_xor_si512( chainv[9], t1 );
|
||||
|
||||
t0 = chainv[8];
|
||||
t1 = chainv[9];
|
||||
|
||||
MULT24W( chainv[8], chainv[9], MASK );
|
||||
MULT24W( chainv[8], chainv[9] );
|
||||
chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
|
||||
chainv[9] = _mm512_xor_si512( chainv[9], chainv[7] );
|
||||
|
||||
MULT24W( chainv[6], chainv[7], MASK );
|
||||
MULT24W( chainv[6], chainv[7] );
|
||||
chainv[6] = _mm512_xor_si512( chainv[6], chainv[4] );
|
||||
chainv[7] = _mm512_xor_si512( chainv[7], chainv[5] );
|
||||
|
||||
MULT24W( chainv[4], chainv[5], MASK );
|
||||
MULT24W( chainv[4], chainv[5] );
|
||||
chainv[4] = _mm512_xor_si512( chainv[4], chainv[2] );
|
||||
chainv[5] = _mm512_xor_si512( chainv[5], chainv[3] );
|
||||
|
||||
MULT24W( chainv[2], chainv[3], MASK );
|
||||
MULT24W( chainv[2], chainv[3] );
|
||||
chainv[2] = _mm512_xor_si512( chainv[2], chainv[0] );
|
||||
chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );
|
||||
|
||||
MULT24W( chainv[0], chainv[1], MASK );
|
||||
MULT24W( chainv[0], chainv[1] );
|
||||
chainv[0] = _mm512_xor_si512( _mm512_xor_si512( chainv[0], t0 ), msg0 );
|
||||
chainv[1] = _mm512_xor_si512( _mm512_xor_si512( chainv[1], t1 ), msg1 );
|
||||
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
|
||||
chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
|
||||
chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
MULT24W( msg0, msg1 );
|
||||
chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
|
||||
chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
MULT24W( msg0, msg1);
|
||||
chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
|
||||
chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
|
||||
|
||||
MULT24W( msg0, msg1, MASK );
|
||||
MULT24W( msg0, msg1 );
|
||||
|
||||
// replace with ror
|
||||
chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
|
||||
chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
|
||||
chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
|
||||
@@ -496,7 +506,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
|
||||
{
|
||||
// remaining data bytes
|
||||
buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
|
||||
buffer[1] = m512_const2_64( 0, 0x0000000080000000 );
|
||||
buffer[1] = m512_const1_i128( 0x0000000080000000 );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -520,7 +530,7 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
|
||||
rnd512_4way( state, buffer );
|
||||
else
|
||||
{ // empty pad block, constant data
|
||||
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
|
||||
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = m512_zero;
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
@@ -583,13 +593,13 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m512_const2_64( 0, 0x0000000080000000 );
|
||||
msg[1] = m512_const1_i128( 0x0000000080000000 );
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
|
||||
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = m512_zero;
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
@@ -631,13 +641,13 @@ int luffa_4way_update_close( luffa_4way_context *state,
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m512_const2_64( 0, 0x0000000080000000 );
|
||||
msg[1] = m512_const1_i128( 0x0000000080000000 );
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m512_const2_64( 0, 0x0000000080000000 );
|
||||
msg[0] = m512_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = m512_zero;
|
||||
rnd512_4way( state, msg );
|
||||
}
|
||||
@@ -832,7 +842,7 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
|
||||
__m256i msg0, msg1;
|
||||
__m256i tmp[2];
|
||||
__m256i x[8];
|
||||
const __m256i MASK = m256_const2_64( 0, 0x00000000ffffffff );
|
||||
const __m256i MASK = m256_const1_i128( 0x00000000ffffffff );
|
||||
|
||||
t0 = chainv[0];
|
||||
t1 = chainv[1];
|
||||
@@ -1088,7 +1098,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
|
||||
{
|
||||
// remaining data bytes
|
||||
buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
|
||||
buffer[1] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
buffer[1] = m256_const1_i128( 0x0000000080000000 );
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -1104,7 +1114,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
|
||||
rnd512_2way( state, buffer );
|
||||
else
|
||||
{ // empty pad block, constant data
|
||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[0] = m256_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
@@ -1159,13 +1169,13 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[1] = m256_const1_i128( 0x0000000080000000 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[0] = m256_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
@@ -1206,13 +1216,13 @@ int luffa_2way_update_close( luffa_2way_context *state,
|
||||
{
|
||||
// padding of partial block
|
||||
msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
|
||||
msg[1] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[1] = m256_const1_i128( 0x0000000080000000 );
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
else
|
||||
{
|
||||
// empty pad block
|
||||
msg[0] = m256_const2_64( 0, 0x0000000080000000 );
|
||||
msg[0] = m256_const1_i128( 0x0000000080000000 );
|
||||
msg[1] = m256_zero;
|
||||
rnd512_2way( state, msg );
|
||||
}
|
||||
|
@@ -23,7 +23,7 @@
|
||||
#include "simd-utils.h"
|
||||
#include "luffa_for_sse2.h"
|
||||
|
||||
#define MULT2(a0,a1) do \
|
||||
#define MULT2( a0, a1 ) do \
|
||||
{ \
|
||||
__m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128(a1,MASK), 16 ) ); \
|
||||
a0 = _mm_or_si128( _mm_srli_si128(b,4), _mm_slli_si128(a1,12) ); \
|
||||
@@ -345,11 +345,11 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
// padding of partial block
|
||||
rnd512( state, m128_const_64( 0, 0x80000000 ),
|
||||
rnd512( state, m128_const_i128( 0x80000000 ),
|
||||
mm128_bswap_32( cast_m128i( data ) ) );
|
||||
else
|
||||
// empty pad block
|
||||
rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
|
||||
rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) );
|
||||
|
||||
finalization512( state, (uint32*) output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
@@ -394,11 +394,11 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
|
||||
// 16 byte partial block exists for 80 byte len
|
||||
if ( state->rembytes )
|
||||
// padding of partial block
|
||||
rnd512( state, m128_const_64( 0, 0x80000000 ),
|
||||
rnd512( state, m128_const_i128( 0x80000000 ),
|
||||
mm128_bswap_32( cast_m128i( data ) ) );
|
||||
else
|
||||
// empty pad block
|
||||
rnd512( state, m128_zero, m128_const_64( 0, 0x80000000 ) );
|
||||
rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) );
|
||||
|
||||
finalization512( state, (uint32*) output );
|
||||
if ( state->hashbitlen > 512 )
|
||||
@@ -606,7 +606,6 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
|
||||
casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
// casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||
|
||||
rnd512( state, zero, zero );
|
||||
|
||||
@@ -621,7 +620,6 @@ static void finalization512( hashState_luffa *state, uint32 *b )
|
||||
|
||||
casti_m256i( b, 1 ) = _mm256_shuffle_epi8(
|
||||
casti_m256i( hash, 0 ), shuff_bswap32 );
|
||||
// casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
|
||||
}
|
||||
|
||||
#else
|
||||
|
Reference in New Issue
Block a user