mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.11.0
This commit is contained in:
@@ -2283,7 +2283,46 @@ static inline void rintrlv_8x32_8x64( void *dst,
|
||||
d[63] = _mm_unpackhi_epi32( s[61], s[63] );
|
||||
}
|
||||
|
||||
// 8x32 -> 4x128
|
||||
|
||||
// 16 bytes per lane
|
||||
// Re-interleave one 16-byte chunk per 128-bit destination lane:
// gathers 8-way 32-bit interleaved data into two 4-way 128-bit groups.
// Mapping: d0[4c+r] = s[8r+c], d1[4c+r] = s[8r+c+4] for r,c in 0..3.
#define RLEAVE_8X32_4X128( i ) \
do { \
   uint32_t *d0_ = (uint32_t*)dst0 + (i); \
   uint32_t *d1_ = (uint32_t*)dst1 + (i); \
   const uint32_t *s_ = (const uint32_t*)src + ((i)<<1); \
   for ( int r_ = 0; r_ < 4; r_++ ) \
      for ( int c_ = 0; c_ < 4; c_++ ) \
      { \
         d0_[ 4*c_ + r_ ] = s_[ 8*r_ + c_     ]; \
         d1_[ 4*c_ + r_ ] = s_[ 8*r_ + c_ + 4 ]; \
      } \
} while(0)
|
||||
|
||||
// Convert 8-way 32-bit interleaved data (src) into two 4-way 128-bit
// interleaved groups (dst0, dst1). bit_len selects how many bits per
// lane are converted: 256, 512, or the full 1024.
static inline void rintrlv_8x32_4x128( void *dst0, void *dst1,
                                     const void *src, const int bit_len )
{
   RLEAVE_8X32_4X128(   0 );   RLEAVE_8X32_4X128(  16 );
   if ( bit_len > 256 )
   {
      RLEAVE_8X32_4X128( 32 );   RLEAVE_8X32_4X128( 48 );
      if ( bit_len > 512 )
      {
         RLEAVE_8X32_4X128( 64 );   RLEAVE_8X32_4X128( 80 );
         RLEAVE_8X32_4X128( 96 );   RLEAVE_8X32_4X128( 112 );
      }
   }
}
|
||||
#undef RLEAVE_8X32_4X128
|
||||
|
||||
/*
|
||||
#define RLEAVE_4x32_4x64(i) do \
|
||||
|
||||
@@ -42,17 +42,18 @@ static inline __m256i m256_const_64( const uint64_t i3, const uint64_t i2,
|
||||
return mm256_concat_128( hi, lo );
|
||||
}
|
||||
|
||||
// Broadcast 128 bits in pairs of 64 bit integer constants { i1, i0 } to all
// 128 bit lanes.
// NOTE(review): this file later redefines m256_const2_64 with a different
// expansion; this older version should be removed (or #undef'd before the
// new definition) to avoid a conflicting-redefinition error — confirm
// against the full file.
#define m256_const2_64( i1, i0 ) \
   _mm256_permute4x64_epi64( _mm256_castsi128_si256( \
                             m128_const_64( i1, i0 ) ), 0x44 )
|
||||
|
||||
// Equivalent of set1: broadcast an integer constant to all elements.
// Fix: "m256_const1_8 ( i )" had a space before the parameter list, which
// made it an OBJECT-like macro whose expansion begins with "( i )" — any
// use of m256_const1_8( x ) would expand to "( i ) _mm256_broadcastb_epi8
// ( mm128_mov32_128( i ) )( x )" and fail to compile. Duplicate
// definitions (diff residue) are also collapsed to a single set.
#define m256_const1_64( i )  _mm256_broadcastq_epi64( mm128_mov64_128( i ) )
#define m256_const1_32( i )  _mm256_broadcastd_epi32( mm128_mov32_128( i ) )
#define m256_const1_16( i )  _mm256_broadcastw_epi16( mm128_mov32_128( i ) )
#define m256_const1_8( i )   _mm256_broadcastb_epi8 ( mm128_mov32_128( i ) )

// Broadcast a 128-bit vector to both 128-bit lanes.
#define m256_const1_128( v ) _mm256_broadcastsi128_si256( v )
|
||||
|
||||
// Defensive: an earlier (superseded) definition of m256_const2_64 may
// still exist in this file; drop it before redefining.
#undef m256_const2_64

// Broadcast a 128-bit constant assembled from two 64-bit integer
// constants { i1, i0 } to both 128-bit lanes.
#define m256_const2_64( i1, i0 ) \
   m256_const1_128( m128_const_64( i1, i0 ) )

// Broadcast a 64-bit constant assembled from two 32-bit values { i1, i0 }
// to all 64-bit elements.
#define m256_const2_32( i1, i0 ) \
   m256_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) )

// Fix: the original name contained a typo ("m126"); kept as an alias so
// existing callers of the typo'ed name still compile.
#define m126_const2_32( i1, i0 ) m256_const2_32( i1, i0 )
|
||||
|
||||
|
||||
//
|
||||
|
||||
@@ -38,6 +38,36 @@
|
||||
// shuffle_epi8 shuffles across entire 512 bits. Shuffle usually
|
||||
// doesn't cross 128 bit lane boundaries but is consistent with AVX2
|
||||
// where shuffle_epi8 spans the entire vector.
|
||||
//
|
||||
// There are 2 areas where overhead is a concern: constants and
|
||||
// permutations.
|
||||
//
|
||||
// Constants need to be composed at run time by assembling individual
|
||||
// elements, very expensive. The cost is proportional to the number of
|
||||
// elements, therefore use the largest element size possible, even by
|
||||
// merging smaller values.
|
||||
//
|
||||
// Constants with repeating patterns can be optimized with the smaller
|
||||
// patterns repeated more frequently being more efficient.
|
||||
//
|
||||
// Some specific constants can be very efficient. Zero is very efficient,
|
||||
// 1 and -1 slightly less so.
|
||||
//
|
||||
// If an expensive constant is to be reused in the same function it should
|
||||
// be declared as a local variable defined once and reused.
|
||||
//
|
||||
// Permutations can be very expensive if they use a vector control index,
|
||||
// even if the permutation itself is quite efficient.
|
||||
// The index is essentially a constant with all the baggage that brings.
|
||||
// The same rules apply, if an index is to be reused it should be defined
|
||||
// as a local. This applies specifically to bswap operations.
|
||||
//
|
||||
// Additionally, permutations using smaller vectors can be more efficient
|
||||
// if the permutation doesn't cross lane boundaries, typically 128 bits,
|
||||
// and the smaller vector can use an imm control.
|
||||
//
|
||||
// If the permutation doesn't cross lane boundaries a shuffle instructions
|
||||
// can be used with imm control instead of permute.
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
//
|
||||
@@ -106,12 +136,14 @@ static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
|
||||
// Equivalent of set1: broadcast a small integer constant to all 16-bit /
// 8-bit elements of a 512-bit vector.
#define m512_const1_16( i ) _mm512_broadcastw_epi16( mm128_mov32_128( i ) )
#define m512_const1_8( i )  _mm512_broadcastb_epi8 ( mm128_mov32_128( i ) )
|
||||
|
||||
// Broadcast the 128-bit pair { v1, v0 } to all 256-bit lanes.
// Fix: the expansion referenced undefined names "lo" and "hi" instead of
// the macro's actual parameters v0 and v1, so any use failed to compile.
#define m512_const2_128( v1, v0 ) \
   m512_const1_256( _mm512_inserti64x2( _mm512_castsi128_si512( v0 ), v1, 1 ) )
|
||||
|
||||
// Broadcast a 128-bit constant assembled from two 64-bit integer
// constants { i1, i0 } to all 128-bit lanes.
#define m512_const2_64( i1, i0 ) \
   m512_const1_128( m128_const_64( i1, i0 ) )
|
||||
|
||||
// Broadcast a 64-bit constant assembled from two 32-bit values { i1, i0 }
// to all 64-bit elements of a 512-bit vector.
// Fix: diff residue left a second, dangling copy of the expansion after
// the macro, which would be a stray file-scope expression; collapsed to a
// single clean definition (the redundant inner parentheses are dropped).
#define m512_const2_32( i1, i0 ) \
   m512_const1_64( ( (uint64_t)(i1) << 32 ) | ( (uint64_t)(i0) & 0xffffffff ) )
|
||||
|
||||
// { m128_1, m128_1, m128_0, m128_0 }
|
||||
#define m512_const_2x128( v1, v0 ) \
|
||||
|
||||
Reference in New Issue
Block a user