cpuminer-opt v3.19.9
Mirror of https://github.com/JayDDee/cpuminer-opt.git, synced 2025-09-17 23:44:27 +00:00
@@ -2654,6 +2654,10 @@ static inline void intrlv_2x128( void *dst, const void *src0,
   d[10] = s0[5];   d[11] = s1[5];
   d[12] = s0[6];   d[13] = s1[6];
   d[14] = s0[7];   d[15] = s1[7];
   if ( bit_len <= 1024 ) return;
   d[16] = s0[8];   d[17] = s1[8];
   d[18] = s0[9];   d[19] = s1[9];
// if ( bit_len <= 1280 ) return;
}

static inline void intrlv_2x128_512( void *dst, const void *src0,
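The hunk above extends the unrolled interleave past 1024 bits. As a reference for the pattern, here is a minimal hedged sketch of 2-way 128-bit interleaving; it is not the repo's intrlv_2x128 (which works on raw lane arrays and is fully unrolled), and the __m128i lane type and loop form are assumptions for illustration.

#include <immintrin.h>

// Sketch: pack lane i of each source into adjacent destination slots so a
// 2-way SIMD hash kernel can process both inputs in one pass.
static inline void intrlv_2x128_sketch( __m128i *d, const __m128i *s0,
                                        const __m128i *s1, const int bit_len )
{
   const int lanes = bit_len / 128;    // 128-bit lanes per source
   for ( int i = 0; i < lanes; i++ )
   {
      d[ 2*i   ] = s0[i];              // even slots from source 0
      d[ 2*i+1 ] = s1[i];              // odd slots from source 1
   }
}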
@@ -2721,6 +2725,10 @@ static inline void intrlv_4x128( void *dst, const void *src0,
   d[20] = s0[5];   d[21] = s1[5];   d[22] = s2[5];   d[23] = s3[5];
   d[24] = s0[6];   d[25] = s1[6];   d[26] = s2[6];   d[27] = s3[6];
   d[28] = s0[7];   d[29] = s1[7];   d[30] = s2[7];   d[31] = s3[7];
   if ( bit_len <= 1024 ) return;
   d[32] = s0[8];   d[33] = s1[8];   d[34] = s2[8];   d[35] = s3[8];
   d[36] = s0[9];   d[37] = s1[9];   d[38] = s2[9];   d[39] = s3[9];
// if ( bit_len <= 1280 ) return;
}

static inline void intrlv_4x128_512( void *dst, const void *src0,
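Same idea for the 4-way version touched by this hunk: the d[20]..d[39] statements above are lanes 5 through 9 of this pattern with a destination stride of 4. Again a hedged sketch, not the repo's unrolled intrlv_4x128.

#include <immintrin.h>

static inline void intrlv_4x128_sketch( __m128i *d, const __m128i *s0,
                                        const __m128i *s1, const __m128i *s2,
                                        const __m128i *s3, const int bit_len )
{
   const int lanes = bit_len / 128;
   for ( int i = 0; i < lanes; i++ )
   {
      d[ 4*i   ] = s0[i];              // one lane from each source,
      d[ 4*i+1 ] = s1[i];              // packed into 4 adjacent slots
      d[ 4*i+2 ] = s2[i];
      d[ 4*i+3 ] = s3[i];
   }
}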
@@ -411,7 +411,8 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )

// Limited 2 input shuffle
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
   _mm_castpd_si128( _mm_shuffle_pd( _mm_castsi128_pd( a ), \
                                     _mm_castsi128_pd( b ), c ) );
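A hedged usage sketch of mm128_shuffle2_64 as defined above; the demo function and values are illustrative only. Control bit 0 selects which 64-bit half of a lands in the result's low half, bit 1 selects which half of b lands in the high half.

#include <immintrin.h>

// Illustrative only: with c = 1 (bit 0 set, bit 1 clear) the result takes
// a's high qword into its low half and b's low qword into its high half.
static inline __m128i shuffle2_64_demo( void )
{
   __m128i a = _mm_set_epi64x( 0xA1, 0xA0 );   // a = { hi: 0xA1, lo: 0xA0 }
   __m128i b = _mm_set_epi64x( 0xB1, 0xB0 );   // b = { hi: 0xB1, lo: 0xB0 }
   return mm128_shuffle2_64( a, b, 1 );        // = { hi: 0xB0, lo: 0xA1 }
}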
@@ -442,8 +442,14 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32

//
// Swap bytes in vector elements, endian bswap.
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask can be used if
// needed for a shuffle that crosses 128 bit lanes. BSWAP doesn't, therefore the
// AVX2 version will work here. The bswap control vector is coded to work
// with both versions; bit 4 is ignored in AVX2.

// Reverse byte order in elements, endian bswap.
#define mm256_bswap_64( v ) \
   _mm256_shuffle_epi8( v, \
      m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
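The hunk context cuts off inside the bswap control-vector constant, so nothing is filled in above. As a self-contained illustration of the "bit 4 is ignored in AVX2" remark, here is a hedged AVX2-only sketch of the same 64-bit byte swap using _mm256_set_epi8 instead of the repo's m256_const_64 helper; the function name is illustrative.

#include <immintrin.h>

// Each control byte reverses byte order within its own 64-bit element.
// Indices for the upper 128-bit lane carry bit 4 set (0x10..0x1f); AVX2's
// _mm256_shuffle_epi8 ignores that bit because it only shuffles within
// 128-bit lanes, while the AVX512VL+BW byte shuffle would honour it.
static inline __m256i bswap64_avx2_sketch( __m256i v )
{
   const __m256i ctl = _mm256_set_epi8(
      0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,    // upper lane, element 3
      0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,    // upper lane, element 2
      0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,    // lower lane, element 1
      0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07 );  // lower lane, element 0
   return _mm256_shuffle_epi8( v, ctl );
}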
@@ -318,6 +318,9 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept an 8 bit
// immediate for the control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
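For reference on the fixed vs variable forms named above (a hedged illustration, not repo code): the fixed rotates take a compile-time immediate count, while the *v variants take per-element counts in a vector and have no immediate restriction.

#include <immintrin.h>

static inline __m512i rol64_fixed( __m512i v )
{
   return _mm512_rol_epi64( v, 17 );         // immediate rotate count
}

static inline __m512i rol64_variable( __m512i v, __m512i counts )
{
   return _mm512_rolv_epi64( v, counts );    // per-element rotate counts
}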
@@ -430,21 +433,9 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
   casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
} while(0)

//
// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
//

// Rename plan: change ror to vror for Vector ROtate Right,
// and vrol for Vector ROtate Left, not to be confused with
// variable rotate rorv, rolv.
// Plan changed: use shuflr & shufll instead, symbolizing a shuffle-rotate
// operation. The 1xNN notation is also removed and replaced with simpler NN.
// Swap will still have its own mnemonic and will be aliased as both
// left and right shuffles.

// Shift elements right or left in 512 bit vector, filling with zeros.
// Multiple element shifts can be combined into a single larger
// element shift.
// Cross-lane shuffles implementing rotate & shift of elements within a vector.
//

#define mm512_shiftr_256( v ) \
   _mm512_alignr_epi64( _mm512_setzero, v, 4 )
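A hedged sketch of the zero-fill 256-bit shifts described above, written as self-contained inline functions rather than the repo's macros; _mm512_setzero_si512() stands in for the zero operand and the function names are illustrative.

#include <immintrin.h>

static inline __m512i shiftr_256_sketch( __m512i v )
{
   // { 0 : v } shifted right by 4 qwords: result = upper 256 bits of v,
   // zero-filled above.
   return _mm512_alignr_epi64( _mm512_setzero_si512(), v, 4 );
}

static inline __m512i shiftl_256_sketch( __m512i v )
{
   // { v : 0 } shifted right by 4 qwords: result = lower 256 bits of v
   // moved to the top, zeros below.
   return _mm512_alignr_epi64( v, _mm512_setzero_si512(), 4 );
}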
@@ -530,7 +521,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
// 128 bit lane shift is handled by bslli bsrli.

// Swap hi & lo 128 bits in each 256 bit lane
#define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
#define mm512_shuflr256_128 mm512_swap256_128
#define mm512_shufll256_128 mm512_swap256_128
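Worked note on the control byte (an illustrative sketch, not repo code): _mm512_permutex_epi64 permutes 64-bit elements within each 256-bit lane, and 0x4e = 0b01001110 selects elements { 2, 3, 0, 1 }, so the two 128-bit halves of each lane trade places.

#include <immintrin.h>

static inline __m512i swap256_128_sketch( __m512i v )
{
   // 0x4e -> qword selectors { 2, 3, 0, 1 } per 256-bit lane.
   return _mm512_permutex_epi64( v, 0x4e );
}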
@@ -584,7 +575,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.

// Limited 2 input, 1 output shuffle within 128 bit lanes.
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source.
#define mm512_shuffle2_64( a, b, c ) \
   _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ), \
                                           _mm512_castsi512_pd( b ), c ) );
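A hedged usage sketch for the 512-bit shuffle2 above: within every 128-bit lane, even destination qwords are picked from a and odd ones from b, each control bit choosing that lane's low or high qword. The function name and the 0xff control are illustrative; the body inlines the same cast/shuffle sequence rather than the macro.

#include <immintrin.h>

// c = 0xff: every 128-bit lane of the result is { a.hi, b.hi }.
static inline __m512i take_high_qwords_sketch( __m512i a, __m512i b )
{
   return _mm512_castpd_si512( _mm512_shuffle_pd( _mm512_castsi512_pd( a ),
                                                  _mm512_castsi512_pd( b ),
                                                  0xff ) );
}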
@@ -621,11 +614,7 @@ static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
// Drop macros? They can easily be rebuilt using shufl2 functions.

// 2 input, 1 output
// Shuffle concatenated { v1, v2 } right or left by 256 bits and return
// rotated v1.
// Visually confusing for shufl2r because of arg order. First arg is always
// the target for modification, either updated by reference or by function
// return.
// Rotate concatenated { v1, v2 } right or left and return v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )
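To make the arg-order caveat concrete, here is a hedged, qword-level reading of the two macros (illustrative function names, same intrinsic calls).

#include <immintrin.h>

static inline __m512i shufl2r_256_sketch( __m512i v1, __m512i v2 )
{
   // { v2 : v1 } shifted right by 4 qwords: result qwords 0-3 come from
   // v1 qwords 4-7, result qwords 4-7 come from v2 qwords 0-3.
   return _mm512_alignr_epi64( v2, v1, 4 );
}

static inline __m512i shufl2l_256_sketch( __m512i v1, __m512i v2 )
{
   // { v1 : v2 } shifted right by 4 qwords: result qwords 0-3 come from
   // v2 qwords 4-7, result qwords 4-7 come from v1 qwords 0-3.
   return _mm512_alignr_epi64( v1, v2, 4 );
}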