mirror of
https://github.com/JayDDee/cpuminer-opt.git
synced 2025-09-17 23:44:27 +00:00
v3.9.6
This commit is contained in:
@@ -477,42 +477,42 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
|
||||
__m256i s0 = mm256_bswap_32( casti_m256i( src,0 ) );
|
||||
__m256i s1 = mm256_bswap_32( casti_m256i( src,1 ) );
|
||||
__m128i s2 = mm128_bswap_32( casti_m128i( src,4 ) );
|
||||
const __m256i zero = m256_zero;
|
||||
const __m256i one = m256_one_32;
|
||||
const __m256i two = _mm256_add_epi32( one, one );
|
||||
const __m256i tre = _mm256_add_epi32( two, one );
|
||||
const __m256i four = _mm256_add_epi32( two, two );
|
||||
const __m256i zero = m256_zero;
|
||||
const __m256i one = m256_one_32;
|
||||
const __m256i two = _mm256_add_epi32( one, one );
|
||||
const __m256i three = _mm256_add_epi32( two, one );
|
||||
const __m256i four = _mm256_add_epi32( two, two );
|
||||
|
||||
casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero );
|
||||
casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one );
|
||||
casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two );
|
||||
casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, tre );
|
||||
casti_m256i( d, 4 ) = _mm256_permutevar8x32_epi32( s0, four );
|
||||
casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero );
|
||||
casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one );
|
||||
casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two );
|
||||
casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, three );
|
||||
casti_m256i( d, 4 ) = _mm256_permutevar8x32_epi32( s0, four );
|
||||
casti_m256i( d, 5 ) = _mm256_permutevar8x32_epi32( s0,
|
||||
_mm256_add_epi32( four, one ) );
|
||||
_mm256_add_epi32( four, one ) );
|
||||
casti_m256i( d, 6 ) = _mm256_permutevar8x32_epi32( s0,
|
||||
_mm256_add_epi32( four, two ) );
|
||||
_mm256_add_epi32( four, two ) );
|
||||
casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32( s0,
|
||||
_mm256_add_epi32( four, tre ) );
|
||||
casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero );
|
||||
casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one );
|
||||
casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two );
|
||||
casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, tre );
|
||||
casti_m256i( d,12 ) = _mm256_permutevar8x32_epi32( s1, four );
|
||||
_mm256_add_epi32( four, three ) );
|
||||
casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero );
|
||||
casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one );
|
||||
casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two );
|
||||
casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, three );
|
||||
casti_m256i( d,12 ) = _mm256_permutevar8x32_epi32( s1, four );
|
||||
casti_m256i( d,13 ) = _mm256_permutevar8x32_epi32( s1,
|
||||
_mm256_add_epi32( four, one ) );
|
||||
_mm256_add_epi32( four, one ) );
|
||||
casti_m256i( d,14 ) = _mm256_permutevar8x32_epi32( s1,
|
||||
_mm256_add_epi32( four, two ) );
|
||||
_mm256_add_epi32( four, two ) );
|
||||
casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32( s1,
|
||||
_mm256_add_epi32( four, tre ) );
|
||||
_mm256_add_epi32( four, three ) );
|
||||
casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s2 ), zero );
|
||||
_mm256_castsi128_si256( s2 ), zero );
|
||||
casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s2 ), one );
|
||||
_mm256_castsi128_si256( s2 ), one );
|
||||
casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s2 ), two );
|
||||
_mm256_castsi128_si256( s2 ), two );
|
||||
casti_m256i( d,19 ) = _mm256_permutevar8x32_epi32(
|
||||
_mm256_castsi128_si256( s2 ), tre );
|
||||
_mm256_castsi128_si256( s2 ), three );
|
||||
}
|
||||
|
||||
#endif // AVX2
|
||||
@@ -677,39 +677,39 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src )
|
||||
{
|
||||
__m512i s0 = mm512_bswap_32( casti_m512i( src, 0 ) );
|
||||
__m128i s1 = mm128_bswap_32( casti_m128i( src, 4 ) );
|
||||
const __m512i zero = m512_zero;
|
||||
const __m512i one = m512_one_32;
|
||||
const __m512i two = _mm512_add_epi32( one, one );
|
||||
const __m512i tre = _mm512_add_epi32( two, one );
|
||||
const __m512i four = _mm512_add_epi32( two, two );
|
||||
const __m512i eight = _mm512_add_epi32( four, four );
|
||||
const __m512i eleven = _mm512_add_epi32( eight, tre );
|
||||
const __m512i zero = m512_zero;
|
||||
const __m512i one = m512_one_32;
|
||||
const __m512i two = _mm512_add_epi32( one, one );
|
||||
const __m512i three = _mm512_add_epi32( two, one );
|
||||
const __m512i four = _mm512_add_epi32( two, two );
|
||||
const __m512i eight = _mm512_add_epi32( four, four );
|
||||
const __m512i eleven = _mm512_add_epi32( eight, three );
|
||||
|
||||
casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero );
|
||||
casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one );
|
||||
casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two );
|
||||
casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, tre );
|
||||
casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four );
|
||||
casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero );
|
||||
casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one );
|
||||
casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two );
|
||||
casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, three );
|
||||
casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four );
|
||||
casti_m512i( d, 5 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( four, one ) );
|
||||
_mm512_add_epi32( four, one ) );
|
||||
casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( four, two ) );
|
||||
_mm512_add_epi32( four, two ) );
|
||||
casti_m512i( d, 7 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( four, tre ) );
|
||||
_mm512_add_epi32( four, three ) );
|
||||
casti_m512i( d, 8 ) = _mm512_permutexvar_epi32( s0, eight );
|
||||
casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( eight, one ) );
|
||||
_mm512_add_epi32( eight, one ) );
|
||||
casti_m512i( d,10 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( eight, two ) );
|
||||
_mm512_add_epi32( eight, two ) );
|
||||
casti_m512i( d,11 ) = _mm512_permutexvar_epi32( s0, eleven );
|
||||
casti_m512i( d,12 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( eleven, one ) );
|
||||
_mm512_add_epi32( eleven, one ) );
|
||||
casti_m512i( d,13 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( eleven, two ) );
|
||||
_mm512_add_epi32( eleven, two ) );
|
||||
casti_m512i( d,14 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( eleven, tre ) );
|
||||
_mm512_add_epi32( eleven, three ) );
|
||||
casti_m512i( d,15 ) = _mm512_permutexvar_epi32( s0,
|
||||
_mm512_add_epi32( eleven, four ) );
|
||||
_mm512_add_epi32( eleven, four ) );
|
||||
casti_m512i( d,16 ) = _mm512_permutexvar_epi32(
|
||||
_mm512_castsi128_si512( s1 ), zero );
|
||||
casti_m512i( d,17 ) = _mm512_permutexvar_epi32(
|
||||
@@ -717,7 +717,7 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src )
|
||||
casti_m512i( d,18 ) = _mm512_permutexvar_epi32(
|
||||
_mm512_castsi128_si512( s1 ), two );
|
||||
casti_m512i( d,19 ) = _mm512_permutexvar_epi32(
|
||||
_mm512_castsi128_si512( s1 ), tre );
|
||||
_mm512_castsi128_si512( s1 ), three );
|
||||
}
|
||||
|
||||
#endif // AVX512
|
||||
@@ -1006,20 +1006,20 @@ static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src )
|
||||
__m512i *d = (__m512i*)dst;
|
||||
__m512i s0 = mm512_bswap_32( casti_m512i(src, 0 ) );
|
||||
__m128i s1 = mm128_bswap_32( casti_m128i(src, 4 ) );
|
||||
const __m512i zero = m512_zero;
|
||||
const __m512i one = m512_one_64;
|
||||
const __m512i two = _mm512_add_epi64( one, one );
|
||||
const __m512i tre = _mm512_add_epi64( two, one );
|
||||
const __m512i four = _mm512_add_epi64( two, two );
|
||||
const __m512i zero = m512_zero;
|
||||
const __m512i one = m512_one_64;
|
||||
const __m512i two = _mm512_add_epi64( one, one );
|
||||
const __m512i three = _mm512_add_epi64( two, one );
|
||||
const __m512i four = _mm512_add_epi64( two, two );
|
||||
|
||||
d[0] = _mm512_permutexvar_epi64( s0, zero );
|
||||
d[1] = _mm512_permutexvar_epi64( s0, one );
|
||||
d[2] = _mm512_permutexvar_epi64( s0, two );
|
||||
d[3] = _mm512_permutexvar_epi64( s0, tre );
|
||||
d[3] = _mm512_permutexvar_epi64( s0, three );
|
||||
d[4] = _mm512_permutexvar_epi64( s0, four );
|
||||
d[5] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, one ) );
|
||||
d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two ) );
|
||||
d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, tre ) );
|
||||
d[5] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, one ) );
|
||||
d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two ) );
|
||||
d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, three ) );
|
||||
d[8] = _mm512_permutexvar_epi64(
|
||||
_mm512_castsi128_si512( s1 ), zero );
|
||||
d[9] = _mm512_permutexvar_epi64(
|
||||
@@ -1296,25 +1296,18 @@ static inline void rintrlv_4x64_2x128( void *dst0, void *dst1,
|
||||
#if defined(__SSE4_1__)
|
||||
// No SSE2 implementation.
|
||||
|
||||
#define mm128_intrlv_blend_64( hi, lo ) \
|
||||
_mm_blend_epi16( hi, lo, 0x0f )
|
||||
#define mm128_intrlv_blend_32( hi, lo ) \
|
||||
_mm_blend_epi16( hi, lo, 0x33 )
|
||||
#define mm128_intrlv_blend_64( hi, lo ) _mm_blend_epi16( hi, lo, 0x0f )
|
||||
#define mm128_intrlv_blend_32( hi, lo ) _mm_blend_epi16( hi, lo, 0x33 )
|
||||
|
||||
#endif // SSE4_1
|
||||
|
||||
#if defined(__AVX2__)
|
||||
|
||||
#define mm256_intrlv_blend_128( hi, lo ) \
|
||||
_mm256_blend_epi32( hi, lo, 0x0f )
|
||||
#define mm256_intrlv_blend_128( hi, lo ) _mm256_blend_epi32( hi, lo, 0x0f )
|
||||
#define mm256_intrlv_blend_64( hi, lo ) _mm256_blend_epi32( hi, lo, 0x33 )
|
||||
#define mm256_intrlv_blend_32( hi, lo ) _mm256_blend_epi32( hi, lo, 0x55 )
|
||||
|
||||
#define mm256_intrlv_blend_64( hi, lo ) \
|
||||
_mm256_blend_epi32( hi, lo, 0x33 )
|
||||
|
||||
#define mm256_intrlv_blend_32( hi, lo ) \
|
||||
_mm256_blend_epi32( hi, lo, 0x55 )
|
||||
|
||||
// Blend 32 byte lanes of hash from 2 sources according to control mask.
|
||||
// Select lanes of 32 byte hash from 2 sources according to control mask.
|
||||
// macro due to 256 bit value arg.
|
||||
#define mm256_blend_hash_4x64( dst, a, b, mask ) \
|
||||
do { \
|
||||
|
||||
@@ -358,17 +358,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
|
||||
// no SSE2 implementation, no current users
|
||||
|
||||
#define mm128_ror_1x16( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi8( 1, 0,15,14,13,12,11,10 \
|
||||
9, 8, 7, 6, 5, 4, 3, 2 ) )
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x01000f0e0d0c0b0a, \
|
||||
0x0908070605040302 ) )
|
||||
#define mm128_rol_1x16( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi8( 13,12,11,10, 9, 8, 7, 6, \
|
||||
5, 4, 3, 2, 1, 0,15,14 ) )
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080706, \
|
||||
0x0504030201000f0e ) )
|
||||
#define mm128_ror_1x8( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi8( 0,15,14,13,12,11,10, 9, \
|
||||
8, 7, 6, 5, 4, 3, 2, 1 ) )
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x000f0e0d0c0b0a09, \
|
||||
0x0807060504030201 ) )
|
||||
#define mm128_rol_1x8( v ) \
|
||||
_mm_shuffle_epi8( v, _mm_set_epi8( 14,13,12,11,10, 9, 8, 7, \
|
||||
6, 5, 4, 3, 2, 1, 0,15 ) )
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x0e0d0c0b0a090807, \
|
||||
0x060504030201000f ) )
|
||||
#endif // SSE3
|
||||
|
||||
// Rotate 16 byte (128 bit) vector by c bytes.
|
||||
@@ -386,12 +386,12 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
|
||||
#define mm128_swap32_64( v ) _mm_shuffle_epi32( v, 0xb1 )
|
||||
|
||||
// Rotate each 64 bit element right by 16 bits with a byte shuffle:
// within each 8 byte half, result byte i takes source byte (i+2) mod 8.
// NOTE(review): assumes m128_const_64( hi, lo ) takes the high qword first.
// The previous definition never closed the _mm_shuffle_epi8( call.
#define mm128_ror16_64( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x09080f0e0d0c0b0a, \
                                       0x0100070605040302 ) )
|
||||
// Rotate each 64 bit element left by 16 bits with a byte shuffle:
// within each 8 byte half, result byte i takes source byte (i-2) mod 8.
// The control bytes are, from high to low:
//    13,12,11,10, 9, 8,15,14,  5, 4, 3, 2, 1, 0, 7, 6
// The high qword must be written with all 16 hex digits; dropping one
// shifts every control byte and corrupts the shuffle.
// NOTE(review): assumes m128_const_64( hi, lo ) takes the high qword first.
#define mm128_rol16_64( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0b0a09080f0e, \
                                       0x0504030201000706 ) )
|
||||
|
||||
// Swap the two 16 bit halves of each 32 bit element with a byte shuffle.
// The control bytes are, from high to low:
//    13,12,15,14,  9, 8,11,10,  5, 4, 7, 6,  1, 0, 3, 2
// NOTE(review): assumes m128_const_64( hi, lo ) takes the high qword first.
// The previous definition never closed the _mm_shuffle_epi8( call.
#define mm128_swap16_32( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0d0c0f0e09080b0a, \
                                       0x0504070601000302 ) )
|
||||
|
||||
//
|
||||
// Endian byte swap.
|
||||
@@ -399,16 +399,15 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
|
||||
#if defined(__SSSE3__)
|
||||
|
||||
#define mm128_bswap_64( v ) \
|
||||
_mm_shuffle_epi8( v, m128_const64( 0x08090a0b0c0d0e0f, \
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
|
||||
0x0001020304050607 ) )
|
||||
|
||||
#define mm128_bswap_32( v ) \
|
||||
_mm_shuffle_epi8( v, m128_const_64( 0x0c0d0e0f08090a0b, \
|
||||
0x0405060700010203 ) )
|
||||
|
||||
// Swap the two bytes of each 16 bit element with a byte shuffle.
// The control bytes are, from high to low:
//    14,15, 12,13, 10,11,  8, 9,  6, 7,  4, 5,  2, 3,  0, 1
// The vector operand v must be passed as the first argument of
// _mm_shuffle_epi8; the control constant is the second.
// NOTE(review): assumes m128_const_64( hi, lo ) takes the high qword first.
#define mm128_bswap_16( v ) \
   _mm_shuffle_epi8( v, m128_const_64( 0x0e0f0c0d0a0b0809, \
                                       0x0607040502030001 ) )
|
||||
|
||||
// 8 byte qword * 8 qwords * 2 lanes = 128 bytes
|
||||
#define mm128_block_bswap_64( d, s ) do \
|
||||
@@ -462,14 +461,14 @@ static inline __m128i mm128_bswap_16( __m128i v )
|
||||
|
||||
static inline void mm128_block_bswap_64( __m128i *d, __m128i *s )
|
||||
{
|
||||
d[0] = mm128_bswap_32( s[0] );
|
||||
d[1] = mm128_bswap_32( s[1] );
|
||||
d[2] = mm128_bswap_32( s[2] );
|
||||
d[3] = mm128_bswap_32( s[3] );
|
||||
d[4] = mm128_bswap_32( s[4] );
|
||||
d[5] = mm128_bswap_32( s[5] );
|
||||
d[6] = mm128_bswap_32( s[6] );
|
||||
d[7] = mm128_bswap_32( s[7] );
|
||||
d[0] = mm128_bswap_64( s[0] );
|
||||
d[1] = mm128_bswap_64( s[1] );
|
||||
d[2] = mm128_bswap_64( s[2] );
|
||||
d[3] = mm128_bswap_64( s[3] );
|
||||
d[4] = mm128_bswap_64( s[4] );
|
||||
d[5] = mm128_bswap_64( s[5] );
|
||||
d[6] = mm128_bswap_64( s[6] );
|
||||
d[7] = mm128_bswap_64( s[7] );
|
||||
}
|
||||
|
||||
static inline void mm128_block_bswap_32( __m128i *d, __m128i *s )
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
|
||||
// The set intrinsics load memory-resident constants; this macro avoids the memory access.
|
||||
// cost 4 pinsert + 1 vinsert, estimate 7 clocks.
|
||||
// Avoid using this macro; using m128_const_64 twice is still faster.
|
||||
#define m256_const_64( i3, i2, i1, i0 ) \
|
||||
_mm256_insertf128_si256( _mm256_castsi128_si256( m128_const_64( i1, i0 ) ), \
|
||||
m128_const_64( i3, i2 ), 1 )
|
||||
@@ -50,7 +51,7 @@ static inline __m256i m256_one_64_fn()
|
||||
asm( "vpxor %0, %0, %0\n\t"
|
||||
"vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
|
||||
"vpsubq %%ymm1, %0, %0\n\t"
|
||||
:"=x"(a)
|
||||
: "=x"(a)
|
||||
:
|
||||
: "ymm1" );
|
||||
return a;
|
||||
@@ -63,7 +64,7 @@ static inline __m256i m256_one_32_fn()
|
||||
asm( "vpxor %0, %0, %0\n\t"
|
||||
"vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
|
||||
"vpsubd %%ymm1, %0, %0\n\t"
|
||||
:"=x"(a)
|
||||
: "=x"(a)
|
||||
:
|
||||
: "ymm1" );
|
||||
return a;
|
||||
@@ -76,7 +77,7 @@ static inline __m256i m256_one_16_fn()
|
||||
asm( "vpxor %0, %0, %0\n\t"
|
||||
"vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
|
||||
"vpsubw %%ymm1, %0, %0\n\t"
|
||||
:"=x"(a)
|
||||
: "=x"(a)
|
||||
:
|
||||
: "ymm1" );
|
||||
return a;
|
||||
@@ -89,7 +90,7 @@ static inline __m256i m256_one_8_fn()
|
||||
asm( "vpxor %0, %0, %0\n\t"
|
||||
"vpcmpeqd %%ymm1, %%ymm1, %%ymm1\n\t"
|
||||
"vpsubb %%ymm1, %0, %0\n\t"
|
||||
:"=x"(a)
|
||||
: "=x"(a)
|
||||
:
|
||||
: "ymm1" );
|
||||
return a;
|
||||
@@ -100,7 +101,7 @@ static inline __m256i m256_neg1_fn()
|
||||
{
|
||||
__m256i a;
|
||||
asm( "vpcmpeqq %0, %0, %0\n\t"
|
||||
:"=x"(a) );
|
||||
: "=x"(a) );
|
||||
return a;
|
||||
}
|
||||
#define m256_neg1 m256_neg1_fn()
|
||||
@@ -423,23 +424,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, int n )
|
||||
|
||||
// Rotate 256 bit vector by one 16 bit element.
|
||||
#define mm256_ror_1x16( v ) \
|
||||
_mm256_permutexvar_epi16( _mm256_set_epi16( \
|
||||
0,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1 ), v )
|
||||
_mm256_permutexvar_epi16( m256_const_64( \
|
||||
0x0000000f000e000d, 0x000c000b000a0009, \
|
||||
0x0008000700060005, 0x0004000300020001 ), v )
|
||||
|
||||
#define mm256_rol_1x16( v ) \
|
||||
_mm256_permutexvar_epi16( _mm256_set_epi16( \
|
||||
14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,15 ), v )
|
||||
_mm256_permutexvar_epi16( m256_const_64( \
|
||||
0x000e000d000c000b, 0x000a000900080007, \
|
||||
0x0006000500040003, 0x000200010000000f ), v )
|
||||
|
||||
// Rotate 256 bit vector by one byte.
|
||||
// Rotate the whole 256 bit vector right by one byte: the index constant
// lists source byte numbers 0,31,30,...,1 from high to low, so result
// byte i takes source byte (i+1) mod 32.
// The index vector is the first operand of _mm256_permutexvar_epi8 and
// the data vector v is the second.
// NOTE(review): assumes m256_const_64( i3, i2, i1, i0 ) places i0 in the
// lowest qword, as its definition via m128_const_64( i1, i0 ) suggests.
#define mm256_ror_1x8( v ) \
   _mm256_permutexvar_epi8( m256_const_64( \
                 0x001f1e1d1c1b1a19, 0x1817161514131211, \
                 0x100f0e0d0c0b0a09, 0x0807060504030201 ), v )
|
||||
|
||||
// Rotate the whole 256 bit vector left by one byte: the index constant
// lists source byte numbers 30,29,...,0,31 from high to low, so result
// byte i takes source byte (i-1) mod 32.
// The index vector is the first operand of _mm256_permutexvar_epi8 and
// the data vector v is the second.
// NOTE(review): assumes m256_const_64( i3, i2, i1, i0 ) places i0 in the
// lowest qword, as its definition via m128_const_64( i1, i0 ) suggests.
#define mm256_rol_1x8( v ) \
   _mm256_permutexvar_epi8( m256_const_64( \
                 0x1e1d1c1b1a191817, 0x161514131211100f, \
                 0x0e0d0c0b0a090807, 0x060504030201001f ), v )
|
||||
|
||||
#endif // AVX512
|
||||
|
||||
|
||||
@@ -503,7 +503,7 @@ static inline __m512i m512_neg1_fn()
|
||||
0x08090A0B, 0x0C0D0E0F, 0x00010203, 0x04050607 ) )
|
||||
|
||||
#define mm512_bswap_32( v ) \
|
||||
_mm512_permutexvar_epi8( v, _mm512_set_epi832( \
|
||||
_mm512_permutexvar_epi8( v, _mm512_set_epi32( \
|
||||
0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
|
||||
0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
|
||||
0x3C3D3E3F, 0x38393A3B, 0x34353637, 0x30313233, \
|
||||
|
||||
Reference in New Issue
Block a user