#if !defined(SIMD_SSE2_H__)
#define SIMD_SSE2_H__ 1

#include <immintrin.h>

#if defined(__SSE2__)

//////////////////////////////////////////////////////////////////
//
// 128 bit SSE vectors
//
// SSE2 is generally required for full 128 bit support. Some functions
// are also optimized with SSSE3 or SSE4.1.
//
// Do not call _mm_extract directly, it isn't supported in SSE2.
// Use mm128_extr instead, it will select the appropriate implementation.
//
// 128 bit operations are enhanced with uint128 which adds 128 bit integer
// support for arithmetic and other operations. Casting to uint128_t is not
// free, it requires a move from an xmm register to gpr, but it is often the
// only way, or the more efficient way, to perform certain operations.

// Compile time constant initializers are type agnostic and can have
// a pointer handle of almost any type. All arguments must be scalar constants
// up to 64 bits. These initializers should only be used at compile time
// to initialize vector arrays. All data resides in memory.
//
// These are of limited use, it is often simpler to use uint64_t arrays
// and cast as required.

#define mm128_const_64( x1, x0 )             {{ x1, x0 }}
#define mm128_const1_64( x )                 {{ x, x }}

#define mm128_const_32( x3, x2, x1, x0 )     {{ x3, x2, x1, x0 }}
#define mm128_const1_32( x )                 {{ x,x,x,x }}

#define mm128_const_16( x7, x6, x5, x4, x3, x2, x1, x0 ) \
                                 {{ x7, x6, x5, x4, x3, x2, x1, x0 }}
#define mm128_const1_16( x )                 {{ x,x,x,x, x,x,x,x }}

#define mm128_const_8( x15, x14, x13, x12, x11, x10, x09, x08, \
                       x07, x06, x05, x04, x03, x02, x01, x00 ) \
                      {{ x15, x14, x13, x12, x11, x10, x09, x08, \
                         x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm128_const1_8( x )      {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}

// Compile time constants, use only for compile time initializing.
#define c128_zero      mm128_const1_64( 0ULL )
#define c128_one_128   mm128_const_64(  0ULL, 1ULL )
#define c128_one_64    mm128_const1_64( 1ULL )
#define c128_one_32    mm128_const1_32( 1UL )
#define c128_one_16    mm128_const1_16( 1U )
#define c128_one_8     mm128_const1_8(  1U )
#define c128_neg1      mm128_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c128_neg1_64   mm128_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c128_neg1_32   mm128_const1_32( 0xFFFFFFFFUL )
#define c128_neg1_16   mm128_const1_16( 0xFFFFU )
#define c128_neg1_8    mm128_const1_8(  0xFFU )
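
// A minimal illustrative sketch, not part of the original header (the table
// and function names are hypothetical): the simpler alternative noted above,
// a compile time uint64_t table loaded as a vector at run time.
static inline __m128i example_load_iv( void )
{
   static const uint64_t example_iv[2] __attribute__ ((aligned (16))) =
                         { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
   return _mm_load_si128( (const __m128i*)example_iv );
}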

//
// Pseudo constants.
//
// These can't be used for compile time initialization.
// These should be used for all simple vectors.
//
// _mm_setzero_si128 uses the pxor instruction, it's unclear what _mm_set_epi
// does. Clearly it's faster than reading a memory resident constant, assume
// set is also faster.
// If a pseudo constant is used often in a function it may be preferable
// to define a register variable to represent that constant:
//    register __m128i zero = _mm_setzero_si128();
// This reduces each subsequent reference to a register move instruction.

#define m128_zero      _mm_setzero_si128()

#define m128_one_128   _mm_set_epi64x(  0ULL, 1ULL )
#define m128_one_64    _mm_set1_epi64x( 1ULL )
#define m128_one_32    _mm_set1_epi32(  1UL )
#define m128_one_16    _mm_set1_epi16(  1U )
#define m128_one_8     _mm_set1_epi8(   1U )

#define m128_neg1      _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
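
// A minimal illustrative sketch, not part of the original header (the
// function name is hypothetical): hoisting a pseudo constant that is reused
// in a loop into a local so the set1 is materialized only once.
static inline void example_add_one_32( __m128i *dst, int n )
{
   const __m128i one = m128_one_32;   // built once, not per iteration
   for ( int i = 0; i < n; i++ )
      dst[i] = _mm_add_epi32( dst[i], one );
}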

//
// Basic operations without equivalent SIMD intrinsic

// Bitwise not (~v)
#define mm128_not( v )  _mm_xor_si128( (v), m128_neg1 )

// Unary negation of elements
#define mm128_negate_64( v ) _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v ) _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v ) _mm_sub_epi16( m128_zero, v )

// Use uint128_t for most arithmetic, bit shift and comparison operations
// spanning all 128 bits. Some extractions are also more efficient when
// casting __m128i as uint128_t and using standard operators.

// This isn't cheap, not suitable for bulk usage.
#define mm128_extr_4x32( a0, a1, a2, a3, src ) \
do { \
   a0 = _mm_extract_epi32( src, 0 ); \
   a1 = _mm_extract_epi32( src, 1 ); \
   a2 = _mm_extract_epi32( src, 2 ); \
   a3 = _mm_extract_epi32( src, 3 ); \
} while(0)

// Horizontal vector testing

// Bit-wise test of entire vector, useful to test results of cmp.
#define mm128_anybits0( a ) (((uint128_t)(a))+1)
#define mm128_anybits1( a ) (uint128_t)(a)

#define mm128_allbits0( a ) ( !mm128_anybits1(a) )
#define mm128_allbits1( a ) ( !mm128_anybits0(a) )
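
// Illustrative usage sketch, not from the original header (the variable
// names are hypothetical): testing whether every lane of a compare result
// is true.
//    if ( mm128_allbits1( _mm_cmpeq_epi32( vhash, vtarget ) ) )
//       { /* all four 32 bit lanes matched */ }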

//
// Vector pointer cast

// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p) ((__m128i*)(p))

// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p) (*((__m128i*)(p)))

// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i) (((__m128i*)(p))[(i)])

// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m128i(p,o) (((__m128i*)(p))+(o))
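
// A minimal illustrative sketch, not part of the original header (the
// function name is hypothetical): XORing n 16 byte blocks into a buffer
// through casti_m128i without declaring vector typed pointers.
static inline void example_xor_128( void *dst, const void *src, int n )
{
   for ( int i = 0; i < n; i++ )
      casti_m128i( dst, i ) = _mm_xor_si128( casti_m128i( dst, i ),
                                             casti_m128i( src, i ) );
}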

// SSE2 doesn't implement extract
#if defined(__SSE4_1__)

#define mm128_extr_64(a,n)   _mm_extract_epi64( a, n )
#define mm128_extr_32(a,n)   _mm_extract_epi32( a, n )

#else

#define mm128_extr_64(a,n)   (((uint64_t*)&a)[n])
#define mm128_extr_32(a,n)   (((uint32_t*)&a)[n])

#endif
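
// Illustrative usage sketch, not from the original header (the variable
// names are hypothetical). Note that n must be a compile time constant when
// the SSE4.1 intrinsic is selected.
//    uint64_t hi = mm128_extr_64( v, 1 );   // high 64 bit lane
//    uint32_t w0 = mm128_extr_32( v, 0 );   // lowest 32 bit lane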

// Gather and scatter data.
// Surprise, they don't use vector instructions. There are several reasons why.
// Since scalar data elements are being manipulated, scalar instructions are
// most appropriate and can bypass vector registers. They are faster and more
// efficient on a per instruction basis due to the higher clock speed and
// greater availability of execution resources. This is good for interleaving
// data buffers for parallel processing.
// They may suffer overhead if the data is already in a vector register. This
// can usually be easily avoided by the coder. Sometimes _mm_set is simply
// better.
// These macros are more likely to be used when transposing matrices than for
// converting a single vector.

// Gather data elements into contiguous memory for vector use.
// Source args are appropriately sized value integers, the destination arg is
// a type agnostic pointer.
// Vector alignment is not required, though likely. Appropriate integer
// alignment satisfies these macros.

// rewrite using insert
#define mm128_gather_64( d, s0, s1 ) \
   ((uint64_t*)d)[0] = (uint64_t)s0; \
   ((uint64_t*)d)[1] = (uint64_t)s1;

#define mm128_gather_32( d, s0, s1, s2, s3 ) \
   ((uint32_t*)d)[0] = (uint32_t)s0; \
   ((uint32_t*)d)[1] = (uint32_t)s1; \
   ((uint32_t*)d)[2] = (uint32_t)s2; \
   ((uint32_t*)d)[3] = (uint32_t)s3;

// Scatter data from contiguous memory.
#define mm128_scatter_64( d0, d1, s ) \
   *( (uint64_t*)d0) = ((uint64_t*)s)[0]; \
   *( (uint64_t*)d1) = ((uint64_t*)s)[1];

#define mm128_scatter_32( d0, d1, d2, d3, s ) \
   *( (uint32_t*)d0) = ((uint32_t*)s)[0]; \
   *( (uint32_t*)d1) = ((uint32_t*)s)[1]; \
   *( (uint32_t*)d2) = ((uint32_t*)s)[2]; \
   *( (uint32_t*)d3) = ((uint32_t*)s)[3];
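
// Illustrative usage sketch, not from the original header (the buffer and
// input names are hypothetical): interleaving one 32 bit word from four
// independent inputs into contiguous memory for 4-way parallel processing,
// then de-interleaving a result word back out.
//    mm128_gather_32( vbuf, in0[0], in1[0], in2[0], in3[0] );
//    ...
//    mm128_scatter_32( &out0[0], &out1[0], &out2[0], &out3[0], vbuf );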

// Memory functions
// Mostly for convenience, avoids calculating bytes.
// Assumes data is aligned and integral.
// n = number of __m128i, bytes/16

static inline void memset_zero_128( __m128i *dst, int n )
{   for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; }

static inline void memset_128( __m128i *dst, const __m128i a, int n )
{   for ( int i = 0; i < n; i++ ) dst[i] = a; }

static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
{   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }
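
// Illustrative usage sketch, not from the original header (the buffer name
// is hypothetical): clearing a 64 byte, 16 byte aligned hash state.
//    memset_zero_128( castp_m128i( example_state ), 64/16 );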

//
// Bit rotations

// AVX512 has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements. Not really useful.

//
// Rotate each element of v by c bits

#define mm128_ror_64( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

#define mm128_rol_64( v, c ) \
   _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )

#define mm128_ror_32( v, c ) \
   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )

#define mm128_rol_32( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

#define mm128_ror_16( v, c ) \
   _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )

#define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
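
// Illustrative usage sketch, not from the original header: a 4-way 32 bit
// SHA-256 style sigma0 built from the rotate macros above (the macro name
// is hypothetical).
//    #define example_ssg0_4way( x ) \
//       _mm_xor_si128( _mm_xor_si128( mm128_ror_32( x,  7 ), \
//                                     mm128_ror_32( x, 18 ) ), \
//                      _mm_srli_epi32( x, 3 ) )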

//
// Rotate elements across all lanes

#define mm128_swap_64( v )  _mm_shuffle_epi32( v, 0x4e )

#define mm128_ror_1x32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v ) _mm_shuffle_epi32( v, 0x93 )

#define mm128_ror_1x16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  1, 0,15,14,13,12,11,10, \
                                       9, 8, 7, 6, 5, 4, 3, 2 ) )
#define mm128_rol_1x16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 13,12,11,10, 9, 8, 7, 6, \
                                       5, 4, 3, 2, 1, 0,15,14 ) )
#define mm128_ror_1x8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  0,15,14,13,12,11,10, 9, \
                                       8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm128_rol_1x8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 14,13,12,11,10, 9, 8, 7, \
                                       6, 5, 4, 3, 2, 1, 0,15 ) )

// Rotate 16 byte (128 bit) vector by c bytes.
// Less efficient using shift but more versatile. Use only for odd number
// byte rotations. Use shuffle above whenever possible.
#define mm128_bror( v, c ) \
   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )

#define mm128_brol( v, c ) \
   _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )

// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )

#define mm128_invert_16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  1, 0,  3, 2,  5, 4,  7, 6, \
                                       9, 8, 11,10, 13,12, 15,14 ) )

#define mm128_invert_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  0, 1, 2, 3, 4, 5, 6, 7, \
                                       8, 9,10,11,12,13,14,15 ) )

//
// Rotate elements within lanes.

#define mm128_swap32_64( v )  _mm_shuffle_epi32( v, 0xb1 )

#define mm128_ror16_64( v ) _mm_shuffle_epi8( v, \
      _mm_set_epi8(  9, 8,15,14,13,12,11,10,  1, 0, 7, 6, 5, 4, 3, 2 ) )

#define mm128_rol16_64( v ) _mm_shuffle_epi8( v, \
      _mm_set_epi8( 13,12,11,10, 9, 8,15,14,  5, 4, 3, 2, 1, 0, 7, 6 ) )

#define mm128_swap16_32( v ) _mm_shuffle_epi8( v, \
      _mm_set_epi8( 13,12,15,14,  9, 8,11,10,  5, 4, 7, 6,  1, 0, 3, 2 ) )

//
// Endian byte swap.

#if defined(__SSSE3__)

#define mm128_bswap_64( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  8, 9,10,11,12,13,14,15, \
                                       0, 1, 2, 3, 4, 5, 6, 7 ) )

#define mm128_bswap_32( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15,  8, 9,10,11, \
                                       4, 5, 6, 7,  0, 1, 2, 3 ) )

#define mm128_bswap_16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 14,15, 12,13, 10,11,  8, 9, \
                                       6, 7,  4, 5,  2, 3,  0, 1 ) )

#else  // SSE2

// Use inline function instead of macro due to multiple statements.
static inline __m128i mm128_bswap_64( __m128i v )
{
      v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
      v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
      return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}

static inline __m128i mm128_bswap_32( __m128i v )
{
      v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
      v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
      return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

static inline __m128i mm128_bswap_16( __m128i v )
{
      return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}

#endif // SSSE3 else SSE2
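
// A minimal illustrative sketch, not part of the original header (the
// function name is hypothetical): byte swapping a big endian 80 byte block
// header into host order, 16 bytes at a time.
static inline void example_bswap32_80( void *d, const void *s )
{
   // 80 byte block header = 5 * 16 bytes.
   for ( int i = 0; i < 5; i++ )
      casti_m128i( d, i ) = mm128_bswap_32( casti_m128i( s, i ) );
}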

//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.

// Swap 128 bit vectors.

#define mm128_swap128_256( v1, v2 ) \
   v1 = _mm_xor_si128( v1, v2 ); \
   v2 = _mm_xor_si128( v1, v2 ); \
   v1 = _mm_xor_si128( v1, v2 );

// Concatenate v1 & v2 and rotate as one 256 bit vector.
#if defined(__SSE4_1__)

#define mm128_ror1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
   v1 = _mm_alignr_epi8( v2, v1, 8 ); \
   v2 = t; \
} while(0)

#define mm128_rol1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
   v2 = _mm_alignr_epi8( v2, v1, 8 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
   v1 = _mm_alignr_epi8( v2, v1, 4 ); \
   v2 = t; \
} while(0)

#define mm128_rol1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
   v2 = _mm_alignr_epi8( v2, v1, 12 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
   v1 = _mm_alignr_epi8( v2, v1, 2 ); \
   v2 = t; \
} while(0)

#define mm128_rol1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
   v2 = _mm_alignr_epi8( v2, v1, 14 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
   v1 = _mm_alignr_epi8( v2, v1, 1 ); \
   v2 = t; \
} while(0)

#define mm128_rol1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
   v2 = _mm_alignr_epi8( v2, v1, 15 ); \
   v1 = t; \
} while(0)

#else  // SSE2

#define mm128_ror1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 8 ); \
   v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 8 ); \
   v1 = t; \
} while(0)

#define mm128_rol1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 8 ); \
   v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 8 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 12 ); \
   v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 12 ); \
   v1 = t; \
} while(0)

#define mm128_rol1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 12 ); \
   v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 12 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 14 ); \
   v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 14 ); \
   v1 = t; \
} while(0)

#define mm128_rol1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 14 ); \
   v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 14 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 15 ); \
   v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 15 ); \
   v1 = t; \
} while(0)

#define mm128_rol1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 15 ); \
   v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 15 ); \
   v1 = t; \
} while(0)

#endif // SSE4.1 else SSE2
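
// Illustrative usage sketch, not from the original header (the state names
// are hypothetical): rotating a 256 bit state held in two vectors right by
// one 64 bit element.
//    mm128_ror1x64_256( state_hi, state_lo );
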
#endif // __SSE2__

#endif // SIMD_SSE2_H__