#if !defined(SIMD_SSE2_H__)
#define SIMD_SSE2_H__ 1

#if defined(__SSE2__)

//////////////////////////////////////////////////////////////////
//
//                 128 bit SSE vectors
//
// SSE2 is generally required for full 128 bit support. Some functions
// are also optimized with SSSE3 or SSE4.1.
//
// Do not call _mm_extract directly, it isn't supported in SSE2.
// Use mm128_extr instead, it will select the appropriate implementation.
//
// 128 bit operations are enhanced with uint128, which adds 128 bit integer
// support for arithmetic and other operations. Casting to uint128_t is not
// free, it requires a move from mmx to gpr, but it is often the only way or
// the more efficient way to do certain operations.

// Compile time constant initializers are type agnostic and can have
// a pointer handle of almost any type. All arguments must be scalar constants
// up to 64 bits. These initializers should only be used at compile time
// to initialize vector arrays. All data reside in memory.
//
// These are of limited use; it is often simpler to use uint64_t arrays
// and cast as required.

#define mm128_const_64( x1, x0 )           {{ x1, x0 }}
#define mm128_const1_64( x )               {{ x, x }}

#define mm128_const_32( x3, x2, x1, x0 )   {{ x3, x2, x1, x0 }}
#define mm128_const1_32( x )               {{ x,x,x,x }}

#define mm128_const_16( x7, x6, x5, x4, x3, x2, x1, x0 ) \
                                 {{ x7, x6, x5, x4, x3, x2, x1, x0 }}
#define mm128_const1_16( x )               {{ x,x,x,x, x,x,x,x }}

#define mm128_const_8( x15, x14, x13, x12, x11, x10, x09, x08, \
                       x07, x06, x05, x04, x03, x02, x01, x00 ) \
                     {{ x15, x14, x13, x12, x11, x10, x09, x08, \
                        x07, x06, x05, x04, x03, x02, x01, x00 }}
#define mm128_const1_8( x )    {{ x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x }}

// Compile time constants, use only for compile time initializing.
#define c128_zero      mm128_const1_64( 0ULL )
#define c128_one_128   mm128_const_64(  0ULL, 1ULL )
#define c128_one_64    mm128_const1_64( 1ULL )
#define c128_one_32    mm128_const1_32( 1UL )
#define c128_one_16    mm128_const1_16( 1U )
#define c128_one_8     mm128_const1_8(  1U )
#define c128_neg1      mm128_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c128_neg1_64   mm128_const1_64( 0xFFFFFFFFFFFFFFFFULL )
#define c128_neg1_32   mm128_const1_32( 0xFFFFFFFFUL )
#define c128_neg1_16   mm128_const1_16( 0xFFFFU )
#define c128_neg1_8    mm128_const1_8(  0xFFU )
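// Illustrative sketch, not part of the original header: compile time
// initialization of a memory resident constant using the macros above.
// The doubly braced initializers are assumed to target an aggregate whose
// first member is the vector; the union wrapper and all names below are
// hypothetical.
typedef union { __m128i m128; uint64_t u64[2]; } m128_example_ovly;

static inline __m128i example_load_ones( void )
{
   static const m128_example_ovly k = c128_one_64;   // data resides in memory
   return k.m128;
}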
//
// Pseudo constants.
//
// These can't be used for compile time initialization.
// These should be used for all simple vectors.
//
// _mm_setzero_si128 uses the pxor instruction; it's unclear what _mm_set_epi
// generates. Either way it's clearly faster than reading a memory resident
// constant, so assume set is also faster.
// If a pseudo constant is used often in a function it may be preferable
// to define a register variable to represent that constant:
//    register __m128i zero = _mm_setzero_si128();
// This reduces each subsequent reference to a simple register move.

#define m128_zero      _mm_setzero_si128()
#define m128_one_128   _mm_set_epi64x(  0ULL, 1ULL )
#define m128_one_64    _mm_set1_epi64x( 1ULL )
#define m128_one_32    _mm_set1_epi32(  1UL )
#define m128_one_16    _mm_set1_epi16(  1U )
#define m128_one_8     _mm_set1_epi8(   1U )
#define m128_neg1      _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )

//
// Basic operations without equivalent SIMD intrinsic

// Bitwise not (~v)
#define mm128_not( v )          _mm_xor_si128( (v), m128_neg1 )

// Unary negation of elements
#define mm128_negate_64( v )    _mm_sub_epi64( m128_zero, v )
#define mm128_negate_32( v )    _mm_sub_epi32( m128_zero, v )
#define mm128_negate_16( v )    _mm_sub_epi16( m128_zero, v )

// Use uint128_t for most arithmetic, bit shift and comparison operations
// spanning all 128 bits. Some extractions are also more efficient
// casting __m128i as uint128_t and using standard operators.
// This isn't cheap, not suitable for bulk usage.

// Requires SSE4.1 for _mm_extract_epi32.
#define mm128_extr_4x32( a0, a1, a2, a3, src ) \
do { \
   a0 = _mm_extract_epi32( src, 0 ); \
   a1 = _mm_extract_epi32( src, 1 ); \
   a2 = _mm_extract_epi32( src, 2 ); \
   a3 = _mm_extract_epi32( src, 3 ); \
} while(0)

// Horizontal vector testing

// Bit-wise test of entire vector, useful to test results of cmp.
#define mm128_anybits0( a )  (((uint128_t)(a))+1)
#define mm128_anybits1( a )  ((uint128_t)(a))

#define mm128_allbits0( a )  ( !mm128_anybits1(a) )
#define mm128_allbits1( a )  ( !mm128_anybits0(a) )

//
// Vector pointer cast

// p = any aligned pointer
// returns p as pointer to vector type
#define castp_m128i(p)    ((__m128i*)(p))

// p = any aligned pointer
// returns *p, watch your pointer arithmetic
#define cast_m128i(p)     (*((__m128i*)(p)))

// p = any aligned pointer, i = scaled array index
// returns value p[i]
#define casti_m128i(p,i)  (((__m128i*)(p))[(i)])

// p = any aligned pointer, o = scaled offset
// returns pointer p+o
#define casto_m128i(p,o)  (((__m128i*)(p))+(o))

// SSE2 doesn't implement extract
#if defined(__SSE4_1__)

#define mm128_extr_64(a,n)   _mm_extract_epi64( a, n )
#define mm128_extr_32(a,n)   _mm_extract_epi32( a, n )

#else

#define mm128_extr_64(a,n)   (((uint64_t*)&a)[n])
#define mm128_extr_32(a,n)   (((uint32_t*)&a)[n])

#endif

// Gather and scatter data.
// Surprise, they don't use vector instructions, for several reasons.
// Since scalar data elements are being manipulated, scalar instructions
// are most appropriate and can bypass vector registers. They are faster
// and more efficient on a per instruction basis due to the higher clock
// speed and greater availability of execution resources. This is good for
// interleaving data buffers for parallel processing.
// They may suffer overhead if the data is already in a vector register; this
// can usually be easily avoided by the coder. Sometimes _mm_set is simply
// better. These macros are likely to be used when transposing matrices
// rather than converting a single vector.

// Gather data elements into contiguous memory for vector use.
// Source args are appropriately sized integer values, the destination arg
// is a type agnostic pointer.
// Vector alignment is not required, though likely. Appropriate integer
// alignment satisfies these macros.

// TODO: rewrite using insert
#define mm128_gather_64( d, s0, s1 ) \
do { \
   ((uint64_t*)(d))[0] = (uint64_t)(s0); \
   ((uint64_t*)(d))[1] = (uint64_t)(s1); \
} while(0)

#define mm128_gather_32( d, s0, s1, s2, s3 ) \
do { \
   ((uint32_t*)(d))[0] = (uint32_t)(s0); \
   ((uint32_t*)(d))[1] = (uint32_t)(s1); \
   ((uint32_t*)(d))[2] = (uint32_t)(s2); \
   ((uint32_t*)(d))[3] = (uint32_t)(s3); \
} while(0)
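// Illustrative sketch, not part of the original header: interleaving two
// 64 bit scalars into a vector sized buffer with mm128_gather_64, then
// loading the result as a vector with cast_m128i. All names are hypothetical.
static inline __m128i example_interleave_64( uint64_t s0, uint64_t s1 )
{
   uint64_t buf[2] __attribute__((aligned(16)));
   mm128_gather_64( buf, s0, s1 );
   return cast_m128i( buf );
}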
// Scatter data from contiguous memory.
#define mm128_scatter_64( d0, d1, s ) \
do { \
   *( (uint64_t*)(d0) ) = ((uint64_t*)(s))[0]; \
   *( (uint64_t*)(d1) ) = ((uint64_t*)(s))[1]; \
} while(0)

#define mm128_scatter_32( d0, d1, d2, d3, s ) \
do { \
   *( (uint32_t*)(d0) ) = ((uint32_t*)(s))[0]; \
   *( (uint32_t*)(d1) ) = ((uint32_t*)(s))[1]; \
   *( (uint32_t*)(d2) ) = ((uint32_t*)(s))[2]; \
   *( (uint32_t*)(d3) ) = ((uint32_t*)(s))[3]; \
} while(0)

// Memory functions
// Mostly for convenience, avoids calculating bytes.
// Assumes data is aligned and integral.
// n = number of __m128i, bytes/16

static inline void memset_zero_128( __m128i *dst, int n )
{   for ( int i = 0; i < n; i++ ) dst[i] = m128_zero;   }

static inline void memset_128( __m128i *dst, const __m128i a, int n )
{   for ( int i = 0; i < n; i++ ) dst[i] = a;   }

static inline void memcpy_128( __m128i *dst, const __m128i *src, int n )
{   for ( int i = 0; i < n; i++ ) dst[i] = src[i];   }

//
// Bit rotations

// AVX512 has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements. Not really useful here.

//
// Rotate each element of v by c bits.
#define mm128_ror_64( v, c ) \
   _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )

#define mm128_rol_64( v, c ) \
   _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )

#define mm128_ror_32( v, c ) \
   _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )

#define mm128_rol_32( v, c ) \
   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )

#define mm128_ror_16( v, c ) \
   _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) )

#define mm128_rol_16( v, c ) \
   _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )

//
// Rotate elements across all lanes.
// The 16 and 8 bit variants use _mm_shuffle_epi8 and require SSSE3.

#define mm128_swap_64( v )   _mm_shuffle_epi32( v, 0x4e )

#define mm128_ror_1x32( v )  _mm_shuffle_epi32( v, 0x39 )
#define mm128_rol_1x32( v )  _mm_shuffle_epi32( v, 0x93 )

#define mm128_ror_1x16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  1, 0,15,14,13,12,11,10, \
                                       9, 8, 7, 6, 5, 4, 3, 2 ) )
#define mm128_rol_1x16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 13,12,11,10, 9, 8, 7, 6, \
                                       5, 4, 3, 2, 1, 0,15,14 ) )
#define mm128_ror_1x8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  0,15,14,13,12,11,10, 9, \
                                       8, 7, 6, 5, 4, 3, 2, 1 ) )
#define mm128_rol_1x8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 14,13,12,11,10, 9, 8, 7, \
                                       6, 5, 4, 3, 2, 1, 0,15 ) )

// Rotate 16 byte (128 bit) vector by c bytes.
// Less efficient using shift but more versatile. Use only for odd number
// byte rotations. Use the shuffles above whenever possible.
#define mm128_bror( v, c ) \
   _mm_or_si128( _mm_srli_si128( v, c ), _mm_slli_si128( v, 16-(c) ) )

#define mm128_brol( v, c ) \
   _mm_or_si128( _mm_slli_si128( v, c ), _mm_srli_si128( v, 16-(c) ) )

// Invert vector: {3,2,1,0} -> {0,1,2,3}
#define mm128_invert_32( v ) _mm_shuffle_epi32( v, 0x1b )

#define mm128_invert_16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  1, 0,  3, 2,  5, 4,  7, 6, \
                                       9, 8, 11,10, 13,12, 15,14 ) )
#define mm128_invert_8( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  0, 1,  2, 3,  4, 5,  6, 7, \
                                       8, 9, 10,11, 12,13, 14,15 ) )
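// Illustrative sketch, not part of the original header: clearing a 64 byte
// (4 x __m128i) state with the memory helpers above and rotating each 64 bit
// lane right by 8 bits with mm128_ror_64. All names are hypothetical and the
// state pointer is assumed to be 16 byte aligned.
static inline void example_clear_state( void *state )
{
   memset_zero_128( castp_m128i( state ), 4 );
}

static inline __m128i example_ror64_8( __m128i v )
{
   return mm128_ror_64( v, 8 );
}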
//
// Rotate elements within lanes.

#define mm128_swap32_64( v )  _mm_shuffle_epi32( v, 0xb1 )

// The following use _mm_shuffle_epi8 and require SSSE3.
#define mm128_ror16_64( v )   _mm_shuffle_epi8( v, \
      _mm_set_epi8(  9, 8,15,14,13,12,11,10,  1, 0, 7, 6, 5, 4, 3, 2 ) )

#define mm128_rol16_64( v )   _mm_shuffle_epi8( v, \
      _mm_set_epi8( 13,12,11,10, 9, 8,15,14,  5, 4, 3, 2, 1, 0, 7, 6 ) )

#define mm128_swap16_32( v )  _mm_shuffle_epi8( v, \
      _mm_set_epi8( 13,12,15,14,  9, 8,11,10,  5, 4, 7, 6,  1, 0, 3, 2 ) )

//
// Endian byte swap.

#if defined(__SSSE3__)

#define mm128_bswap_64( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8(  8, 9,10,11,12,13,14,15, \
                                       0, 1, 2, 3, 4, 5, 6, 7 ) )

#define mm128_bswap_32( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 12,13,14,15,  8, 9,10,11, \
                                       4, 5, 6, 7,  0, 1, 2, 3 ) )

#define mm128_bswap_16( v ) \
   _mm_shuffle_epi8( v, _mm_set_epi8( 14,15, 12,13, 10,11,  8, 9, \
                                       6, 7,  4, 5,  2, 3,  0, 1 ) )

#else  // SSE2

// Use inline functions instead of macros due to multiple statements.

static inline __m128i mm128_bswap_64( __m128i v )
{
   v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
   v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
   return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 0, 1, 2, 3 ) );
}

static inline __m128i mm128_bswap_32( __m128i v )
{
   v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
   v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
   return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) );
}

static inline __m128i mm128_bswap_16( __m128i v )
{
   return _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) );
}

#endif  // SSSE3 else SSE2

//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.

// Swap 128 bit vectors.
#define mm128_swap128_256( v1, v2 ) \
do { \
   v1 = _mm_xor_si128( v1, v2 ); \
   v2 = _mm_xor_si128( v1, v2 ); \
   v1 = _mm_xor_si128( v1, v2 ); \
} while(0)
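// Illustrative sketch, not part of the original header: byte swapping a
// buffer of big endian 32 bit words in place, e.g. when importing externally
// formatted data. All names are hypothetical; data is assumed to be 16 byte
// aligned and n is a count of __m128i.
static inline void example_bswap32_block( void *data, int n )
{
   for ( int i = 0; i < n; i++ )
      casti_m128i( data, i ) = mm128_bswap_32( casti_m128i( data, i ) );
}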
// Concatenate v1 & v2 and rotate as one 256 bit vector.
#if defined(__SSE4_1__)

#define mm128_ror1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
   v1 = _mm_alignr_epi8( v2, v1, 8 ); \
   v2 = t; \
} while(0)

#define mm128_rol1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
   v2 = _mm_alignr_epi8( v2, v1, 8 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 4 ); \
   v1 = _mm_alignr_epi8( v2, v1, 4 ); \
   v2 = t; \
} while(0)

#define mm128_rol1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 12 ); \
   v2 = _mm_alignr_epi8( v2, v1, 12 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 2 ); \
   v1 = _mm_alignr_epi8( v2, v1, 2 ); \
   v2 = t; \
} while(0)

#define mm128_rol1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 14 ); \
   v2 = _mm_alignr_epi8( v2, v1, 14 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 1 ); \
   v1 = _mm_alignr_epi8( v2, v1, 1 ); \
   v2 = t; \
} while(0)

#define mm128_rol1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_alignr_epi8( v1, v2, 15 ); \
   v2 = _mm_alignr_epi8( v2, v1, 15 ); \
   v1 = t; \
} while(0)

#else  // SSE2

#define mm128_ror1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_srli_si128( v1, 8 ) | _mm_slli_si128( v2, 8 ); \
   v2 = _mm_srli_si128( v2, 8 ) | _mm_slli_si128( v1, 8 ); \
   v1 = t; \
} while(0)

#define mm128_rol1x64_256( v1, v2 ) \
do { \
   __m128i t = _mm_slli_si128( v1, 8 ) | _mm_srli_si128( v2, 8 ); \
   v2 = _mm_slli_si128( v2, 8 ) | _mm_srli_si128( v1, 8 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_srli_si128( v1, 4 ) | _mm_slli_si128( v2, 12 ); \
   v2 = _mm_srli_si128( v2, 4 ) | _mm_slli_si128( v1, 12 ); \
   v1 = t; \
} while(0)

#define mm128_rol1x32_256( v1, v2 ) \
do { \
   __m128i t = _mm_slli_si128( v1, 4 ) | _mm_srli_si128( v2, 12 ); \
   v2 = _mm_slli_si128( v2, 4 ) | _mm_srli_si128( v1, 12 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_srli_si128( v1, 2 ) | _mm_slli_si128( v2, 14 ); \
   v2 = _mm_srli_si128( v2, 2 ) | _mm_slli_si128( v1, 14 ); \
   v1 = t; \
} while(0)

#define mm128_rol1x16_256( v1, v2 ) \
do { \
   __m128i t = _mm_slli_si128( v1, 2 ) | _mm_srli_si128( v2, 14 ); \
   v2 = _mm_slli_si128( v2, 2 ) | _mm_srli_si128( v1, 14 ); \
   v1 = t; \
} while(0)

#define mm128_ror1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_srli_si128( v1, 1 ) | _mm_slli_si128( v2, 15 ); \
   v2 = _mm_srli_si128( v2, 1 ) | _mm_slli_si128( v1, 15 ); \
   v1 = t; \
} while(0)

#define mm128_rol1x8_256( v1, v2 ) \
do { \
   __m128i t = _mm_slli_si128( v1, 1 ) | _mm_srli_si128( v2, 15 ); \
   v2 = _mm_slli_si128( v2, 1 ) | _mm_srli_si128( v1, 15 ); \
   v1 = t; \
} while(0)

#endif  // SSE4.1 else SSE2

#endif  // __SSE2__
#endif  // SIMD_SSE2_H__
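// Example usage of the concatenated 256 bit rotates above (illustrative
// sketch, not part of the original header): two 128 bit vectors are treated
// as one 256 bit value and rotated right by one 64 bit lane, in place.
//
//    __m128i v1 = ..., v2 = ...;
//    mm128_ror1x64_256( v1, v2 );   // v1:v2 rotated as a unit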