v3.21.3 Unreleased

Jay D Dee
2023-03-13 03:20:13 -04:00
parent b339450898
commit c6bc9d67fb
49 changed files with 1126 additions and 1111 deletions

View File

@@ -470,7 +470,7 @@ static inline void mm128_intrlv_4x32x( void *dst, void *src0, void *src1,
#if defined(__SSSE3__)
static inline void mm128_bswap32_80( void *d, const void *s )
static inline void mm128_bswap32_80( void *d, void *s )
{
__m128i bswap_shuf = m128_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
casti_m128i( d, 0 ) = _mm_shuffle_epi8( casti_m128i( s, 0 ), bswap_shuf );
@@ -482,7 +482,7 @@ static inline void mm128_bswap32_80( void *d, const void *s )
#else
static inline void mm128_bswap32_80( void *d, const void *s )
static inline void mm128_bswap32_80( void *d, void *s )
{
( (uint32_t*)d )[ 0] = bswap_32( ( (uint32_t*)s )[ 0] );
( (uint32_t*)d )[ 1] = bswap_32( ( (uint32_t*)s )[ 1] );
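(Illustration, not part of this commit.) The SSSE3 path's shuffle constant can be sanity-checked on its own: each 64-bit half of bswap_shuf lists byte indices 3,2,1,0 for every 32-bit lane, so _mm_shuffle_epi8 reverses the bytes of each word, matching the scalar bswap_32 fallback. A minimal standalone sketch, assuming a compiler with SSSE3 enabled and using the standard _mm_set_epi64x in place of the project's m128_const_64 helper:

#include <stdio.h>
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

int main(void)
{
   // Same byte-index pattern as bswap_shuf above.
   const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                              0x0405060700010203 );
   unsigned int in[4] = { 0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00 };
   unsigned int out[4];
   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi8( v, bswap_shuf ) );
   for ( int i = 0; i < 4; i++ )
      printf( "%08x\n", out[i] );   // 44332211 88776655 ccbbaa99 00ffeedd
   return 0;
}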

View File

@@ -385,7 +385,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
/* Not used
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from v1, and the high half from v2.
#define mm128_shuffle2_64( v1, v2, c ) \
@@ -395,7 +395,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuffle2_32( v1, v2, c ) \
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( v1 ), \
_mm_castsi128_ps( v2 ), c ) );
*/
//
// Rotate vector elements across all lanes
@@ -407,7 +406,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
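(Illustration only, not from the source.) The immediates encode the element selection: 0x39 picks source elements 1,2,3,0 (lowest destination position first) and 0x93 picks 3,0,1,2, which is why these macros rotate the four 32-bit elements right and left by one position. A standalone SSE2 check:

#include <stdio.h>
#include <emmintrin.h>   // SSE2: _mm_shuffle_epi32

int main(void)
{
   unsigned int in[4] = { 0, 1, 2, 3 };
   unsigned int r[4], l[4];
   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   // 0x39: dst[0]=src[1], dst[1]=src[2], dst[2]=src[3], dst[3]=src[0]
   _mm_storeu_si128( (__m128i*)r, _mm_shuffle_epi32( v, 0x39 ) );
   // 0x93: dst[0]=src[3], dst[1]=src[0], dst[2]=src[1], dst[3]=src[2]
   _mm_storeu_si128( (__m128i*)l, _mm_shuffle_epi32( v, 0x93 ) );
   printf( "shuflr_32: %u %u %u %u\n", r[0], r[1], r[2], r[3] );   // 1 2 3 0
   printf( "shufll_32: %u %u %u %u\n", l[0], l[1], l[2], l[3] );   // 3 0 1 2
   return 0;
}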
/* Not used
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
@@ -415,7 +413,6 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
*/
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
@@ -558,25 +555,68 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
v2 = _mm_xor_si128( v1, v2 ); \
v1 = _mm_xor_si128( v1, v2 );
// Concatenate { hi, lo }, rotate right by c elements and return low 128 bits.
// alignr for 32 & 64 bit elements is only available with AVX512 but
// emulated here. Shift argument is not needed, it's always 1.
// Behaviour is otherwise consistent with Intel alignr intrinsics.
#if defined(__SSSE3__)
// _mm_alignr_epi32 & _mm_alignr_epi64 are only available with AVX512VL but
// are emulated here using _mm_alignr_epi8. There are no fast equivalents for
// 256 bit vectors, though there is no need for this functionality.
#define mm128_alignr_64( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*8 )
#define mm128_alignr_32( hi, lo, c ) _mm_alignr_epi8( hi, lo, (c)*4 )
#define mm128_alignr_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
#define mm128_alignr_32( v1, v2 ) _mm_alignr_epi8( v1, v2, 4 )
#else
#define mm128_alignr_64( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, (c)*8 ), _mm_srli_si128( lo, (c)*8 ) )
#define mm128_alignr_64( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) )
#define mm128_alignr_32( hi, lo, c ) \
_mm_or_si128( _mm_slli_si128( hi, 16-(c)*4 ), _mm_srli_si128( lo, (c)*4 ) )
#define mm128_alignr_32( v1, v2 ) _mm_or_si128( _mm_slli_si128( v1, 12 ), \
_mm_srli_si128( v2, 4 ) )
#endif
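(Illustration, not part of this commit.) With the shift fixed at one element, mm128_alignr_64( hi, lo ) is expected to return { lo[1], hi[0] }: element 0 is the high qword of lo and element 1 is the low qword of hi. A standalone sketch of that expectation, assuming SSSE3:

#include <stdio.h>
#include <tmmintrin.h>   // SSSE3: _mm_alignr_epi8

int main(void)
{
   __m128i lo = _mm_set_epi64x( 11, 10 );   // lo = { 10, 11 }
   __m128i hi = _mm_set_epi64x( 21, 20 );   // hi = { 20, 21 }
   long long out[2];
   // Concatenate { hi, lo } and shift right one 64-bit element (8 bytes):
   // the surviving low 128 bits are { lo[1], hi[0] }.
   _mm_storeu_si128( (__m128i*)out, _mm_alignr_epi8( hi, lo, 8 ) );
   printf( "%lld %lld\n", out[0], out[1] );   // 11 20
   return 0;
}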
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// vrol & vror are deprecated and do not exist for larger vectors.
// Their only user is lyra2 blake2b when AVX2 is not available, and it is
// grandfathered.
#if defined(__SSSE3__)
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v1 = _mm_alignr_epi8( v2, v1, 8 ); \
v2 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_alignr_epi8( v1, v2, 8 ); \
v2 = _mm_alignr_epi8( v2, v1, 8 ); \
v1 = t; \
} while(0)
#else // SSE2
#define mm128_vror256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
_mm_slli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_srli_si128( v2, 8 ), \
_mm_slli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#define mm128_vrol256_64( v1, v2 ) \
do { \
__m128i t = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
_mm_srli_si128( v2, 8 ) ); \
v2 = _mm_or_si128( _mm_slli_si128( v2, 8 ), \
_mm_srli_si128( v1, 8 ) ); \
v1 = t; \
} while(0)
#endif // SSSE3 else SSE2
#endif // __SSE2__
#endif // SIMD_128_H__
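(Illustration, not part of this commit.) Treating { v1, v2 } as one 256-bit value with v1 as the high 128 bits, mm128_vror256_64 rotates the whole 256 bits right by 64. A standalone sketch of the same data movement using the raw SSSE3 intrinsics:

#include <stdio.h>
#include <tmmintrin.h>   // SSSE3: _mm_alignr_epi8

int main(void)
{
   // Qwords of the 256-bit value { v1 : v2 }, least to most significant: 0,1,2,3
   __m128i v2 = _mm_set_epi64x( 1, 0 );   // low 128 bits
   __m128i v1 = _mm_set_epi64x( 3, 2 );   // high 128 bits
   long long q[4];

   // The same steps as mm128_vror256_64:
   __m128i t = _mm_alignr_epi8( v1, v2, 8 );   // new low  half: { 1, 2 }
   v1        = _mm_alignr_epi8( v2, v1, 8 );   // new high half: { 3, 0 }
   v2        = t;

   _mm_storeu_si128( (__m128i*)&q[0], v2 );
   _mm_storeu_si128( (__m128i*)&q[2], v1 );
   printf( "%lld %lld %lld %lld\n", q[0], q[1], q[2], q[3] );   // 1 2 3 0
   return 0;
}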

View File

@@ -239,8 +239,8 @@ static inline __m256i mm256_not( const __m256i v )
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Create a 64 or 32 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test: if (mask[n]) then -1 else 0.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
_mm256_movemask_pd( _mm256_castsi256_pd( v ) )
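(Illustration, not from the source.) The returned mask packs one bit per element, taken from the element's MSB, which is convenient for branching on whole-vector sign tests. A small sketch using the underlying intrinsic directly, assuming AVX2 is enabled as the surrounding header requires:

#include <stdio.h>
#include <immintrin.h>   // AVX: _mm256_movemask_pd

int main(void)
{
   // 64-bit lanes 0..3 hold 4, -3, 2, -1; negative lanes have their MSB set.
   __m256i v = _mm256_set_epi64x( -1, 2, -3, 4 );
   int mask = _mm256_movemask_pd( _mm256_castsi256_pd( v ) );
   printf( "mask = 0x%x\n", mask );   // 0xa: bits 1 and 3 set
   return 0;
}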
@@ -348,7 +348,7 @@ static inline __m256i mm256_not( const __m256i v )
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c) ) )
// Deprecated. Obsolete sm3, the only user, is grandfathered.
// Deprecated.
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
@@ -391,7 +391,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
//
// Rotate elements within each 128 bit lane of 256 bit vector.
/* Not used
// Limited 2 input shuffle
#define mm256_shuffle2_64( v1, v2, c ) \
_mm256_castpd_si256( _mm256_shuffle_pd( _mm256_castsi256_pd( v1 ), \
@@ -400,7 +399,6 @@ static inline __m256i mm256_shufll_32( const __m256i v )
#define mm256_shuffle2_32( v1, v2, c ) \
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( v1 ), \
_mm256_castsi256_ps( v2 ), c ) );
*/
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
@@ -513,8 +511,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
} while(0)
// swap 256 bit vectors in place.
// Deprecated, Shabal is the only user and it should be modified to reorder
// instructions.
// This should be avoided, it's more efficient to switch references.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \
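(Illustration, not from the source.) The replacement comment's advice is that callers should exchange references rather than data: the XORs per 256-bit swap can be avoided entirely by keeping pointers to the two halves of the state and swapping the pointers. A hypothetical sketch of that pattern:

#include <immintrin.h>

// Hypothetical caller-side helper: 'a' and 'b' point into some hash state.
// Exchanging the pointers is free compared with mm256_swap512_256, which
// issues dependent 256-bit XORs through the ALU.
static inline void swap_refs( __m256i **a, __m256i **b )
{
   __m256i *t = *a;
   *a = *b;
   *b = t;
}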

View File

@@ -409,20 +409,19 @@ static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
{ return _mm512_alignr_epi32( v, v, n ); }
/* Not used
#define mm512_shuflr_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x0000001F001E001D, 0x001C001B001A0019, \
0x0018001700160015, 0x0014001300120011, \
0x0010000F000E000D, 0x000C000B000A0009, \
0x0008000700060005, 0x0004000300020001 ), v )
0X0018001700160015, 0X0014001300120011, \
0X0010000F000E000D, 0X000C000B000A0009, \
0X0008000700060005, 0X0004000300020001 ), v )
#define mm512_shufll_16( v ) \
_mm512_permutexvar_epi16( m512_const_64( \
0x001E001D001C001B, 0x001A001900180017, \
0x0016001500140013, 0x001200110010000F, \
0x000E000D000C000B, 0x000A000900080007, \
0x0006000500040003, 0x000200010000001F ), v )
0X0016001500140013, 0X001200110010000F, \
0X000E000D000C000B, 0X000A000900080007, \
0X0006000500040003, 0X000200010000001F ), v )
#define mm512_shuflr_8( v ) \
_mm512_shuffle_epi8( v, m512_const_64( \
@@ -437,7 +436,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
0x2E2D2C2B2A292827, 0x262524232221201F, \
0x1E1D1C1B1A191817, 0x161514131211100F, \
0x0E0D0C0B0A090807, 0x060504030201003F ) )
*/
// 256 bit lanes used only by lyra2, move these there
// Rotate elements within 256 bit lanes of 512 bit vector.
@@ -451,7 +449,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr256_64( v ) _mm512_permutex_epi64( v, 0x39 )
#define mm512_shufll256_64( v ) _mm512_permutex_epi64( v, 0x93 )
/* Not used
/*
// Rotate 256 bit lanes by one 32 bit element
#define mm512_shuflr256_32( v ) \
_mm512_permutexvar_epi32( m512_const_64( \
@@ -498,7 +496,6 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
//
// Shuffle/rotate elements within 128 bit lanes of 512 bit vector.
/* Not used
// Limited 2 input, 1 output shuffle, combines shuffle with blend.
// Like most shuffles it's limited to 128 bit lanes and like some shuffles
// destination elements must come from a specific source arg.
@@ -509,10 +506,7 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuffle2_32( v1, v2, c ) \
_mm512_castps_si512( _mm512_shuffle_ps( _mm512_castsi512_ps( v1 ), \
_mm512_castsi512_ps( v2 ), c ) );
*/
// These hard coded shuffles exist for consistency with AVX2 & SSE2 where
// efficient generic versions don't exist.
// Swap 64 bits in each 128 bit lane
#define mm512_swap128_64( v ) _mm512_shuffle_epi32( v, 0x4e )
#define mm512_shuflr128_64 mm512_swap128_64
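(Illustration, not from the source.) The immediate 0x4e selects 32-bit elements 2,3,0,1, which exchanges the two 64-bit halves of a 128-bit lane; _mm512_shuffle_epi32 simply applies that pattern to each of the four lanes. The effect is easiest to see with the 128-bit form of the intrinsic:

#include <stdio.h>
#include <emmintrin.h>   // SSE2: _mm_shuffle_epi32

int main(void)
{
   long long in[2] = { 1, 2 };
   long long out[2];
   __m128i v = _mm_loadu_si128( (const __m128i*)in );
   // 0x4e: dst elements = src 2,3,0,1, i.e. the two qwords trade places.
   _mm_storeu_si128( (__m128i*)out, _mm_shuffle_epi32( v, 0x4e ) );
   printf( "%lld %lld\n", out[0], out[1] );   // 2 1
   return 0;
}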
@@ -522,11 +516,9 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
#define mm512_shuflr128_32( v ) _mm512_shuffle_epi32( v, 0x39 )
#define mm512_shufll128_32( v ) _mm512_shuffle_epi32( v, 0x93 )
/* Not used
// Rotate right 128 bit lanes by c bytes, efficient generic version of above.
// Rotate right 128 bit lanes by c bytes, versatile and just as fast
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
*/
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency