This commit is contained in:
Jay D Dee
2022-08-01 20:21:05 -04:00
parent 1321ac474c
commit 58030e2788
27 changed files with 312 additions and 4734 deletions

View File

@@ -273,9 +273,9 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm_movepi64_mask & _mm_movepi32_mask.
// Returns 2 or 4 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
#define mm_movmask_64( v ) \
_mm_castpd_si128( _mm_movmask_pd( _mm_castsi128_pd( v ) ) )
@@ -306,34 +306,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
//
// Bit rotations
// AVX512VL has implemented bit rotation for 128 bit vectors with
// 64 and 32 bit elements.
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
// specification but works with a variable. Therefore use rol_var where
// necessary.
// sm3-hash-4way.c has one instance where mm128_rol_var_32 is required.
#define mm128_ror_var_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_var_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_var_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
#define mm128_ror_64 _mm_ror_epi64
#define mm128_rol_64 _mm_rol_epi64
@@ -358,10 +335,17 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#else // SSE2
#define mm128_ror_64 mm128_ror_var_64
#define mm128_rol_64 mm128_rol_var_64
#define mm128_ror_32 mm128_ror_var_32
#define mm128_rol_32 mm128_rol_var_32
#define mm128_ror_64( v, c ) \
_mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) )
#define mm128_rol_64( v, c ) \
_mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) )
#define mm128_ror_32( v, c ) \
_mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) )
#define mm128_rol_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
#define mm128_rorx2_64( v1, v0, c ) \
{ \
@@ -411,6 +395,11 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_rol_16( v, c ) \
_mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) )
// Deprecated.
#define mm128_rol_var_32( v, c ) \
_mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
//
// Limited 2 input shuffle, combines shuffle with blend. The destination low
// half is always taken from src a, and the high half from src b.
#define mm128_shuffle2_64( a, b, c ) \
@@ -421,7 +410,6 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
_mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps( a ), \
_mm_castsi128_ps( b ), c ) );
//
// Rotate vector elements across all lanes
@@ -432,21 +420,61 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
#define mm128_shuflr_32( v ) _mm_shuffle_epi32( v, 0x39 )
#define mm128_shufll_32( v ) _mm_shuffle_epi32( v, 0x93 )
// Swap 32 bit elements in 64 bit lanes
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__)
// Rotate right by c bytes, no SSE2 equivalent.
static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
{ return _mm_alignr_epi8( v, v, c ); }
#endif
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit rotations
// for multiples of 8 bits. Uses ror/rol macros when AVX512 is available
// (unlikely but faster), or when SSSE3 is not available (slower).
#define mm128_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 )
#define mm128_shuflr64_32 mm128_swap64_32
#define mm128_shufll64_32 mm128_swap64_32
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_24( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#else
#define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
#endif
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr64_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#else
#define mm128_shuflr64_16( v ) mm128_ror_64( v, 16 )
#endif
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_swap32_16( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#else
#define mm128_swap32_16( v ) mm128_ror_32( v, 16 )
#endif
#define mm128_shuflr32_16 mm128_swap32_16
#define mm128_shufll32_16 mm128_swap32_16
#if defined(__SSSE3__) && !defined(__AVX512VL__)
#define mm128_shuflr32_8( v ) \
_mm_shuffle_epi8( v, _mm_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#else
#define mm128_shuflr32_8( v ) mm128_ror_32( v, 8 )
#endif
//
// Endian byte swap.
#if defined(__SSSE3__)
#define mm128_bswap_64( v ) \
_mm_shuffle_epi8( v, m128_const_64( 0x08090a0b0c0d0e0f, \
0x0001020304050607 ) )
@@ -537,8 +565,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
//
// Rotate in place concatenated 128 bit vectors as one 256 bit vector.
// Swap 128 bit vectorse.
// Swap 128 bit vectors.
// This should be avoided, it's more efficient to switch references.
#define mm128_swap256_128( v1, v2 ) \
v1 = _mm_xor_si128( v1, v2 ); \
v2 = _mm_xor_si128( v1, v2 ); \
@@ -552,8 +580,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
// Function macros with two inputs and one output, inputs are preserved.
// Returns the high 128 bits, ie updated v1.
// These two-input functions are not available without SSSE3. Use procedure
// macros below instead.
// These functions are preferred but only available with SSSE3. Use procedure
// macros below for SSE2 compatibility.
#define mm128_shufl2r_64( v1, v2 ) _mm_alignr_epi8( v2, v1, 8 )
#define mm128_shufl2l_64( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
@@ -568,8 +596,8 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
#define mm128_shufl2l_8( v1, v2 ) _mm_alignr_epi8( v1, v2, 8 )
// Procedure macros with 2 inputs and 2 outputs, input args are overwritten.
// Deprecated for SSSE3 and above, they exist for SSSE3 only for compatibility
// with existing code. The function macros above can be used more efficiently.
// Deprecated for SSSE3 and above, SSSE3 versions exist only for
// compatibility with existing code.
#define mm128_vror256_64( v1, v2 ) \
do { \

View File

@@ -13,6 +13,18 @@
// AVX512 implementations. They will be selected automatically but their use
// is limited because 256 bit vectors are less likely to be used when 512
// is available.
//
// AVX2 version of _mm256_shuffle_epi8 is limited to 128 bit lanes but AVX512
// version is not. Some usage has the index vector encoded as if full vector
// shuffles are supported. This has no side effects and would have the same
// results using either version.
// If needed and AVX512 is available, 256 bit full vector shuffles can be
// implemented using the AVX512 zero-mask feature with a NULL mask.
// Using intrinsics it's simple:
// _mm256_maskz_shuffle_epi8( k0, v, c )
// With asm it's a bit more complicated with the addition of the mask register
// and zero tag:
// vpshufb ymm0{k0}{z}, ymm1, ymm2
#if defined(__AVX__)
@@ -234,9 +246,9 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#endif
// Mask making
// Equivalent of AVX512 _mm256_movepi64_mask & _mm256_movepi32_mask.
// Returns 4 or 8 bit integer mask from MSB of 64 or 32 bit elements.
// Effectively a sign test.
#define mm256_movmask_64( v ) \
_mm256_castpd_si256( _mm256_movmask_pd( _mm256_castsi256_pd( v ) ) )
@@ -273,42 +285,11 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
//
// Bit rotations.
//
// The only bit shift for more than 64 bits is with __int128 which is slow.
//
// AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
//
// x2 rotates elements in 2 individual vectors in a double buffered
// optimization for SSE2, does nothing for AVX512 but is there for
// optimization for AVX2, does nothing for AVX512 but is here for
// transparency.
// compiler doesn't like when a variable is used for the last arg of
// _mm_rol_epi32, must be "8 bit immediate". Therefore use rol_var where
// necessary.
#define mm256_ror_var_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_var_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_var_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
// The spec says both F & VL are required, but just in case AMD
// decides to implement ROL/R without AVX512F.
#if defined(__AVX512VL__)
//#if defined(__AVX512F__) && defined(__AVX512VL__)
// AVX512, control must be 8 bit immediate.
#define mm256_ror_64 _mm256_ror_epi64
#define mm256_rol_64 _mm256_rol_epi64
@@ -333,10 +314,23 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
#else // AVX2
#define mm256_ror_64 mm256_ror_var_64
#define mm256_rol_64 mm256_rol_var_64
#define mm256_ror_32 mm256_ror_var_32
#define mm256_rol_32 mm256_rol_var_32
// use shuflr64 shuflr32 below for optimized bit rotations of multiples of 8.
#define mm256_ror_64( v, c ) \
_mm256_or_si256( _mm256_srli_epi64( v, c ), \
_mm256_slli_epi64( v, 64-(c) ) )
#define mm256_rol_64( v, c ) \
_mm256_or_si256( _mm256_slli_epi64( v, c ), \
_mm256_srli_epi64( v, 64-(c) ) )
#define mm256_ror_32( v, c ) \
_mm256_or_si256( _mm256_srli_epi32( v, c ), \
_mm256_slli_epi32( v, 32-(c) ) )
#define mm256_rol_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
#define mm256_rorx2_64( v1, v0, c ) \
{ \
@@ -388,6 +382,10 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_or_si256( _mm256_slli_epi16( v, c ), \
_mm256_srli_epi16( v, 16-(c) ) )
// Deprecated.
#define mm256_rol_var_32( v, c ) \
_mm256_or_si256( _mm256_slli_epi32( v, c ), \
_mm256_srli_epi32( v, 32-(c) ) )
//
// Rotate elements across all lanes.
@@ -399,7 +397,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
// Rotate 256 bit vector by one 64 bit element
#define mm256_shuflr_64( v ) _mm256_permute4x64_epi64( v, 0x39 )
#define mm256_shufll_64( v ) _mm256_permute4x64_epi64( v, 0x93 )
// Rotate 256 bit vector by one 32 bit element.
@@ -413,7 +410,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
m256_const_64( 0x0000000600000005, 0x0000000400000003, \
0x0000000200000001, 0x0000000000000007 ) )
//
// Rotate elements within each 128 bit lane of 256 bit vector.
@@ -426,7 +422,6 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
_mm256_castps_si256( _mm256_shuffle_ps( _mm256_castsi256_ps( a ), \
_mm256_castsi256_ps( b ), c ) );
#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e )
#define mm256_shuflr128_64 mm256_swap128_64
#define mm256_shufll128_64 mm256_swap128_64
@@ -437,11 +432,52 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
{ return _mm256_alignr_epi8( v, v, c ); }
// Swap 32 bit elements in each 64 bit lane.
// Rotate byte elements within 64 or 32 bit lanes, AKA optimized bit
// rotations for multiples of 8 bits. Uses faster ror/rol instructions when
// AVX512 is available.
#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 )
#define mm256_shuflr64_32 mm256_swap64_32
#define mm256_shufll64_32 mm256_swap64_32
#if defined(__AVX512VL__)
#define mm256_shuflr64_24( v ) _mm256_ror_epi64( v, 24 )
#else
#define mm256_shuflr64_24( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0a09080f0e0d0c0b, 0x0201000706050403, \
0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
#endif
#if defined(__AVX512VL__)
#define mm256_shuflr64_16( v ) _mm256_ror_epi64( v, 16 )
#else
#define mm256_shuflr64_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x09080f0e0d0c0b0a, 0x0100070605040302, \
0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
#endif
#if defined(__AVX512VL__)
#define mm256_swap32_16( v ) _mm256_ror_epi32( v, 16 )
#else
#define mm256_swap32_16( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0d0c0f0e09080b0a, 0x0504070601000302, \
0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
#endif
#define mm256_shuflr32_16 mm256_swap32_16
#define mm256_shufll32_16 mm256_swap32_16
#if defined(__AVX512VL__)
#define mm256_shuflr32_8( v ) _mm256_ror_epi32( v, 8 )
#else
#define mm256_shuflr32_8( v ) \
_mm256_shuffle_epi8( v, _mm256_set_epi64x( \
0x0c0f0e0d080b0a09, 0x0407060500030201, \
0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
#endif
// NOTE: _mm256_shuffle_epi8, like most shuffles, is restricted to 128 bit
// lanes. AVX512, however, supports full vector 8 bit shuffle. The AVX512VL +
// AVX512BW intrinsic _mm256_mask_shuffle_epi8 with a NULL mask, can be used if
@@ -496,18 +532,8 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
casti_m256i( d, 7 ) = _mm256_shuffle_epi8( casti_m256i( s, 7 ), ctl ); \
} while(0)
//
// Rotate two concatenated 256 bit vectors as one 512 bit vector by specified
// number of elements. Rotate is done in place, source arguments are
// overwritten.
// Some of these could use permute but that appears to be slower. Maybe a
// Ryzen issue.
// _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
// makes these macros unnecessary.
// continue using vror/vrol notation for now to avoid confusion with
// shufl2r/shufl2l macro functions available with AVX512.
// swap 256 bit vectors in place.
// This should be avoided, it's more efficient to switch references.
#define mm256_swap512_256( v1, v2 ) \
v1 = _mm256_xor_si256( v1, v2 ); \
v2 = _mm256_xor_si256( v1, v2 ); \

View File

@@ -316,58 +316,18 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
// Bit rotations.
// AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
// elements and can be called directly. But they only accept immediate 8
// for control arg.
// The workaround is a fraud, just a fluke of the compiler's optimizer.
// It fails without -O3. The compiler seems to unroll shift loops, eliminating
// the variable control, better than rotate loops.
// elements and can be called directly.
//
// _mm512_rol_epi64, _mm512_ror_epi64, _mm512_rol_epi32, _mm512_ror_epi32
// _mm512_rolv_epi64, _mm512_rorv_epi64, _mm512_rolv_epi32, _mm512_rorv_epi32
//
// For convenience and consistency with AVX2
// For convenience and consistency with AVX2 macros.
#define mm512_ror_64 _mm512_ror_epi64
#define mm512_rol_64 _mm512_rol_epi64
#define mm512_ror_32 _mm512_ror_epi32
#define mm512_rol_32 _mm512_rol_epi32
static inline __m512i mm512_ror_var_64( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi64( v, c ),
_mm512_slli_epi64( v, 64-c ) );
}
static inline __m512i mm512_rol_var_64( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi64( v, c ),
_mm512_srli_epi64( v, 64-c ) );
}
static inline __m512i mm512_ror_var_32( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi32( v, c ),
_mm512_slli_epi32( v, 32-c ) );
}
static inline __m512i mm512_rol_var_32( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi32( v, c ),
_mm512_srli_epi32( v, 32-c ) );
}
static inline __m512i mm512_ror_16( __m512i const v, const int c )
{
return _mm512_or_si512( _mm512_srli_epi16( v, c ),
_mm512_slli_epi16( v, 16-c ) );
}
static inline __m512i mm512_rol_16( const __m512i v, const int c )
{
return _mm512_or_si512( _mm512_slli_epi16( v, c ),
_mm512_srli_epi16( v, 16-c ) );
}
// Rotations using a vector control index are very slow due to overhead
// to generate the index vector. Repeated rotations using the same index
// are better handled by the calling function where the index only needs
@@ -599,22 +559,34 @@ static inline __m512i mm512_shuflr_x32( const __m512i v, const int n )
static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
{ return _mm512_alignr_epi8( v, v, c ); }
// Swap 32 bits in each 64 bit lane. Can be done with a rotate instruction
// but only with AVX512. Shuffle is just as fast and available with AVX2
// & SSE2.
// Rotate byte elements in each 64 or 32 bit lane. Redundant for AVX512, all
// can be done with ror & rol. Defined only for convenience and consistency
// with AVX2 & SSE2 macros.
#define mm512_swap64_32( v ) _mm512_shuffle_epi32( v, 0xb1 )
#define mm512_shuflr64_32 mm512_swap64_32
#define mm512_shufll64_32 mm512_swap64_32
// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
// and 2 input 2 output shuffle macros.
//
// shuflr is 1 input
// shufl2r is 2 input ...
// Drop macros? They can easily be rebuilt using shufl2 functions
#define mm512_shuflr64_24( v ) _mm512_ror_epi64( v, 24 )
#define mm512_shufll64_24( v ) _mm512_rol_epi64( v, 24 )
#define mm512_shuflr64_16( v ) _mm512_ror_epi64( v, 16 )
#define mm512_shufll64_16( v ) _mm512_rol_epi64( v, 16 )
#define mm512_shuflr64_8( v ) _mm512_ror_epi64( v, 8 )
#define mm512_shufll64_8( v ) _mm512_rol_epi64( v, 8 )
#define mm512_swap32_16( v ) _mm512_ror_epi32( v, 16 )
#define mm512_shuflr32_16 mm512_swap32_16
#define mm512_shufll32_16 mm512_swap32_16
#define mm512_shuflr32_8( v ) _mm512_ror_epi32( v, 8 )
#define mm512_shufll32_8( v ) _mm512_rol_epi32( v, 8 )
// 2 input, 1 output
// Rotate concatenated { v1, v2 ) right or left and return v1.
// Concatenate { v1, v2 } then rotate right or left and return the high
// 512 bits, ie rotated v1.
#define mm512_shufl2r_256( v1, v2 ) _mm512_alignr_epi64( v2, v1, 4 )
#define mm512_shufl2l_256( v1, v2 ) _mm512_alignr_epi64( v1, v2, 4 )