v3.7.4

2025-09-17 23:44:27 +00:00 · 2021-09-29 17:31:16 -04:00
parent 9b905fccc8
commit 2cd1507c2e
80 changed files with 8145 additions and 2097 deletions
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -65,7 +65,7 @@ static inline void dintrlv_2x32( void *dst0, void *dst1,
   d0[24] = s[48];   d1[24] = s[49];   d0[25] = s[50];   d1[25] = s[51];
   d0[26] = s[52];   d1[26] = s[53];   d0[27] = s[54];   d1[27] = s[55];
   d0[28] = s[56];   d1[28] = s[57];   d0[29] = s[58];   d1[29] = s[59];
-   d0[30] = s[60];   d1[30] = s[61];   d0[31] = s[61];   d1[31] = s[63];
+   d0[30] = s[60];   d1[30] = s[61];   d0[31] = s[62];   d1[31] = s[63];
 }

 static inline void extr_lane_2x32( void *dst, const void *src,
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -35,6 +35,13 @@
 ///////////////////////////////////////////////////////////////////////////


+// Used instead of casting.
+typedef union
+{
+   __m128i m128;
+   uint32_t u32[4];
+} __attribute__ ((aligned (16))) m128_ovly;
+
 // Efficient and convenient moving between GP & low bits of XMM.
 // Use VEX when available to give access to xmm8-15 and zero extend for
 // larger vectors.
@@ -61,7 +68,10 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
  return  a;
 }

-static inline uint64_t mm128_mov128_64( const __m128i a )
+// Inconsistent naming, prefix should reflect return value:
+// u64_mov128_64
+
+static inline uint64_t u64_mov128_64( const __m128i a )
 {
  uint64_t n;
 #if defined(__AVX__)
@@ -72,7 +82,7 @@ static inline uint64_t mm128_mov128_64( const __m128i a )
  return  n;
 }

-static inline uint32_t mm128_mov128_32( const __m128i a )
+static inline uint32_t u32_mov128_32( const __m128i a )
 {
  uint32_t n;
 #if defined(__AVX__)
@@ -166,12 +176,17 @@ static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,

 // Extract 32 bit element c from v and return as integer.
 static inline uint32_t mm128_extract_32( const __m128i v, const int c )
-{   return mm128_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }
+{   return u32_mov128_32( mm128_xim_32( v, v, c<<6 ) ); }

 // Clear (zero) 32 bit elements based on bits set in 4 bit mask.
 static inline __m128i mm128_mask_32( const __m128i v, const int m ) 
 {   return mm128_xim_32( v, v, m ); }

+// Move element i2 of v2 to element i1 of v1. For reference and convenience,
+// it's faster to precalculate the index.
+#define mm128_shuflmov_32( v1, i1, v2, i2 ) \
+  mm128_xim_32( v1, v2, ( (i1)<<4 ) | ( (i2)<<6 ) )
+
 #endif  // SSE4_1

 //
@@ -257,12 +272,37 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )

 #endif

+
+
+// Diagonal blend: d = { v3[3], v2[2], v1[1], v0[0] }
+// Element n of the result is taken from vector vn.
+// Blend 4 32 bit elements from 4 vectors
+
+#if defined (__AVX2__)
+
+#define mm128_diagonal_32( v3, v2, v1, v0 ) \
+  _mm_blend_epi32( _mm_blend_epi32( v3, v2, 0x4 ), \
+                   _mm_blend_epi32( v1, v0, 0x1 ), 0x3 )
+
+#elif defined(__SSE4_1__)
+// No 32 bit blend before AVX2, each 32 bit element is 2 bits of the 16 bit mask.
+#define mm128_diagonal_32( v3, v2, v1, v0 ) \
+  _mm_blend_epi16( _mm_blend_epi16( v3, v2, 0x30 ), \
+                   _mm_blend_epi16( v1, v0, 0x03 ), 0x0f )
+
+#endif
+
+
 //
 // Bit rotations

 // AVX512VL has implemented bit rotation for 128 bit vectors with
 // 64 and 32 bit elements.

+// x2 rotates elements in 2 individual vectors in a double buffered
+// optimization for SSE2, does nothing for AVX512 but is there for
+// transparency.
+
 // compiler doesn't like when a variable is used for the last arg of
 // _mm_rol_epi32, must be "8 bit immediate". Oddly _mm_slli has the same
 // specification but works with a variable. Therefore use rol_var where
@@ -290,6 +330,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_ror_32    _mm_ror_epi32
 #define mm128_rol_32    _mm_rol_epi32

+// x2 rotates update both v0 and v1 in place, same contract as the SSE2
+// versions. do-while(0) makes each macro behave as a single statement.
+#define mm128_rorx2_64( v1, v0, c ) \
+do { v0 = _mm_ror_epi64( v0, c ); \
+     v1 = _mm_ror_epi64( v1, c ); } while(0)
+#define mm128_rolx2_64( v1, v0, c ) \
+do { v0 = _mm_rol_epi64( v0, c ); \
+     v1 = _mm_rol_epi64( v1, c ); } while(0)
+
+#define mm128_rorx2_32( v1, v0, c ) \
+do { v0 = _mm_ror_epi32( v0, c ); \
+     v1 = _mm_ror_epi32( v1, c ); } while(0)
+#define mm128_rolx2_32( v1, v0, c ) \
+do { v0 = _mm_rol_epi32( v0, c ); \
+     v1 = _mm_rol_epi32( v1, c ); } while(0)
+
 #else  // SSE2

 #define mm128_ror_64   mm128_ror_var_64
@@ -297,6 +353,46 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 #define mm128_ror_32   mm128_ror_var_32
 #define mm128_rol_32   mm128_rol_var_32

+// Rotate right the 64 bit elements of v0 & v1 by c bits, in place.
+#define mm128_rorx2_64( v1, v0, c ) do { \
+ __m128i t0 = _mm_srli_epi64( v0, c ); \
+ __m128i t1 = _mm_srli_epi64( v1, c ); \
+ v0 = _mm_slli_epi64( v0, 64-(c) ); \
+ v1 = _mm_slli_epi64( v1, 64-(c) ); \
+ v0 = _mm_or_si128( v0, t0 ); \
+ v1 = _mm_or_si128( v1, t1 ); \
+} while(0)
+
+// Rotate left the 64 bit elements of v0 & v1 by c bits, in place.
+#define mm128_rolx2_64( v1, v0, c ) do { \
+ __m128i t0 = _mm_slli_epi64( v0, c ); \
+ __m128i t1 = _mm_slli_epi64( v1, c ); \
+ v0 = _mm_srli_epi64( v0, 64-(c) ); \
+ v1 = _mm_srli_epi64( v1, 64-(c) ); \
+ v0 = _mm_or_si128( v0, t0 ); \
+ v1 = _mm_or_si128( v1, t1 ); \
+} while(0)
+
+// Rotate right the 32 bit elements of v0 & v1 by c bits, in place.
+#define mm128_rorx2_32( v1, v0, c ) do { \
+ __m128i t0 = _mm_srli_epi32( v0, c ); \
+ __m128i t1 = _mm_srli_epi32( v1, c ); \
+ v0 = _mm_slli_epi32( v0, 32-(c) ); \
+ v1 = _mm_slli_epi32( v1, 32-(c) ); \
+ v0 = _mm_or_si128( v0, t0 ); \
+ v1 = _mm_or_si128( v1, t1 ); \
+} while(0)
+
+// Rotate left the 32 bit elements of v0 & v1 by c bits, in place.
+#define mm128_rolx2_32( v1, v0, c ) do { \
+ __m128i t0 = _mm_slli_epi32( v0, c ); \
+ __m128i t1 = _mm_slli_epi32( v1, c ); \
+ v0 = _mm_srli_epi32( v0, 32-(c) ); \
+ v1 = _mm_srli_epi32( v1, 32-(c) ); \
+ v0 = _mm_or_si128( v0, t0 ); \
+ v1 = _mm_or_si128( v1, t1 ); \
+} while(0)
+
 #endif   // AVX512 else SSE2

 #define mm128_ror_16( v, c ) \
@@ -309,16 +405,22 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 // Rotate vector elements accross all lanes

 #define mm128_swap_64( v )    _mm_shuffle_epi32( v, 0x4e )
-#define mm128_ror_1x32( v )   _mm_shuffle_epi32( v, 0x39 )
-#define mm128_rol_1x32( v )   _mm_shuffle_epi32( v, 0x93 )
+#define mm128_shuflr_64       mm128_swap_64
+#define mm128_shufll_64       mm128_swap_64
+
+#define mm128_shuflr_32( v )   _mm_shuffle_epi32( v, 0x39 )
+#define mm128_shufll_32( v )   _mm_shuffle_epi32( v, 0x93 )
+

 // Swap 32 bit elements in 64 bit lanes
 #define mm128_swap64_32( v )  _mm_shuffle_epi32( v, 0xb1 )
+#define mm128_shuflr64_32 mm128_swap64_32
+#define mm128_shufll64_32 mm128_swap64_32

 #if defined(__SSSE3__)

 // Rotate right by c bytes, no SSE2 equivalent.
-static inline __m128i mm128_ror_x8( const __m128i v, const int c )
+static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 { return _mm_alignr_epi8( v, v, c ); }

 //
@@ -422,59 +524,88 @@ static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s )
   v1 = _mm_xor_si128( v1, v2 );


+// Two input shuffle-rotate.
 // Concatenate v1 & v2 and rotate as one 256 bit vector.
-#if defined(__SSE4_1__)
+// Continue to use vror/vrol for now to avoid confusion with
+// shufl2r/shufl2l function macros available with AVX512.

-#define mm128_ror256_64( v1, v2 ) \
+#if defined(__SSSE3__)
+
+// Function macros with two inputs and one output, inputs are preserved.
+// Returns modified first arg.
+// Two input functions are not available without SSSE3. Use the procedure
+// macros below instead.
+
+#define mm128_shufl2r_64( v1, v2 )     _mm_alignr_epi8( v2, v1, 8 )
+#define mm128_shufl2l_64( v1, v2 )     _mm_alignr_epi8( v1, v2, 8 )
+
+#define mm128_shufl2r_32( v1, v2 )     _mm_alignr_epi8( v2, v1, 4 )
+#define mm128_shufl2l_32( v1, v2 )     _mm_alignr_epi8( v1, v2, 4 )
+
+#define mm128_shufl2r_16( v1, v2 )     _mm_alignr_epi8( v2, v1, 2 )
+#define mm128_shufl2l_16( v1, v2 )     _mm_alignr_epi8( v1, v2, 2 )
+// NOTE(review): vrol256_32/16/8 use byte counts 12/14/15; confirm shufl2l.
+#define mm128_shufl2r_8( v1, v2 )      _mm_alignr_epi8( v2, v1, 1 )
+#define mm128_shufl2l_8( v1, v2 )      _mm_alignr_epi8( v1, v2, 1 )
+
+// Procedure macros with 2 inputs and 2 outputs, inputs are destroyed.
+// Returns both modified args in place.
+
+// These macros retain the vrol/vror name for now to avoid
+// confusion with the shufl2r/shufl2l function macros above.
+// These may be renamed to something like shufl2r2 for 2 inputs and
+// 2 outputs, ie SHUFfLe 2 inputs Right with 2 outputs.
+
+#define mm128_vror256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
           v1 = _mm_alignr_epi8( v2, v1, 8 ); \
           v2 = t; \
 } while(0)

-#define mm128_rol256_64( v1, v2 ) \
+#define mm128_vrol256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 8 ); \
           v2 = _mm_alignr_epi8( v2, v1, 8 ); \
           v1 = t; \
 } while(0)

-#define mm128_ror256_32( v1, v2 ) \
+#define mm128_vror256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 4 ); \
           v1 = _mm_alignr_epi8( v2, v1, 4 ); \
           v2 = t; \
 } while(0)

-#define mm128_rol256_32( v1, v2 ) \
+#define mm128_vrol256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 12 ); \
           v2 = _mm_alignr_epi8( v2, v1, 12 ); \
           v1 = t; \
 } while(0)

-#define mm128_ror256_16( v1, v2 ) \
+#define mm128_vror256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 2 ); \
           v1 = _mm_alignr_epi8( v2, v1, 2 ); \
           v2 = t; \
 } while(0)

-#define mm128_rol256_16( v1, v2 ) \
+#define mm128_vrol256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 14 ); \
           v2 = _mm_alignr_epi8( v2, v1, 14 ); \
           v1 = t; \
 } while(0)

-#define mm128_ror256_8( v1, v2 ) \
+#define mm128_vror256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 1 ); \
           v1 = _mm_alignr_epi8( v2, v1, 1 ); \
           v2 = t; \
 } while(0)

-#define mm128_rol256_8( v1, v2 ) \
+#define mm128_vrol256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_alignr_epi8( v1, v2, 15 ); \
           v2 = _mm_alignr_epi8( v2, v1, 15 ); \
@@ -483,7 +614,7 @@ do { \

 #else  // SSE2

-#define mm128_ror256_64( v1, v2 ) \
+#define mm128_vror256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 8 ), \
                              _mm_slli_si128( v2, 8 ) ); \
@@ -492,7 +623,7 @@ do { \
           v1 = t; \
 } while(0)

-#define mm128_rol256_64( v1, v2 ) \
+#define mm128_vrol256_64( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 8 ), \
                              _mm_srli_si128( v2, 8 ) ); \
@@ -501,7 +632,7 @@ do { \
           v1 = t; \
 } while(0)

-#define mm128_ror256_32( v1, v2 ) \
+#define mm128_vror256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 4 ), \
                              _mm_slli_si128( v2, 12 ) ); \
@@ -510,7 +641,7 @@ do { \
           v1 = t; \
 } while(0)

-#define mm128_rol256_32( v1, v2 ) \
+#define mm128_vrol256_32( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 4 ), \
                              _mm_srli_si128( v2, 12 ) ); \
@@ -519,7 +650,7 @@ do { \
           v1 = t; \
 } while(0)

-#define mm128_ror256_16( v1, v2 ) \
+#define mm128_vror256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 2 ), \
                              _mm_slli_si128( v2, 14 ) ); \
@@ -528,7 +659,7 @@ do { \
           v1 = t; \
 } while(0)

-#define mm128_rol256_16( v1, v2 ) \
+#define mm128_vrol256_16( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 2 ), \
                              _mm_srli_si128( v2, 14 ) ); \
@@ -537,7 +668,7 @@ do { \
           v1 = t; \
 } while(0)

-#define mm128_ror256_8( v1, v2 ) \
+#define mm128_vror256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_srli_si128( v1, 1 ), \
                              _mm_slli_si128( v2, 15 ) ); \
@@ -546,7 +677,7 @@ do { \
           v1 = t; \
 } while(0)

-#define mm128_rol256_8( v1, v2 ) \
+#define mm128_vrol256_8( v1, v2 ) \
 do { \
   __m128i t  = _mm_or_si128( _mm_slli_si128( v1, 1 ), \
                              _mm_srli_si128( v2, 15 ) ); \
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -14,13 +14,28 @@
 // is limited because 256 bit vectors are less likely to be used when 512
 // is available.

+// Used instead of casting.
+typedef union
+{
+   __m256i m256;
+   __m128i m128[2];
+   uint64_t u64[4];
+   uint32_t u32[8];
+} __attribute__ ((aligned (32))) m256_ovly;
+
+
 // Move integer to low element of vector, other elements are set to zero.
 #define mm256_mov64_256( i ) _mm256_castsi128_si256( mm128_mov64_128( i ) )
 #define mm256_mov32_256( i ) _mm256_castsi128_si256( mm128_mov32_128( i ) )

 // Move low element of vector to integer.
-#define mm256_mov256_64( v ) mm128_mov128_64( _mm256_castsi256_si128( v ) )
-#define mm256_mov256_32( v ) mm128_mov128_32( _mm256_castsi256_si128( v ) )
+#define u64_mov256_64( v ) u64_mov128_64( _mm256_castsi256_si128( v ) )
+#define u32_mov256_32( v ) u32_mov128_32( _mm256_castsi256_si128( v ) )
+
+// deprecated
+//#define mm256_mov256_64 u64_mov256_64 
+//#define mm256_mov256_32 u32_mov256_32
+

 // concatenate two 128 bit vectors into one 256 bit vector: { hi, lo }
 #define mm256_concat_128( hi, lo ) \
@@ -214,12 +229,41 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )

 #endif

+// Diagonal blending: element n of the result is taken from vector vn.
+
+// Blend 4 64 bit elements from 4 vectors
+#define mm256_diagonal_64( v3, v2, v1, v0 ) \
+  _mm256_blend_epi32( _mm256_blend_epi32( v3, v2, 0x30 ), \
+                      _mm256_blend_epi32( v1, v0, 0x03 ), 0x0f )
+
+// Blend 8 32 bit elements from 8 vectors
+#define mm256_diagonal_32( v7, v6, v5, v4, v3, v2, v1, v0 ) \
+  _mm256_blend_epi32( \
+        _mm256_blend_epi32( \
+               _mm256_blend_epi32( v7, v6, 0x40 ), \
+               _mm256_blend_epi32( v5, v4, 0x10 ), 0x30 ), \
+        _mm256_blend_epi32( \
+               _mm256_blend_epi32( v3, v2, 0x04 ), \
+               _mm256_blend_epi32( v1, v0, 0x01 ), 0x03 ), 0x0f )
+
+// Blend 4 32 bit elements from each 128 bit lane, element n of each
+// lane is taken from vn.
+#define mm256_diagonal128_32( v3, v2, v1, v0 ) \
+    _mm256_blend_epi32( \
+           _mm256_blend_epi32( v3, v2, 0x44 ), \
+           _mm256_blend_epi32( v1, v0, 0x11 ), 0x33 )
+
+
 //
 //           Bit rotations.
 //
-// The only bit shift for more than 64 bits is with __int128.
+// The only bit shift for more than 64 bits is with __int128 which is slow.
 //
 // AVX512 has bit rotate for 256 bit vectors with 64 or 32 bit elements
+//
+// x2 rotates elements in 2 individual vectors in a double buffered
+// optimization for SSE2, does nothing for AVX512 but is there for
+// transparency.


 // compiler doesn't like when a variable is used for the last arg of
@@ -255,6 +299,22 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #define mm256_ror_32    _mm256_ror_epi32
 #define mm256_rol_32    _mm256_rol_epi32

+// x2 rotates update both v0 and v1 in place, same contract as the AVX2
+// versions. do-while(0) makes each macro behave as a single statement.
+#define mm256_rorx2_64( v1, v0, c ) \
+do { v0 = _mm256_ror_epi64( v0, c ); \
+     v1 = _mm256_ror_epi64( v1, c ); } while(0)
+#define mm256_rolx2_64( v1, v0, c ) \
+do { v0 = _mm256_rol_epi64( v0, c ); \
+     v1 = _mm256_rol_epi64( v1, c ); } while(0)
+
+#define mm256_rorx2_32( v1, v0, c ) \
+do { v0 = _mm256_ror_epi32( v0, c ); \
+     v1 = _mm256_ror_epi32( v1, c ); } while(0)
+#define mm256_rolx2_32( v1, v0, c ) \
+do { v0 = _mm256_rol_epi32( v0, c ); \
+     v1 = _mm256_rol_epi32( v1, c ); } while(0)
+
 #else   // AVX2

 #define mm256_ror_64    mm256_ror_var_64 
@@ -262,6 +322,46 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 #define mm256_ror_32    mm256_ror_var_32
 #define mm256_rol_32    mm256_rol_var_32

+// Rotate right the 64 bit elements of v0 & v1 by c bits, in place.
+#define mm256_rorx2_64( v1, v0, c ) do { \
+ __m256i t0 = _mm256_srli_epi64( v0, c ); \
+ __m256i t1 = _mm256_srli_epi64( v1, c ); \
+ v0 = _mm256_slli_epi64( v0, 64-(c) ); \
+ v1 = _mm256_slli_epi64( v1, 64-(c) ); \
+ v0 = _mm256_or_si256( v0, t0 ); \
+ v1 = _mm256_or_si256( v1, t1 ); \
+} while(0)
+
+// Rotate left the 64 bit elements of v0 & v1 by c bits, in place.
+#define mm256_rolx2_64( v1, v0, c ) do { \
+ __m256i t0 = _mm256_slli_epi64( v0, c ); \
+ __m256i t1 = _mm256_slli_epi64( v1, c ); \
+ v0 = _mm256_srli_epi64( v0, 64-(c) ); \
+ v1 = _mm256_srli_epi64( v1, 64-(c) ); \
+ v0 = _mm256_or_si256( v0, t0 ); \
+ v1 = _mm256_or_si256( v1, t1 ); \
+} while(0)
+
+// Rotate right the 32 bit elements of v0 & v1 by c bits, in place.
+#define mm256_rorx2_32( v1, v0, c ) do { \
+ __m256i t0 = _mm256_srli_epi32( v0, c ); \
+ __m256i t1 = _mm256_srli_epi32( v1, c ); \
+ v0 = _mm256_slli_epi32( v0, 32-(c) ); \
+ v1 = _mm256_slli_epi32( v1, 32-(c) ); \
+ v0 = _mm256_or_si256( v0, t0 ); \
+ v1 = _mm256_or_si256( v1, t1 ); \
+} while(0)
+
+// Rotate left the 32 bit elements of v0 & v1 by c bits, in place.
+#define mm256_rolx2_32( v1, v0, c ) do { \
+ __m256i t0 = _mm256_slli_epi32( v0, c ); \
+ __m256i t1 = _mm256_slli_epi32( v1, c ); \
+ v0 = _mm256_srli_epi32( v0, 32-(c) ); \
+ v1 = _mm256_srli_epi32( v1, 32-(c) ); \
+ v0 = _mm256_or_si256( v0, t0 ); \
+ v1 = _mm256_or_si256( v1, t1 ); \
+} while(0)
+
 #endif     // AVX512 else AVX2

 #define  mm256_ror_16( v, c ) \
@@ -276,58 +376,45 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 //
 // Rotate elements accross all lanes.

-#if defined(__AVX512VL__)
-
-static inline __m256i mm256_swap_128( const __m256i v )
-{ return _mm256_alignr_epi64( v, v, 2 ); }
-
-static inline __m256i mm256_ror_1x64( const __m256i v )
-{ return _mm256_alignr_epi64( v, v, 1 ); }
-
-static inline __m256i mm256_rol_1x64( const __m256i v )
-{ return _mm256_alignr_epi64( v, v, 3 ); }
-
-static inline __m256i mm256_ror_1x32( const __m256i v )
-{ return _mm256_alignr_epi32( v, v, 1 ); }
-
-static inline __m256i mm256_rol_1x32( const __m256i v )
-{ return _mm256_alignr_epi32( v, v, 7 ); }
-
-#else   // AVX2
-
 // Swap 128 bit elements in 256 bit vector.
 #define mm256_swap_128( v )     _mm256_permute4x64_epi64( v, 0x4e )
+#define mm256_shuflr_128 mm256_swap_128
+#define mm256_shufll_128 mm256_swap_128

 // Rotate 256 bit vector by one 64 bit element
-#define mm256_ror_1x64( v )     _mm256_permute4x64_epi64( v, 0x39 )
-#define mm256_rol_1x64( v )     _mm256_permute4x64_epi64( v, 0x93 )
+#define mm256_shuflr_64( v )    _mm256_permute4x64_epi64( v, 0x39 )
+
+#define mm256_shufll_64( v )    _mm256_permute4x64_epi64( v, 0x93 )

 // Rotate 256 bit vector by one 32 bit element.
-#define mm256_ror_1x32( v ) \
+#define mm256_shuflr_32( v ) \
    _mm256_permutevar8x32_epi32( v, \
                     m256_const_64( 0x0000000000000007, 0x0000000600000005, \
-                                    0x0000000400000003, 0x0000000200000001 )
+                                    0x0000000400000003, 0x0000000200000001 ) )

-#define mm256_rol_1x32( v ) \
+#define mm256_shufll_32( v ) \
    _mm256_permutevar8x32_epi32( v, \
                     m256_const_64( 0x0000000600000005,  0x0000000400000003, \
-                                    0x0000000200000001,  0x0000000000000007 )
+                                    0x0000000200000001,  0x0000000000000007 ) )

       
-#endif    // AVX512 else AVX2
-
 //
 // Rotate elements within each 128 bit lane of 256 bit vector.

 #define mm256_swap128_64( v )  _mm256_shuffle_epi32( v, 0x4e )
-#define mm256_ror128_32( v )   _mm256_shuffle_epi32( v, 0x39 )
-#define mm256_rol128_32( v )   _mm256_shuffle_epi32( v, 0x93 )
+#define mm256_shuflr128_64 mm256_swap128_64
+#define mm256_shufll128_64 mm256_swap128_64

-static inline __m256i mm256_ror128_x8( const __m256i v, const int c )
+#define mm256_shuflr128_32( v )   _mm256_shuffle_epi32( v, 0x39 )
+#define mm256_shufll128_32( v )   _mm256_shuffle_epi32( v, 0x93 )
+
+static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 { return _mm256_alignr_epi8( v, v, c ); }

 // Swap 32 bit elements in each 64 bit lane.
 #define mm256_swap64_32( v )   _mm256_shuffle_epi32( v, 0xb1 )
+#define mm256_shuflr64_32 mm256_swap64_32
+#define mm256_shufll64_32 mm256_swap64_32

 //
 // Swap bytes in vector elements, endian bswap.
@@ -387,19 +474,21 @@ static inline __m256i mm256_ror128_x8( const __m256i v, const int c )
 //  _mm256_alignr_epi 64/32 are only available with AVX512 but AVX512 also
 //  makes these macros unnecessary.

+// continue using vror/vrol notation for now to avoid confusion with
+// shufl2r/shufl2l macro functions available with AVX512.
 #define mm256_swap512_256( v1, v2 ) \
   v1 = _mm256_xor_si256( v1, v2 ); \
   v2 = _mm256_xor_si256( v1, v2 ); \
   v1 = _mm256_xor_si256( v1, v2 );

-#define mm256_ror512_128( v1, v2 ) \
+#define mm256_vror512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v1 = _mm256_permute2x128( v2, v1, 0x21 ); \
   v2 = t; \
 } while(0)

-#define mm256_rol512_128( v1, v2 ) \
+#define mm256_vrol512_128( v1, v2 ) \
 do { \
   __m256i t = _mm256_permute2x128( v1, v2, 0x03 ); \
   v2 = _mm256_permute2x128( v2, v1, 0x21 ); \
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -74,13 +74,22 @@
 // __AVX512VBMI__  __AVX512VAES__
 //

+// Used instead of casting.
+typedef union
+{
+   __m512i m512;
+   __m128i m128[4];
+   uint32_t u32[16];
+   uint64_t u64[8];
+} __attribute__ ((aligned (64))) m512_ovly;
+
 // Move integer to/from element 0 of vector.

 #define mm512_mov64_512( n ) _mm512_castsi128_si512( mm128_mov64_128( n ) )
 #define mm512_mov32_512( n ) _mm512_castsi128_si512( mm128_mov32_128( n ) )

-#define mm512_mov256_64( a ) mm128_mov128_64( _mm256_castsi512_si128( a ) )
-#define mm512_mov256_32( a ) mm128_mov128_32( _mm256_castsi512_si128( a ) )
+#define u64_mov512_64( a ) u64_mov128_64( _mm512_castsi512_si128( a ) )
+#define u32_mov512_32( a ) u32_mov128_32( _mm512_castsi512_si128( a ) )

 // A simple 128 bit permute, using function instead of macro avoids
 // problems if the v arg passed as an expression.
@@ -91,6 +100,10 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
 #define mm512_concat_256( hi, lo ) \
   _mm512_inserti64x4( _mm512_castsi256_si512( lo ), hi, 1 )

+#define m512_const_128( v3, v2, v1, v0 ) \
+   mm512_concat_256( mm256_concat_128( v3, v2 ), \
+                     mm256_concat_128( v1, v0 ) )
+
 // Equivalent of set, assign 64 bit integers to respective 64 bit elements.
 // Use stack memory overlay
 static inline __m512i m512_const_64( const uint64_t i7, const uint64_t i6,
@@ -225,7 +238,7 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )

 //
 // Ternary logic uses 8 bit truth table to define any 3 input logical
-// operation using any number or combinations of AND, OR XOR, NOT.
+// expression using any number or combinations of AND, OR, XOR, NOT.

 // a ^ b ^ c
 #define mm512_xor3( a, b, c ) \
@@ -251,11 +264,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
 #define mm512_andxor( a, b, c ) \
   _mm512_ternarylogic_epi64( a, b, c, 0x60 )

-// a ^ ( b & c )
+// a ^ ( b | c )
 #define mm512_xoror( a, b, c ) \
   _mm512_ternarylogic_epi64( a, b, c, 0x1e )

-// a ^ ( ~b & c )     [ xor( a, andnot( b, c ) ]
+// a ^ ( ~b & c )     xor( a, andnot( b, c ) )
 #define mm512_xorandnot( a, b, c ) \
  _mm512_ternarylogic_epi64( a, b, c, 0xd2 ) 

@@ -265,11 +278,11 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )

 // Some 2 input operations that don't have their own instruction mnemonic.

-// ~( a | b )
+// ~( a | b ),  (~a) & (~b)
 #define mm512_nor( a, b ) \
   _mm512_ternarylogic_epi64( a, b, b, 0x01  )

-// ~( a ^ b ), same as (~a) ^ b
+// ~( a ^ b ),  (~a) ^ b
 #define mm512_xnor( a, b ) \
   _mm512_ternarylogic_epi64( a, b, b, 0x81  )

@@ -278,6 +291,27 @@ static inline void memcpy_512( __m512i *dst, const __m512i *src, const int n )
   _mm512_ternarylogic_epi64( a, b, b, 0xef  )


+// Diagonal blending: element n of the result is taken from vector vn.
+// Blend 8 64 bit elements from 8 vectors
+#define mm512_diagonal_64( v7, v6, v5, v4, v3, v2, v1, v0 ) \
+  _mm512_mask_blend_epi64( 0x0f, \
+        _mm512_mask_blend_epi64( 0x30, \
+               _mm512_mask_blend_epi64( 0x40, v7, v6 ), \
+               _mm512_mask_blend_epi64( 0x10, v5, v4 ) ), \
+        _mm512_mask_blend_epi64( 0x03, \
+               _mm512_mask_blend_epi64( 0x04, v3, v2 ), \
+               _mm512_mask_blend_epi64( 0x01, v1, v0 ) ) )
+
+
+// Blend 4 32 bit elements from each 128 bit lane.
+#define mm512_diagonal128_32( v3, v2, v1, v0 ) \
+    _mm512_mask_blend_epi32( 0x3333, \
+           _mm512_mask_blend_epi32( 0x4444, v3, v2 ), \
+           _mm512_mask_blend_epi32( 0x1111, v1, v0 ) )
+
+
+
+
 // Bit rotations.

 // AVX512F has built-in fixed and variable bit rotation for 64 & 32 bit
@@ -395,59 +429,95 @@ static inline __m512i mm512_rol_16( const __m512i v, const int c )
  casti_m512i( d, 7 ) = _mm512_shuffle_epi8( casti_m512i( s, 7 ), ctl ); \
 } while(0)

-
 //
-// Rotate elements in 512 bit vector.
+// Shift with zero fill & shuffle-rotate elements in 512 bit vector.
+//
+
+// Rename plan: change ror to vror for Vector ROtate Right,
+// and rol to vrol for Vector ROtate Left, not to be confused with
+// variable rotate rorv, rolv.
+// Plan changed, use shuflr & shufll instead symbolizing a shuffle-rotate
+// operation. 1xNN notation is also removed and replaced with simpler NN.
+// Swap will still have its own mnemonic and will be aliased as both
+// left and right shuffles.
+
+// Shift elements right or left in 512 bit vector, filling with zeros.
+// Multiple element shifts can be combined into a single larger
+// element shift.
+
+#define mm512_shiftr_256( v ) \
+  _mm512_alignr_epi64( _mm512_setzero_si512(), v, 4 )
+#define mm512_shiftl_256( v ) \
+  _mm512_alignr_epi64( v, _mm512_setzero_si512(), 4 )
+#define mm512_shiftr_128( v ) \
+  _mm512_alignr_epi64( _mm512_setzero_si512(), v, 2 )
+#define mm512_shiftl_128( v ) \
+  _mm512_alignr_epi64( v, _mm512_setzero_si512(), 6 )
+
+#define mm512_shiftr_64( v ) \
+  _mm512_alignr_epi64( _mm512_setzero_si512(), v, 1 )
+#define mm512_shiftl_64( v ) \
+  _mm512_alignr_epi64( v, _mm512_setzero_si512(), 7 )
+
+#define mm512_shiftr_32( v ) \
+  _mm512_alignr_epi32( _mm512_setzero_si512(), v, 1 )
+#define mm512_shiftl_32( v ) \
+  _mm512_alignr_epi32( v, _mm512_setzero_si512(), 15 )
+
+// Shuffle-rotate elements left or right in 512 bit vector.

 static inline __m512i mm512_swap_256( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 4 ); }
+#define mm512_shuflr_256 mm512_swap_256
+#define mm512_shufll_256 mm512_swap_256

-static inline __m512i mm512_ror_1x128( const __m512i v )
+static inline __m512i mm512_shuflr_128( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 2 ); }

-static inline __m512i mm512_rol_1x128( const __m512i v )
+static inline __m512i mm512_shufll_128( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 6 ); }

-static inline __m512i mm512_ror_1x64( const __m512i v )
+static inline __m512i mm512_shuflr_64( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 1 ); }

-static inline __m512i mm512_rol_1x64( const __m512i v )
+static inline __m512i mm512_shufll_64( const __m512i v )
 { return _mm512_alignr_epi64( v, v, 7 ); }

-static inline __m512i mm512_ror_1x32( const __m512i v )
+static inline __m512i mm512_shuflr_32( const __m512i v )
 { return _mm512_alignr_epi32( v, v, 1 ); }

-static inline __m512i mm512_rol_1x32( const __m512i v )
+static inline __m512i mm512_shufll_32( const __m512i v )
 { return _mm512_alignr_epi32( v, v, 15 ); }

-static inline __m512i mm512_ror_x64( const __m512i v, const int n )
+// Generic rotate right by n elements. NOTE(review): shufll_x32 also rotates right.
+static inline __m512i mm512_shuflr_x64( const __m512i v, const int n )
 { return _mm512_alignr_epi64( v, v, n ); }

-static inline __m512i mm512_ror_x32( const __m512i v, const int n )
+static inline __m512i mm512_shufll_x32( const __m512i v, const int n )
 { return _mm512_alignr_epi32( v, v, n ); }

-#define mm512_ror_1x16( v ) \
+#define mm512_shuflr_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x0000001F001E001D, 0x001C001B001A0019, \
                       0X0018001700160015, 0X0014001300120011, \
                       0X0010000F000E000D, 0X000C000B000A0009, \
                       0X0008000700060005, 0X0004000300020001 ), v )

-#define mm512_rol_1x16( v ) \
+#define mm512_shufll_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                       0x001E001D001C001B, 0x001A001900180017, \
                       0X0016001500140013, 0X001200110010000F, \
                       0X000E000D000C000B, 0X000A000900080007, \
                       0X0006000500040003, 0X000200010000001F ), v )

-#define mm512_ror_1x8( v ) \
+#define mm512_shuflr_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                       0x003F3E3D3C3B3A39, 0x3837363534333231, \
                       0x302F2E2D2C2B2A29, 0x2827262524232221, \
                       0x201F1E1D1C1B1A19. 0x1817161514131211, \
                       0x100F0E0D0C0B0A09, 0x0807060504030201 ) )

-#define mm512_rol_1x8( v ) \
+#define mm512_shufll_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                       0x3E3D3C3B3A393837, 0x363534333231302F. \
                       0x2E2D2C2B2A292827, 0x262524232221201F, \
@@ -456,51 +526,55 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n )

 //
 // Rotate elements within 256 bit lanes of 512 bit vector.
+// 128 bit lane shift is handled by bslli bsrli.

 // Swap hi & lo 128 bits in each 256 bit lane
 #define mm512_swap256_128( v )   _mm512_permutex_epi64( v, 0x4e )
+#define mm512_shuflr256_128 mm512_swap256_128
+#define mm512_shufll256_128 mm512_swap256_128

 // Rotate 256 bit lanes by one 64 bit element
-#define mm512_ror256_64( v )     _mm512_permutex_epi64( v, 0x39 )
-#define mm512_rol256_64( v )     _mm512_permutex_epi64( v, 0x93 )
+#define mm512_shuflr256_64( v )     _mm512_permutex_epi64( v, 0x39 )
+
+#define mm512_shufll256_64( v )     _mm512_permutex_epi64( v, 0x93 )

 // Rotate 256 bit lanes by one 32 bit element
-#define mm512_ror256_32( v ) \
+#define mm512_shuflr256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
                      0x000000080000000f, 0x0000000e0000000d, \
                      0x0000000c0000000b, 0x0000000a00000009, \
                      0x0000000000000007, 0x0000000600000005, \
                      0x0000000400000003, 0x0000000200000001 ), v )

-#define mm512_rol256_32( v ) \
+#define mm512_shufll256_32( v ) \
   _mm512_permutexvar_epi32( m512_const_64( \
                      0x0000000e0000000d, 0x0000000c0000000b, \
                      0x0000000a00000009, 0x000000080000000f, \
                      0x0000000600000005, 0x0000000400000003, \
                      0x0000000200000001, 0x0000000000000007 ), v )

-#define mm512_ror256_16( v ) \
+#define mm512_shuflr256_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                     0x00100001001e001d, 0x001c001b001a0019, \
                     0x0018001700160015, 0x0014001300120011, \
                     0x0000000f000e000d, 0x000c000b000a0009, \
                     0x0008000700060005, 0x0004000300020001 ), v )

-#define mm512_rol256_16( v ) \
+#define mm512_shufll256_16( v ) \
   _mm512_permutexvar_epi16( m512_const_64( \
                     0x001e001d001c001b, 0x001a001900180017, \
                     0x0016001500140013, 0x001200110010001f, \
                     0x000e000d000c000b, 0x000a000900080007, \
                     0x0006000500040003, 0x000200010000000f ), v )

-#define mm512_ror256_8( v ) \
+#define mm512_shuflr256_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                     0x203f3e3d3c3b3a39, 0x3837363534333231, \
                     0x302f2e2d2c2b2a29, 0x2827262524232221, \
                     0x001f1e1d1c1b1a19, 0x1817161514131211, \
                     0x100f0e0d0c0b0a09, 0x0807060504030201 ) )

-#define mm512_rol256_8( v ) \
+#define mm512_shufll256_8( v ) \
   _mm512_shuffle_epi8( v, m512_const_64( \
                     0x3e3d3c3b3a393837, 0x363534333231302f, \
                     0x2e2d2c2b2a292827, 0x262524232221203f, \
@@ -508,82 +582,120 @@ static inline __m512i mm512_ror_x32( const __m512i v, const int n )
                     0x0e0d0c0b0a090807, 0x060504030201001f ) )

 //
-// Rotate elements within 128 bit lanes of 512 bit vector.
-
+// Shuffle-rotate elements within 128 bit lanes of 512 bit vector.
+ 
 // Swap 64 bits in each 128 bit lane
 #define mm512_swap128_64( v )   _mm512_shuffle_epi32( v, 0x4e )
+#define mm512_shuflr128_64  mm512_swap128_64
+#define mm512_shufll128_64  mm512_swap128_64

 // Rotate 128 bit lanes by one 32 bit element
-#define mm512_ror128_32( v )    _mm512_shuffle_epi32( v, 0x39 )
-#define mm512_rol128_32( v )    _mm512_shuffle_epi32( v, 0x93 )
+#define mm512_shuflr128_32( v )    _mm512_shuffle_epi32( v, 0x39 )
+#define mm512_shufll128_32( v )    _mm512_shuffle_epi32( v, 0x93 )

-// Rotate right 128 bit lanes by c bytes
-static inline __m512i mm512_ror128_x8( const __m512i v, const int c )
+// Rotate right 128 bit lanes by c bytes, versatile and just as fast
+static inline __m512i mm512_shuflr128_8( const __m512i v, const int c )
 {  return _mm512_alignr_epi8( v, v, c ); }

-// Swap 32 bits in each 64 bit lane.
+// Swap 32 bits in each 64 bit lane. Can be done with rotate instruction
+// but only with AVX512. Shuffle is just as fast and available with AVX2
+// & SSE2.
 #define mm512_swap64_32( v )    _mm512_shuffle_epi32( v, 0xb1 )
+#define mm512_shuflr64_32 mm512_swap64_32
+#define mm512_shufll64_32 mm512_swap64_32

-
+// Need good way to distinguish 1 input shuffles, 2 input shuffle functions,
+// and 2 input 2 output shuffle macros.
 //
-//  Rotate elements from 2 512 bit vectors in place, source arguments
+// shuflr is 1 input
+// shufl2r is 2 input ...
+// Drop macros? They can easily be rebuilt using shufl2 functions
+
+// add shuflr shufll functions performing rotate, returning first arg
+// They're faster than doing both, when both not needed.
+
+// Shuffle concatenated { v1, v2 } right or left by 256 bits and return
+// rotated v1.
+// Visually confusing for shufl2r because of arg order. First arg is always
+// the target for modification, either update by reference or by function
+// return.
+#define mm512_shufl2r_256( v1, v2 )    _mm512_alignr_epi64( v2, v1, 4 )
+#define mm512_shufl2l_256( v1, v2 )    _mm512_alignr_epi64( v1, v2, 4 )
+
+#define mm512_shufl2r_128( v1, v2 )    _mm512_alignr_epi64( v2, v1, 2 )
+#define mm512_shufl2l_128( v1, v2 )    _mm512_alignr_epi64( v1, v2, 2 )
+
+#define mm512_shufl2r_64( v1, v2 )     _mm512_alignr_epi64( v2, v1, 1 )
+#define mm512_shufl2l_64( v1, v2 )     _mm512_alignr_epi64( v1, v2, 1 )
+
+#define mm512_shufl2r_32( v1, v2 )     _mm512_alignr_epi32( v2, v1, 1 )
+#define mm512_shufl2l_32( v1, v2 )     _mm512_alignr_epi32( v1, v2, 1 )
+
+// Rotate elements from 2 512 bit vectors in place, source arguments
 //  are overwritten.

 #define mm512_swap1024_512( v1, v2 ) \
   v1 = _mm512_xor_si512( v1, v2 ); \
   v2 = _mm512_xor_si512( v1, v2 ); \
   v1 = _mm512_xor_si512( v1, v2 );
+// Rotating two 512 bit vectors by 512 bits in either direction is a swap.
+// No line continuation here: a trailing backslash would splice the second
+// #define into the first macro's replacement text.
+#define mm512_shufl2l_512 mm512_swap1024_512
+#define mm512_shufl2r_512 mm512_swap1024_512

-#define mm512_ror1024_256( v1, v2 ) \
+// Deprecated, will be removed. Use shufl2 functions instead. Leave them as is
+// for now.
+//  Rotate elements from 2 512 bit vectors in place, both source arguments
+//  are updated.
+
+#define mm512_vror1024_256( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v2 = t; \
 } while(0)

-#define mm512_rol1024_256( v1, v2 ) \
+#define mm512_vrol1024_256( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 4 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 4 ); \
   v1 = t; \
 } while(0)

-#define mm512_ror1024_128( v1, v2 ) \
+#define mm512_vror1024_128( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 2 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 2 ); \
   v2 = t; \
 } while(0)

-#define mm512_rol1024_128( v1, v2 ) \
+#define mm512_vrol1024_128( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 6 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 6 ); \
   v1 = t; \
 } while(0)

-#define mm512_ror1024_64( v1, v2 ) \
+#define mm512_vror1024_64( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi64( v2, v1, 1 ); \
   v2 = t; \
 } while(0)

-#define mm512_rol1024_64( v1, v2 ) \
+#define mm512_vrol1024_64( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi64( v1, v2, 7 ); \
   v2 = _mm512_alignr_epi64( v2, v1, 7 ); \
   v1 = t; \
 } while(0)

-#define mm512_ror1024_32( v1, v2 ) \
+#define mm512_vror1024_32( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 1 ); \
   v1 = _mm512_alignr_epi32( v2, v1, 1 ); \
   v2 = t; \
 } while(0)

-#define mm512_rol1024_32( v1, v2 ) \
+#define mm512_vrol1024_32( v1, v2 ) \
 do { \
   __m512i t = _mm512_alignr_epi32( v1, v2, 15 ); \
   v2 = _mm512_alignr_epi32( v2, v1, 15 ); \
--- a/simd-utils/simd-64.h
+++ b/simd-utils/simd-64.h
@@ -68,13 +68,13 @@
 // rotation.

 // Swap hi & lo 32 bits.
-#define mm64_swap_32( a )     _mm_shuffle_pi16( a, 0x4e )
+#define mm64_swap_32( a )      _mm_shuffle_pi16( a, 0x4e )

-#define mm64_ror64_1x16( a )  _mm_shuffle_pi16( a, 0x39 ) 
-#define mm64_rol64_1x16( a )  _mm_shuffle_pi16( a, 0x93 ) 
+// Rotate the four 16 bit elements right by one position.
+#define mm64_shuflr_16( a )  _mm_shuffle_pi16( a, 0x39 )
+// Misspelled legacy name, kept so existing callers still build.
+#define mm64_shulfr_16       mm64_shuflr_16
+#define mm64_shufll_16( a )  _mm_shuffle_pi16( a, 0x93 ) 

 // Swap hi & lo 16 bits of each 32 bit element
-#define mm64_swap32_16( a )  _mm_shuffle_pi16( a, 0xb1 )
+#define mm64_swap32_16( a )    _mm_shuffle_pi16( a, 0xb1 )

 #if defined(__SSSE3__)

@@ -86,7 +86,7 @@
    _mm_shuffle_pi8( v, (__m64)0x0607040502030001 );

 // Rotate right by c bytes
-static inline __m64 mm64_ror_x8( __m64 v, const int c )
+static inline __m64 mm64_vror_x8( __m64 v, const int c )
 { return _mm_alignr_pi8( v, v, c ); }

 #else
--- a/simd-utils/simd-int.h
+++ b/simd-utils/simd-int.h
@@ -5,10 +5,19 @@
 #define bswap_64( a ) __builtin_bswap64( a )
 #define bswap_32( a ) __builtin_bswap32( a )

-// safe division, integer or floating point
+// Safe division, integer or floating point. For floating point it only
+// guards against a divisor that compares exactly equal to zero.
+// Returns safe_result if division by zero.
 #define safe_div( dividend, divisor, safe_result ) \
   ( (divisor) == 0 ? safe_result : ( (dividend) / (divisor) )  )

+// Aliases with familiar names for built in bit rotate instructions
+#define rol64( a, n )   _lrotl( a, n )  
+#define ror64( a, n )   _lrotr( a, n )
+#define rol32( a, n )   _rotl( a, n )
+#define ror32( a, n )   _rotr( a, n )
+#define rol16( a, n )   _rotwl( a, n )
+#define ror16( a, n )   _rotwr( a, n )

 ///////////////////////////////////////
 // 
@@ -29,12 +38,14 @@
 // __m256i v256 = _mm256_set_m128i( (__m128i)my_int128, (__m128i)my_int128 );
 // my_int128 = (uint128_t)_mm256_extracti128_si256( v256, 1 );

+// obsolete test
 // Compiler check for __int128 support
 // Configure also has a test for int128.
 #if ( __GNUC__ > 4 ) || ( ( __GNUC__ == 4 ) && ( __GNUC_MINOR__ >= 8 ) )
  #define GCC_INT128 1
 #endif

+// obsolete test
 #if !defined(GCC_INT128)
  #warning "__int128 not supported, requires GCC-4.8 or newer."
 #endif