v3.23.0

2026-07-14 02:46:49 +00:00 · 2023-08-30 20:15:48 -04:00
parent 57a6b7b58b
commit 4378d2f841
72 changed files with 10184 additions and 2182 deletions
--- a/12
+++ b/12
@@ -65,9 +65,19 @@ If not what makes it happen or not happen?
 Change Log
 ----------

+v3.23.0
+
+#398: Prevent GBT fallback to Getwork on network error.
+#398: Prevent excessive logging while conditional mining is paused during solo mining.
+Fix a false start if stratum doesn't immediately send a new job after connecting.
+Tweak diagonal shuffle in Blake2b & Blake256 1-way SIMD to reduce latency.
+CPUID support for AVX10.
+Initial changes to AVX2 targeted code in preparation for AVX10.
+Code cleanup and miscellaneous small improvements.
+
 v3.22.3

-Data interleaving and byte swap optimizations iwith AVX2, AVX512 & AVX512VBMI.
+Data interleaving and byte swap optimizations with AVX2, AVX512 & AVX512VBMI.
 Faster Luffa with AVX2 & AVX512.
 Other small optimizations.
 Some code cleanup.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -171,7 +171,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,
         }
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -227,7 +227,7 @@ int scanhash_8way_64in_32out( struct work *work, uint32_t max_nonce,
         }
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -94,10 +94,13 @@ typedef  uint32_t set_t;
 #define SSE42_OPT        4
 #define AVX_OPT          8   // Sandybridge
 #define AVX2_OPT      0x10   // Haswell, Zen1
-#define SHA_OPT       0x20   // Zen1, Icelake (sha256)
-#define AVX512_OPT    0x40   // Skylake-X (AVX512[F,VL,DQ,BW])
-#define VAES_OPT      0x80   // Icelake (VAES & AVX512)
+#define SHA_OPT       0x20   // Zen1, Icelake (deprecated)
+#define AVX512_OPT    0x40   // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW])
+#define VAES_OPT      0x80   // Icelake, Zen3

+// AVX10 does not have explicit algo features:
+//  AVX10_512 is compatible with AVX512 + VAES
+//  AVX10_256 is compatible with AVX2 + VAES

 // return set containing all elements from sets a & b
 inline set_t set_union ( set_t a, set_t b ) { return a | b; }
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -308,7 +308,52 @@ static const sph_u32 CS[16] = {
 /////////////////////////////////////////
 //
 // Blake-256 1 way SIMD
+// Only used for prehash; otherwise 4way is used with SSE2.

+// Diagonal shuffle rotates V0, V3 & V2 and leaves V1 in place, reducing latency caused by dependencies on V1.
+#define BLAKE256_ROUND( r ) \
+{ \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
+                                          CSx( r, 5 ) ^ Mx( r, 4 ), \
+                                          CSx( r, 3 ) ^ Mx( r, 2 ), \
+                                          CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
+   V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
+                                          CSx( r, 4 ) ^ Mx( r, 5 ), \
+                                          CSx( r, 2 ) ^ Mx( r, 3 ), \
+                                          CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
+   V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
+   V0 = mm128_shufll_32( V0 ); \
+   V3 = mm128_swap_64( V3 ); \
+   V2 = mm128_shuflr_32( V2 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, D ) ^ Mx( r, C ), \
+                                          CSx( r, B ) ^ Mx( r, A ), \
+                                          CSx( r, 9 ) ^ Mx( r, 8 ), \
+                                          CSx( r, F ) ^ Mx( r, E ) ) ) ); \
+   V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
+   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
+                           _mm_set_epi32( CSx( r, C ) ^ Mx( r, D ), \
+                                          CSx( r, A ) ^ Mx( r, B ), \
+                                          CSx( r, 8 ) ^ Mx( r, 9 ), \
+                                          CSx( r, E ) ^ Mx( r, F ) ) ) ); \
+   V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
+   V2 = _mm_add_epi32( V2, V3 ); \
+   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
+   V0 = mm128_shuflr_32( V0 ); \
+   V3 = mm128_swap_64( V3 ); \
+   V2 = mm128_shufll_32( V2 ); \
+}
+
+/*
 #define BLAKE256_ROUND( r ) \
 { \
   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
@@ -350,6 +395,7 @@ static const sph_u32 CS[16] = {
   V2 = mm128_swap_64( V2 ); \
   V1 = mm128_shufll_32( V1 ); \
 }
+*/

 void blake256_transform_le( uint32_t *H, const uint32_t *buf,
                            const uint32_t T0, const uint32_t T1 )
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -252,14 +252,14 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last )
   v[ 5] = ctx->h[5];
   v[ 6] = ctx->h[6];
   v[ 7] = ctx->h[7];
-   v[ 8] = m512_const1_64( 0x6A09E667F3BCC908 );
-   v[ 9] = m512_const1_64( 0xBB67AE8584CAA73B );
-   v[10] = m512_const1_64( 0x3C6EF372FE94F82B );
-   v[11] = m512_const1_64( 0xA54FF53A5F1D36F1 );
-   v[12] = m512_const1_64( 0x510E527FADE682D1 );
-   v[13] = m512_const1_64( 0x9B05688C2B3E6C1F );
-   v[14] = m512_const1_64( 0x1F83D9ABFB41BD6B );
-   v[15] = m512_const1_64( 0x5BE0CD19137E2179 );
+   v[ 8] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
+   v[ 9] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
+   v[10] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
+   v[11] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
+   v[12] = _mm512_set1_epi64( 0x510E527FADE682D1 );
+   v[13] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
+   v[14] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
+   v[15] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

   v[12] = _mm512_xor_si512( v[12], _mm512_set1_epi64( ctx->t[0] ) );
   v[13] = _mm512_xor_si512( v[13], _mm512_set1_epi64( ctx->t[1] ) );
@@ -310,16 +310,16 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx )
 {
   size_t i;

-   ctx->h[0] = m512_const1_64( 0x6A09E667F3BCC908 );
-   ctx->h[1] = m512_const1_64( 0xBB67AE8584CAA73B );
-   ctx->h[2] = m512_const1_64( 0x3C6EF372FE94F82B );
-   ctx->h[3] = m512_const1_64( 0xA54FF53A5F1D36F1 );
-   ctx->h[4] = m512_const1_64( 0x510E527FADE682D1 );
-   ctx->h[5] = m512_const1_64( 0x9B05688C2B3E6C1F );
-   ctx->h[6] = m512_const1_64( 0x1F83D9ABFB41BD6B );
-   ctx->h[7] = m512_const1_64( 0x5BE0CD19137E2179 );
+   ctx->h[0] = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
+   ctx->h[1] = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
+   ctx->h[2] = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
+   ctx->h[3] = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
+   ctx->h[4] = _mm512_set1_epi64( 0x510E527FADE682D1 );
+   ctx->h[5] = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
+   ctx->h[6] = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
+   ctx->h[7] = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

-   ctx->h[0] = _mm512_xor_si512( ctx->h[0], m512_const1_64( 0x01010020 ) );
+   ctx->h[0] = _mm512_xor_si512( ctx->h[0], _mm512_set1_epi64( 0x01010020 ) );

   ctx->t[0] = 0;
   ctx->t[1] = 0;
@@ -419,14 +419,14 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
   v[ 5] = ctx->h[5];
   v[ 6] = ctx->h[6];
   v[ 7] = ctx->h[7];
-   v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
-   v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
-   v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
-   v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
-   v[12] = m256_const1_64( 0x510E527FADE682D1 );
-   v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
-   v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
-   v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
+   v[ 8] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
+   v[ 9] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
+   v[10] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
+   v[11] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
+   v[12] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
+   v[13] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
+   v[14] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
+   v[15] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );

   v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
   v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
@@ -477,16 +477,16 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx )
 {
 	size_t i;

-   ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
-   ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
-   ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
-   ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
-   ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
-   ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
-   ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
-   ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
+   ctx->h[0] = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
+   ctx->h[1] = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
+   ctx->h[2] = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
+   ctx->h[3] = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
+   ctx->h[4] = _mm256_set1_epi64x( 0x510E527FADE682D1 );
+   ctx->h[5] = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
+   ctx->h[6] = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
+   ctx->h[7] = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );

-   ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
+   ctx->h[0] = _mm256_xor_si256( ctx->h[0], _mm256_set1_epi64x( 0x01010020 ) );

 	ctx->t[0] = 0;
 	ctx->t[1] = 0;
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -62,14 +62,14 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )

   memset( S, 0, sizeof( blake2s_4way_state ) );

-   S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
-   S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
-   S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
-   S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
-   S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
-   S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
-   S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
-   S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
+   S->h[0] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
+   S->h[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
+   S->h[4] = _mm_set1_epi64x( 0x510E527F510E527FULL );
+   S->h[5] = _mm_set1_epi64x( 0x9B05688C9B05688CULL );
+   S->h[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL );
   
 //   for( int i = 0; i < 8; ++i )
 //      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
@@ -90,18 +90,18 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
   memcpy_128( m, block, 16 );
   memcpy_128( v, S->h, 8 );

-   v[ 8] = m128_const1_64( 0x6A09E6676A09E667ULL );
-   v[ 9] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
-   v[10] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
-   v[11] = m128_const1_64( 0xA54FF53AA54FF53AULL );
+   v[ 8] = _mm_set1_epi64x( 0x6A09E6676A09E667ULL );
+   v[ 9] = _mm_set1_epi64x( 0xBB67AE85BB67AE85ULL );
+   v[10] = _mm_set1_epi64x( 0x3C6EF3723C6EF372ULL );
+   v[11] = _mm_set1_epi64x( 0xA54FF53AA54FF53AULL );
   v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
-                          m128_const1_64( 0x510E527F510E527FULL ) );
+                          _mm_set1_epi64x( 0x510E527F510E527FULL ) );
   v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
-                          m128_const1_64( 0x9B05688C9B05688CULL ) );
+                          _mm_set1_epi64x( 0x9B05688C9B05688CULL ) );
   v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
-                          m128_const1_64( 0x1F83D9AB1F83D9ABULL ) );
+                          _mm_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );
   v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
-                          m128_const1_64( 0x5BE0CD195BE0CD19ULL ) );
+                          _mm_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );

 #define G4W( sigma0, sigma1, a, b, c, d ) \
 do { \
@@ -269,21 +269,21 @@ int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
   memcpy_256( m, block, 16 );
   memcpy_256( v, S->h, 8 );

-   v[ 8] = m256_const1_64( 0x6A09E6676A09E667ULL );
-   v[ 9] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
-   v[10] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
-   v[11] = m256_const1_64( 0xA54FF53AA54FF53AULL );
+   v[ 8] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
+   v[ 9] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
+   v[10] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
+   v[11] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
   v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
-                          m256_const1_64( 0x510E527F510E527FULL ) );
+                          _mm256_set1_epi64x( 0x510E527F510E527FULL ) );

   v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
-                          m256_const1_64( 0x9B05688C9B05688CULL ) );
+                          _mm256_set1_epi64x( 0x9B05688C9B05688CULL ) );

   v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
-                          m256_const1_64( 0x1F83D9AB1F83D9ABULL ) );
+                          _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL ) );

   v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
-                          m256_const1_64( 0x5BE0CD195BE0CD19ULL ) );
+                          _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL ) );

 /*
   v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
@@ -391,14 +391,14 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
   memset( P->personal, 0, sizeof( P->personal ) );

   memset( S, 0, sizeof( blake2s_8way_state ) );
-   S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
-   S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
-   S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
-   S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
-   S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
-   S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
-   S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
-   S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
+   S->h[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667ULL );
+   S->h[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53AULL );
+   S->h[4] = _mm256_set1_epi64x( 0x510E527F510E527FULL );
+   S->h[5] = _mm256_set1_epi64x( 0x9B05688C9B05688CULL );
+   S->h[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19ULL );


 //   for( int i = 0; i < 8; ++i )
@@ -510,21 +510,21 @@ int blake2s_16way_compress( blake2s_16way_state *S, const __m512i *block )
   memcpy_512( m, block, 16 );
   memcpy_512( v, S->h, 8 );

-   v[ 8] = m512_const1_64( 0x6A09E6676A09E667ULL );
-   v[ 9] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
-   v[10] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
-   v[11] = m512_const1_64( 0xA54FF53AA54FF53AULL );
+   v[ 8] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
+   v[ 9] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
+   v[10] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
+   v[11] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
   v[12] = _mm512_xor_si512( _mm512_set1_epi32( S->t[0] ),
-                          m512_const1_64( 0x510E527F510E527FULL ) );
+                          _mm512_set1_epi64( 0x510E527F510E527FULL ) );

   v[13] = _mm512_xor_si512( _mm512_set1_epi32( S->t[1] ),
-                          m512_const1_64( 0x9B05688C9B05688CULL ) );
+                          _mm512_set1_epi64( 0x9B05688C9B05688CULL ) );

   v[14] = _mm512_xor_si512( _mm512_set1_epi32( S->f[0] ),
-                          m512_const1_64( 0x1F83D9AB1F83D9ABULL ) );
+                          _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL ) );

   v[15] = _mm512_xor_si512( _mm512_set1_epi32( S->f[1] ),
-                          m512_const1_64( 0x5BE0CD195BE0CD19ULL ) );
+                          _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL ) );


 #define G16W( sigma0, sigma1, a, b, c, d) \
@@ -589,14 +589,14 @@ int blake2s_16way_init( blake2s_16way_state *S, const uint8_t outlen )
   memset( P->personal, 0, sizeof( P->personal ) );

   memset( S, 0, sizeof( blake2s_16way_state ) );
-   S->h[0] = m512_const1_64( 0x6A09E6676A09E667ULL );
-   S->h[1] = m512_const1_64( 0xBB67AE85BB67AE85ULL );
-   S->h[2] = m512_const1_64( 0x3C6EF3723C6EF372ULL );
-   S->h[3] = m512_const1_64( 0xA54FF53AA54FF53AULL );
-   S->h[4] = m512_const1_64( 0x510E527F510E527FULL );
-   S->h[5] = m512_const1_64( 0x9B05688C9B05688CULL );
-   S->h[6] = m512_const1_64( 0x1F83D9AB1F83D9ABULL );
-   S->h[7] = m512_const1_64( 0x5BE0CD195BE0CD19ULL );
+   S->h[0] = _mm512_set1_epi64( 0x6A09E6676A09E667ULL );
+   S->h[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = _mm512_set1_epi64( 0x510E527F510E527FULL );
+   S->h[5] = _mm512_set1_epi64( 0x9B05688C9B05688CULL );
+   S->h[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19ULL );

   uint32_t *p = ( uint32_t * )( P );

--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -64,6 +64,22 @@
  V[1] = mm256_ror_64( _mm256_xor_si256( V[1], V[2] ), 63 ); \
 }

+// Pivoting the diagonal shuffle about V[1] instead of V[0] (rotate V[0], V[3] & V[2], leave V[1] in place) reduces latency.
+#define BLAKE2B_ROUND( R ) \
+{ \
+  __m256i *V = (__m256i*)v; \
+  const uint8_t *sigmaR = sigma[R]; \
+  BLAKE2B_G(  0,  1,  2,  3,  4,  5,  6,  7 ); \
+  V[0] = mm256_shufll_64( V[0] ); \
+  V[3] = mm256_swap_128( V[3] ); \
+  V[2] = mm256_shuflr_64( V[2] ); \
+  BLAKE2B_G( 14, 15,  8,  9, 10, 11, 12, 13 ); \
+  V[0] = mm256_shuflr_64( V[0] ); \
+  V[3] = mm256_swap_128( V[3] ); \
+  V[2] = mm256_shufll_64( V[2] ); \
+}
+
+/*
 #define BLAKE2B_ROUND( R ) \
 { \
  __m256i *V = (__m256i*)v; \
@@ -77,6 +93,7 @@
  V[2] = mm256_swap_128( V[2] ); \
  V[1] = mm256_shufll_64( V[1] ); \
 }
+*/

 #elif defined(__SSE2__)
 // always true
--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -451,22 +451,22 @@ static const __m128i final_s[16] =
 */
 void bmw256_4way_init( bmw256_4way_context *ctx )
 {
-   ctx->H[ 0] = m128_const1_64( 0x4041424340414243 );
-   ctx->H[ 1] = m128_const1_64( 0x4445464744454647 );
-   ctx->H[ 2] = m128_const1_64( 0x48494A4B48494A4B );
-   ctx->H[ 3] = m128_const1_64( 0x4C4D4E4F4C4D4E4F );
-   ctx->H[ 4] = m128_const1_64( 0x5051525350515253 );
-   ctx->H[ 5] = m128_const1_64( 0x5455565754555657 );
-   ctx->H[ 6] = m128_const1_64( 0x58595A5B58595A5B );
-   ctx->H[ 7] = m128_const1_64( 0x5C5D5E5F5C5D5E5F );
-   ctx->H[ 8] = m128_const1_64( 0x6061626360616263 );
-   ctx->H[ 9] = m128_const1_64( 0x6465666764656667 );
-   ctx->H[10] = m128_const1_64( 0x68696A6B68696A6B );
-   ctx->H[11] = m128_const1_64( 0x6C6D6E6F6C6D6E6F );
-   ctx->H[12] = m128_const1_64( 0x7071727370717273 );
-   ctx->H[13] = m128_const1_64( 0x7475767774757677 );
-   ctx->H[14] = m128_const1_64( 0x78797A7B78797A7B );
-   ctx->H[15] = m128_const1_64( 0x7C7D7E7F7C7D7E7F );
+   ctx->H[ 0] = _mm_set1_epi64x( 0x4041424340414243 );
+   ctx->H[ 1] = _mm_set1_epi64x( 0x4445464744454647 );
+   ctx->H[ 2] = _mm_set1_epi64x( 0x48494A4B48494A4B );
+   ctx->H[ 3] = _mm_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = _mm_set1_epi64x( 0x5051525350515253 );
+   ctx->H[ 5] = _mm_set1_epi64x( 0x5455565754555657 );
+   ctx->H[ 6] = _mm_set1_epi64x( 0x58595A5B58595A5B );
+   ctx->H[ 7] = _mm_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = _mm_set1_epi64x( 0x6061626360616263 );
+   ctx->H[ 9] = _mm_set1_epi64x( 0x6465666764656667 );
+   ctx->H[10] = _mm_set1_epi64x( 0x68696A6B68696A6B );
+   ctx->H[11] = _mm_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = _mm_set1_epi64x( 0x7071727370717273 );
+   ctx->H[13] = _mm_set1_epi64x( 0x7475767774757677 );
+   ctx->H[14] = _mm_set1_epi64x( 0x78797A7B78797A7B );
+   ctx->H[15] = _mm_set1_epi64x( 0x7C7D7E7F7C7D7E7F );


 //   for ( int i = 0; i < 16; i++ )
@@ -529,7 +529,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,

   buf = sc->buf;
   ptr = sc->ptr;
-   buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
+   buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
   ptr += 4;
   h = sc->H;

@@ -959,22 +959,22 @@ static const __m256i final_s8[16] =

 void bmw256_8way_init( bmw256_8way_context *ctx )
 {
-   ctx->H[ 0] = m256_const1_64( 0x4041424340414243 );
-   ctx->H[ 1] = m256_const1_64( 0x4445464744454647 );
-   ctx->H[ 2] = m256_const1_64( 0x48494A4B48494A4B );
-   ctx->H[ 3] = m256_const1_64( 0x4C4D4E4F4C4D4E4F );
-   ctx->H[ 4] = m256_const1_64( 0x5051525350515253 );
-   ctx->H[ 5] = m256_const1_64( 0x5455565754555657 );
-   ctx->H[ 6] = m256_const1_64( 0x58595A5B58595A5B );
-   ctx->H[ 7] = m256_const1_64( 0x5C5D5E5F5C5D5E5F );
-   ctx->H[ 8] = m256_const1_64( 0x6061626360616263 );
-   ctx->H[ 9] = m256_const1_64( 0x6465666764656667 );
-   ctx->H[10] = m256_const1_64( 0x68696A6B68696A6B );
-   ctx->H[11] = m256_const1_64( 0x6C6D6E6F6C6D6E6F );
-   ctx->H[12] = m256_const1_64( 0x7071727370717273 );
-   ctx->H[13] = m256_const1_64( 0x7475767774757677 );
-   ctx->H[14] = m256_const1_64( 0x78797A7B78797A7B );
-   ctx->H[15] = m256_const1_64( 0x7C7D7E7F7C7D7E7F );
+   ctx->H[ 0] = _mm256_set1_epi64x( 0x4041424340414243 );
+   ctx->H[ 1] = _mm256_set1_epi64x( 0x4445464744454647 );
+   ctx->H[ 2] = _mm256_set1_epi64x( 0x48494A4B48494A4B );
+   ctx->H[ 3] = _mm256_set1_epi64x( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = _mm256_set1_epi64x( 0x5051525350515253 );
+   ctx->H[ 5] = _mm256_set1_epi64x( 0x5455565754555657 );
+   ctx->H[ 6] = _mm256_set1_epi64x( 0x58595A5B58595A5B );
+   ctx->H[ 7] = _mm256_set1_epi64x( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = _mm256_set1_epi64x( 0x6061626360616263 );
+   ctx->H[ 9] = _mm256_set1_epi64x( 0x6465666764656667 );
+   ctx->H[10] = _mm256_set1_epi64x( 0x68696A6B68696A6B );
+   ctx->H[11] = _mm256_set1_epi64x( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = _mm256_set1_epi64x( 0x7071727370717273 );
+   ctx->H[13] = _mm256_set1_epi64x( 0x7475767774757677 );
+   ctx->H[14] = _mm256_set1_epi64x( 0x78797A7B78797A7B );
+   ctx->H[15] = _mm256_set1_epi64x( 0x7C7D7E7F7C7D7E7F );
   ctx->ptr       = 0;
   ctx->bit_count = 0;
 }
@@ -1030,7 +1030,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )

   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
+   buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
   ptr += 4;
   h = ctx->H;

@@ -1460,22 +1460,22 @@ static const __m512i final_s16[16] =

 void bmw256_16way_init( bmw256_16way_context *ctx )
 {
-   ctx->H[ 0] = m512_const1_64( 0x4041424340414243 );
-   ctx->H[ 1] = m512_const1_64( 0x4445464744454647 );
-   ctx->H[ 2] = m512_const1_64( 0x48494A4B48494A4B );
-   ctx->H[ 3] = m512_const1_64( 0x4C4D4E4F4C4D4E4F );
-   ctx->H[ 4] = m512_const1_64( 0x5051525350515253 );
-   ctx->H[ 5] = m512_const1_64( 0x5455565754555657 );
-   ctx->H[ 6] = m512_const1_64( 0x58595A5B58595A5B );
-   ctx->H[ 7] = m512_const1_64( 0x5C5D5E5F5C5D5E5F );
-   ctx->H[ 8] = m512_const1_64( 0x6061626360616263 );
-   ctx->H[ 9] = m512_const1_64( 0x6465666764656667 );
-   ctx->H[10] = m512_const1_64( 0x68696A6B68696A6B );
-   ctx->H[11] = m512_const1_64( 0x6C6D6E6F6C6D6E6F );
-   ctx->H[12] = m512_const1_64( 0x7071727370717273 );
-   ctx->H[13] = m512_const1_64( 0x7475767774757677 );
-   ctx->H[14] = m512_const1_64( 0x78797A7B78797A7B );
-   ctx->H[15] = m512_const1_64( 0x7C7D7E7F7C7D7E7F );
+   ctx->H[ 0] = _mm512_set1_epi64( 0x4041424340414243 );
+   ctx->H[ 1] = _mm512_set1_epi64( 0x4445464744454647 );
+   ctx->H[ 2] = _mm512_set1_epi64( 0x48494A4B48494A4B );
+   ctx->H[ 3] = _mm512_set1_epi64( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = _mm512_set1_epi64( 0x5051525350515253 );
+   ctx->H[ 5] = _mm512_set1_epi64( 0x5455565754555657 );
+   ctx->H[ 6] = _mm512_set1_epi64( 0x58595A5B58595A5B );
+   ctx->H[ 7] = _mm512_set1_epi64( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = _mm512_set1_epi64( 0x6061626360616263 );
+   ctx->H[ 9] = _mm512_set1_epi64( 0x6465666764656667 );
+   ctx->H[10] = _mm512_set1_epi64( 0x68696A6B68696A6B );
+   ctx->H[11] = _mm512_set1_epi64( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = _mm512_set1_epi64( 0x7071727370717273 );
+   ctx->H[13] = _mm512_set1_epi64( 0x7475767774757677 );
+   ctx->H[14] = _mm512_set1_epi64( 0x78797A7B78797A7B );
+   ctx->H[15] = _mm512_set1_epi64( 0x7C7D7E7F7C7D7E7F );
   ctx->ptr       = 0;
   ctx->bit_count = 0;
 }
@@ -1531,7 +1531,7 @@ void bmw256_16way_close( bmw256_16way_context *ctx, void *dst )

   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
+   buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
   ptr += 4;
   h = ctx->H;

--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -896,22 +896,22 @@ static const __m256i final_b[16] =
 static void
 bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
 {
-   sc->H[ 0] = m256_const1_64( 0x8081828384858687 );
-   sc->H[ 1] = m256_const1_64( 0x88898A8B8C8D8E8F );
-   sc->H[ 2] = m256_const1_64( 0x9091929394959697 );
-   sc->H[ 3] = m256_const1_64( 0x98999A9B9C9D9E9F );
-   sc->H[ 4] = m256_const1_64( 0xA0A1A2A3A4A5A6A7 );
-   sc->H[ 5] = m256_const1_64( 0xA8A9AAABACADAEAF );
-   sc->H[ 6] = m256_const1_64( 0xB0B1B2B3B4B5B6B7 );
-   sc->H[ 7] = m256_const1_64( 0xB8B9BABBBCBDBEBF );
-   sc->H[ 8] = m256_const1_64( 0xC0C1C2C3C4C5C6C7 );
-   sc->H[ 9] = m256_const1_64( 0xC8C9CACBCCCDCECF );
-   sc->H[10] = m256_const1_64( 0xD0D1D2D3D4D5D6D7 );
-   sc->H[11] = m256_const1_64( 0xD8D9DADBDCDDDEDF );
-   sc->H[12] = m256_const1_64( 0xE0E1E2E3E4E5E6E7 );
-   sc->H[13] = m256_const1_64( 0xE8E9EAEBECEDEEEF );
-   sc->H[14] = m256_const1_64( 0xF0F1F2F3F4F5F6F7 );
-   sc->H[15] = m256_const1_64( 0xF8F9FAFBFCFDFEFF );
+   sc->H[ 0] = _mm256_set1_epi64x( 0x8081828384858687 );
+   sc->H[ 1] = _mm256_set1_epi64x( 0x88898A8B8C8D8E8F );
+   sc->H[ 2] = _mm256_set1_epi64x( 0x9091929394959697 );
+   sc->H[ 3] = _mm256_set1_epi64x( 0x98999A9B9C9D9E9F );
+   sc->H[ 4] = _mm256_set1_epi64x( 0xA0A1A2A3A4A5A6A7 );
+   sc->H[ 5] = _mm256_set1_epi64x( 0xA8A9AAABACADAEAF );
+   sc->H[ 6] = _mm256_set1_epi64x( 0xB0B1B2B3B4B5B6B7 );
+   sc->H[ 7] = _mm256_set1_epi64x( 0xB8B9BABBBCBDBEBF );
+   sc->H[ 8] = _mm256_set1_epi64x( 0xC0C1C2C3C4C5C6C7 );
+   sc->H[ 9] = _mm256_set1_epi64x( 0xC8C9CACBCCCDCECF );
+   sc->H[10] = _mm256_set1_epi64x( 0xD0D1D2D3D4D5D6D7 );
+   sc->H[11] = _mm256_set1_epi64x( 0xD8D9DADBDCDDDEDF );
+   sc->H[12] = _mm256_set1_epi64x( 0xE0E1E2E3E4E5E6E7 );
+   sc->H[13] = _mm256_set1_epi64x( 0xE8E9EAEBECEDEEEF );
+   sc->H[14] = _mm256_set1_epi64x( 0xF0F1F2F3F4F5F6F7 );
+   sc->H[15] = _mm256_set1_epi64x( 0xF8F9FAFBFCFDFEFF );
   sc->ptr = 0;
   sc->bit_count = 0;
 }
@@ -967,7 +967,7 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,

   buf = sc->buf;
   ptr = sc->ptr;
-   buf[ ptr>>3 ] = m256_const1_64( 0x80 );
+   buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
   ptr += 8;
   h = sc->H;

@@ -1379,22 +1379,22 @@ static const __m512i final_b8[16] =
 void bmw512_8way_init( bmw512_8way_context *ctx )
 //bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
 {
-   ctx->H[ 0] = m512_const1_64( 0x8081828384858687 );
-   ctx->H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
-   ctx->H[ 2] = m512_const1_64( 0x9091929394959697 );
-   ctx->H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
-   ctx->H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
-   ctx->H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
-   ctx->H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
-   ctx->H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
-   ctx->H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
-   ctx->H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
-   ctx->H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
-   ctx->H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
-   ctx->H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
-   ctx->H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
-   ctx->H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
-   ctx->H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
+   ctx->H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
+   ctx->H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
+   ctx->H[ 2] = _mm512_set1_epi64( 0x9091929394959697 );
+   ctx->H[ 3] = _mm512_set1_epi64( 0x98999A9B9C9D9E9F );
+   ctx->H[ 4] = _mm512_set1_epi64( 0xA0A1A2A3A4A5A6A7 );
+   ctx->H[ 5] = _mm512_set1_epi64( 0xA8A9AAABACADAEAF );
+   ctx->H[ 6] = _mm512_set1_epi64( 0xB0B1B2B3B4B5B6B7 );
+   ctx->H[ 7] = _mm512_set1_epi64( 0xB8B9BABBBCBDBEBF );
+   ctx->H[ 8] = _mm512_set1_epi64( 0xC0C1C2C3C4C5C6C7 );
+   ctx->H[ 9] = _mm512_set1_epi64( 0xC8C9CACBCCCDCECF );
+   ctx->H[10] = _mm512_set1_epi64( 0xD0D1D2D3D4D5D6D7 );
+   ctx->H[11] = _mm512_set1_epi64( 0xD8D9DADBDCDDDEDF );
+   ctx->H[12] = _mm512_set1_epi64( 0xE0E1E2E3E4E5E6E7 );
+   ctx->H[13] = _mm512_set1_epi64( 0xE8E9EAEBECEDEEEF );
+   ctx->H[14] = _mm512_set1_epi64( 0xF0F1F2F3F4F5F6F7 );
+   ctx->H[15] = _mm512_set1_epi64( 0xF8F9FAFBFCFDFEFF );
   ctx->ptr = 0;
   ctx->bit_count = 0;
 }
@@ -1448,7 +1448,7 @@ void bmw512_8way_close( bmw512_8way_context *ctx, void *dst )

   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>3 ] = m512_const1_64( 0x80 );
+   buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
   ptr += 8;
   h = ctx->H;

@@ -1483,22 +1483,22 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,

 // Init

-   H[ 0] = m512_const1_64( 0x8081828384858687 );
-   H[ 1] = m512_const1_64( 0x88898A8B8C8D8E8F );
-   H[ 2] = m512_const1_64( 0x9091929394959697 );
-   H[ 3] = m512_const1_64( 0x98999A9B9C9D9E9F );
-   H[ 4] = m512_const1_64( 0xA0A1A2A3A4A5A6A7 );
-   H[ 5] = m512_const1_64( 0xA8A9AAABACADAEAF );
-   H[ 6] = m512_const1_64( 0xB0B1B2B3B4B5B6B7 );
-   H[ 7] = m512_const1_64( 0xB8B9BABBBCBDBEBF );
-   H[ 8] = m512_const1_64( 0xC0C1C2C3C4C5C6C7 );
-   H[ 9] = m512_const1_64( 0xC8C9CACBCCCDCECF );
-   H[10] = m512_const1_64( 0xD0D1D2D3D4D5D6D7 );
-   H[11] = m512_const1_64( 0xD8D9DADBDCDDDEDF );
-   H[12] = m512_const1_64( 0xE0E1E2E3E4E5E6E7 );
-   H[13] = m512_const1_64( 0xE8E9EAEBECEDEEEF );
-   H[14] = m512_const1_64( 0xF0F1F2F3F4F5F6F7 );
-   H[15] = m512_const1_64( 0xF8F9FAFBFCFDFEFF );
+   H[ 0] = _mm512_set1_epi64( 0x8081828384858687 );
+   H[ 1] = _mm512_set1_epi64( 0x88898A8B8C8D8E8F );
+   H[ 2] = _mm512_set1_epi64( 0x9091929394959697 );
+   H[ 3] = _mm512_set1_epi64( 0x98999A9B9C9D9E9F );
+   H[ 4] = _mm512_set1_epi64( 0xA0A1A2A3A4A5A6A7 );
+   H[ 5] = _mm512_set1_epi64( 0xA8A9AAABACADAEAF );
+   H[ 6] = _mm512_set1_epi64( 0xB0B1B2B3B4B5B6B7 );
+   H[ 7] = _mm512_set1_epi64( 0xB8B9BABBBCBDBEBF );
+   H[ 8] = _mm512_set1_epi64( 0xC0C1C2C3C4C5C6C7 );
+   H[ 9] = _mm512_set1_epi64( 0xC8C9CACBCCCDCECF );
+   H[10] = _mm512_set1_epi64( 0xD0D1D2D3D4D5D6D7 );
+   H[11] = _mm512_set1_epi64( 0xD8D9DADBDCDDDEDF );
+   H[12] = _mm512_set1_epi64( 0xE0E1E2E3E4E5E6E7 );
+   H[13] = _mm512_set1_epi64( 0xE8E9EAEBECEDEEEF );
+   H[14] = _mm512_set1_epi64( 0xF0F1F2F3F4F5F6F7 );
+   H[15] = _mm512_set1_epi64( 0xF8F9FAFBFCFDFEFF );

 // Update

@@ -1530,7 +1530,7 @@ void bmw512_8way_full( bmw512_8way_context *ctx, void *out, const void *data,
   __m512i h1[16], h2[16];
   size_t u, v;

-   buf[ ptr>>3 ] = m512_const1_64( 0x80 );
+   buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
   ptr += 8;

   if (  ptr > (buf_size - 8) )
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -423,21 +423,6 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,

 // 2 way 128 

-// This isn't expected to be used with AVX512 so HW rotate intruction
-// is assumed not avaiable.
-// Use double buffering to optimize serial bit rotations. Full double
-// buffering isn't practical because it needs twice as many registers
-// with AVX2 having only half as many as AVX512.
-#define ROL2( out0, out1, in0, in1, c ) \
-{ \
- __m256i t0 = _mm256_slli_epi32( in0, c ); \
- __m256i t1 = _mm256_slli_epi32( in1, c ); \
- out0 = _mm256_srli_epi32( in0, 32-(c) ); \
- out1 = _mm256_srli_epi32( in1, 32-(c) ); \
- out0 = _mm256_or_si256( out0, t0 ); \
- out1 = _mm256_or_si256( out1, t1 ); \
-}
-
 static void transform_2way( cube_2way_context *sp )
 {
    int r;
@@ -460,8 +445,10 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        ROL2( y0, y1, x2, x3, 7 );
-        ROL2( x2, x3, x0, x1, 7 );
+        y0 = mm256_rol_32( x2, 7 );
+        y1 = mm256_rol_32( x3, 7 );
+        x2 = mm256_rol_32( x0, 7 );
+        x3 = mm256_rol_32( x1, 7 );
        x0 = _mm256_xor_si256( y0, x4 );
        x1 = _mm256_xor_si256( y1, x5 );
        x2 = _mm256_xor_si256( x2, x6 );
@@ -474,8 +461,10 @@ static void transform_2way( cube_2way_context *sp )
        x5 = _mm256_add_epi32( x1, x5 );
        x6 = _mm256_add_epi32( x2, x6 );
        x7 = _mm256_add_epi32( x3, x7 );
-        ROL2( y0, x1, x1, x0, 11 );
-        ROL2( y1, x3, x3, x2, 11 );
+        y0 = mm256_rol_32( x1, 11 );
+        x1 = mm256_rol_32( x0, 11 );
+        y1 = mm256_rol_32( x3, 11 );
+        x3 = mm256_rol_32( x2, 11 );
        x0 = _mm256_xor_si256( y0, x4 );
        x1 = _mm256_xor_si256( x1, x5 );
        x2 = _mm256_xor_si256( y1, x6 );
--- a/algo/cubehash/cubehash_sse2.c
+++ b/algo/cubehash/cubehash_sse2.c
@@ -32,7 +32,7 @@ static void transform( cubehashParam *sp )
    { 
        x1 = _mm512_add_epi32( x0, x1 );
        x0 = mm512_swap_256( x0 );
-        x0 = mm512_rol_32(  x0, 7 );
+        x0 = mm512_rol_32( x0, 7 );
        x0 = _mm512_xor_si512( x0, x1 );
        x1 = mm512_swap128_64( x1 );
        x1 = _mm512_add_epi32( x0, x1 );
@@ -58,19 +58,18 @@ static void transform( cubehashParam *sp )
    { 
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
-        y0 = x0;
-        x0 = mm256_rol_32( x1, 7 );
-        x1 = mm256_rol_32( y0, 7 );
-        x0 = _mm256_xor_si256( x0, x2 );
-        x1 = _mm256_xor_si256( x1, x3 );
+        y0 = mm256_rol_32( x1, 7 );
+        y1 = mm256_rol_32( x0, 7 );
+        x0 = _mm256_xor_si256( y0, x2 );
+        x1 = _mm256_xor_si256( y1, x3 );
        x2 = mm256_swap128_64( x2 );
        x3 = mm256_swap128_64( x3 );
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
-        y0 = mm256_swap_128( x0 );
-        y1 = mm256_swap_128( x1 );
-        x0 = mm256_rol_32( y0, 11 );
-        x1 = mm256_rol_32( y1, 11 );
+        x0 = mm256_swap_128( x0 );
+        x1 = mm256_swap_128( x1 );
+        x0 = mm256_rol_32( x0, 11 );
+        x1 = mm256_rol_32( x1, 11 );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
        x2 = mm256_swap64_32( x2 );
@@ -94,47 +93,48 @@ static void transform( cubehashParam *sp )
    x6 = _mm_load_si128( (__m128i*)sp->x + 6 );
    x7 = _mm_load_si128( (__m128i*)sp->x + 7 );

-    for (r = 0; r < rounds; ++r) {
-	x4 = _mm_add_epi32(x0, x4);
-	x5 = _mm_add_epi32(x1, x5);
-	x6 = _mm_add_epi32(x2, x6);
-	x7 = _mm_add_epi32(x3, x7);
-	y0 = x2;
-	y1 = x3;
-	y2 = x0;
-	y3 = x1;
-	x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
-	x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
-	x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
-	x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
-	x0 = _mm_xor_si128(x0, x4);
-	x1 = _mm_xor_si128(x1, x5);
-	x2 = _mm_xor_si128(x2, x6);
-	x3 = _mm_xor_si128(x3, x7);
-	x4 = _mm_shuffle_epi32(x4, 0x4e);
-	x5 = _mm_shuffle_epi32(x5, 0x4e);
-	x6 = _mm_shuffle_epi32(x6, 0x4e);
-	x7 = _mm_shuffle_epi32(x7, 0x4e);
-	x4 = _mm_add_epi32(x0, x4);
-	x5 = _mm_add_epi32(x1, x5);
-	x6 = _mm_add_epi32(x2, x6);
-	x7 = _mm_add_epi32(x3, x7);
-	y0 = x1;
-	y1 = x0;
-	y2 = x3;
-	y3 = x2;
-	x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
-	x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
-	x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
-	x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
-	x0 = _mm_xor_si128(x0, x4);
-	x1 = _mm_xor_si128(x1, x5);
-	x2 = _mm_xor_si128(x2, x6);
-	x3 = _mm_xor_si128(x3, x7);
-	x4 = _mm_shuffle_epi32(x4, 0xb1);
-	x5 = _mm_shuffle_epi32(x5, 0xb1);
-	x6 = _mm_shuffle_epi32(x6, 0xb1);
-	x7 = _mm_shuffle_epi32(x7, 0xb1);
+    for ( r = 0; r < rounds; ++r )
+    {
+       x4 = _mm_add_epi32( x0, x4 );
+       x5 = _mm_add_epi32( x1, x5 );
+       x6 = _mm_add_epi32( x2, x6 );
+       x7 = _mm_add_epi32( x3, x7 );
+       y0 = x2;
+       y1 = x3;
+       y2 = x0;
+       y3 = x1;
+       x0 = mm128_rol_32( y0, 7 );
+       x1 = mm128_rol_32( y1, 7 );
+       x2 = mm128_rol_32( y2, 7 );
+       x3 = mm128_rol_32( y3, 7 );
+       x0 = _mm_xor_si128( x0, x4 );
+       x1 = _mm_xor_si128( x1, x5 );
+       x2 = _mm_xor_si128( x2, x6 );
+       x3 = _mm_xor_si128( x3, x7 );
+       x4 = _mm_shuffle_epi32( x4, 0x4e );
+       x5 = _mm_shuffle_epi32( x5, 0x4e );
+       x6 = _mm_shuffle_epi32( x6, 0x4e );
+       x7 = _mm_shuffle_epi32( x7, 0x4e );
+       x4 = _mm_add_epi32( x0, x4 );
+       x5 = _mm_add_epi32( x1, x5 );
+       x6 = _mm_add_epi32( x2, x6 );
+       x7 = _mm_add_epi32( x3, x7 );
+       y0 = x1;
+       y1 = x0;
+       y2 = x3;
+       y3 = x2;
+       x0 = mm128_rol_32( y0, 11 );
+       x1 = mm128_rol_32( y1, 11 );
+       x2 = mm128_rol_32( y2, 11 );
+       x3 = mm128_rol_32( y3, 11 );
+	    x0 = _mm_xor_si128( x0, x4 );
+	    x1 = _mm_xor_si128( x1, x5 );
+	    x2 = _mm_xor_si128( x2, x6 );
+	    x3 = _mm_xor_si128( x3, x7 );
+	    x4 = _mm_shuffle_epi32( x4, 0xb1 );
+	    x5 = _mm_shuffle_epi32( x5, 0xb1 );
+	    x6 = _mm_shuffle_epi32( x6, 0xb1 );
+	    x7 = _mm_shuffle_epi32( x7, 0xb1 );
    }

    _mm_store_si128( (__m128i*)sp->x,     x0 );
@@ -180,25 +180,25 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
    if ( hashbitlen == 512 )
    {

-       x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
-       x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
-       x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
-       x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
-       x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
-       x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
-       x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
-       x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+       x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
+       x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
+       x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+       x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+       x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+       x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+       x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+       x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
    }
    else
    {
-       x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
-       x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
-       x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
-       x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
-       x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
-       x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
-       x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
-       x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+       x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
+       x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
+       x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+       x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+       x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+       x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
+       x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+       x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
    }   

    return SUCCESS;
@@ -234,10 +234,10 @@ int cubehashDigest( cubehashParam *sp, byte *digest )

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                                      m128_const_64( 0, 0x80 ) );
+                                      _mm_set_epi64x( 0, 0x80 ) );
    transform( sp );

-    sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
+    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );
    transform( sp );
    transform( sp );
    transform( sp );
@@ -279,10 +279,10 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                                      m128_const_64( 0, 0x80 ) );
+                                      _mm_set_epi64x( 0, 0x80 ) );
    transform( sp );

-    sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
+    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );

    transform( sp );
    transform( sp );
@@ -313,25 +313,25 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,
    if ( hashbitlen == 512 )
    {

-       x[0] = m128_const_64( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
-       x[1] = m128_const_64( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
-       x[2] = m128_const_64( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
-       x[3] = m128_const_64( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
-       x[4] = m128_const_64( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
-       x[5] = m128_const_64( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
-       x[6] = m128_const_64( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
-       x[7] = m128_const_64( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
+       x[0] = _mm_set_epi64x( 0x4167D83E2D538B8B, 0x50F494D42AEA2A61 );
+       x[1] = _mm_set_epi64x( 0x50AC5695CC39968E, 0xC701CF8C3FEE2313 );
+       x[2] = _mm_set_epi64x( 0x825B453797CF0BEF, 0xA647A8B34D42C787 );
+       x[3] = _mm_set_epi64x( 0xA23911AED0E5CD33, 0xF22090C4EEF864D2 );
+       x[4] = _mm_set_epi64x( 0xB64445321B017BEF, 0x148FE485FCD398D9 );
+       x[5] = _mm_set_epi64x( 0x0DBADEA991FA7934, 0x2FF5781C6A536159 );
+       x[6] = _mm_set_epi64x( 0xBC796576B1C62456, 0xA5A70E75D65C8A2B );
+       x[7] = _mm_set_epi64x( 0xD43E3B447795D246, 0xE7989AF11921C8F7 );
    }
    else
    {
-       x[0] = m128_const_64( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
-       x[1] = m128_const_64( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
-       x[2] = m128_const_64( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
-       x[3] = m128_const_64( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
-       x[4] = m128_const_64( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
-       x[5] = m128_const_64( 0x93CB628565C892FD, 0x5FA2560309392549 );
-       x[6] = m128_const_64( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
-       x[7] = m128_const_64( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
+       x[0] = _mm_set_epi64x( 0x35481EAE63117E71, 0xCCD6F29FEA2BD4B4 );
+       x[1] = _mm_set_epi64x( 0xF4CC12BE7E624131, 0xE5D94E6322512D5B );
+       x[2] = _mm_set_epi64x( 0x3361DA8CD0720C35, 0x42AF2070C2D0B696 );
+       x[3] = _mm_set_epi64x( 0x40E5FBAB4680AC00, 0x8EF8AD8328CCECA4 );
+       x[4] = _mm_set_epi64x( 0xF0B266796C859D41, 0x6107FBD5D89041C3 );
+       x[5] = _mm_set_epi64x( 0x93CB628565C892FD, 0x5FA2560309392549 );
+       x[6] = _mm_set_epi64x( 0x85254725774ABFDD, 0x9E4B4E602AF2B5AE );
+       x[7] = _mm_set_epi64x( 0xD6032C0A9CDAF8AF, 0x4AB6AAD615815AEB );
    }


@@ -358,10 +358,10 @@ int cubehash_full( cubehashParam *sp, byte *digest, int hashbitlen,

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->x[ sp->pos ] = _mm_xor_si128( sp->x[ sp->pos ],
-                                      m128_const_64( 0, 0x80 ) );
+                                      _mm_set_epi64x( 0, 0x80 ) );
    transform( sp );

-    sp->x[7] = _mm_xor_si128( sp->x[7], m128_const_64( 0x100000000, 0 ) );
+    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi64x( 0x100000000, 0 ) );

    transform( sp );
    transform( sp );
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -566,16 +566,16 @@ HashReturn echo_full( hashState_echo *state, BitSequence *hashval,
         state->uHashSize = 256;
         state->uBlockLength = 192;
         state->uRounds = 8;
-         state->hashsize = m128_const_64( 0, 0x100 );
-         state->const1536 = m128_const_64( 0, 0x600 );
+         state->hashsize = _mm_set_epi64x( 0, 0x100 );
+         state->const1536 = _mm_set_epi64x( 0, 0x600 );
         break;

      case 512:
         state->uHashSize = 512;
         state->uBlockLength = 128;
         state->uRounds = 10;
-         state->hashsize = m128_const_64( 0, 0x200 );
-         state->const1536 = m128_const_64( 0, 0x400 );
+         state->hashsize = _mm_set_epi64x( 0, 0x200 );
+         state->const1536 = _mm_set_epi64x( 0, 0x400 );
         break;

      default:
--- a/algo/echo/echo-hash-4way.c
+++ b/algo/echo/echo-hash-4way.c
@@ -469,8 +469,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
   t1 = _mm256_and_si256( t1, lsbmask_2way ); \
   t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
   s2 = _mm256_xor_si256( s2, t2 );\
-   state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], \
-                              _mm256_xor_si256( s2, state1[ 1 ][ j1 ] ) ); \
+   state2[ 0 ][ j ] = mm256_xor3( state2[ 0 ][ j ], s2, state1[ 1 ][ j1 ] ); \
   state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], s2 ); \
   state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], state1[ 1 ][ j1 ] ); \
   state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], state1[ 1 ][ j1 ] ); \
@@ -480,8 +479,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
   t2 = _mm256_shuffle_epi8( mul2mask_2way, t1 ); \
   s2 = _mm256_xor_si256( s2, t2 ); \
   state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 2 ][ j2 ] ); \
-   state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], \
-                            _mm256_xor_si256( s2, state1[ 2 ][ j2 ] ) ); \
+   state2[ 1 ][ j ] = mm256_xor3( state2[ 1 ][ j ], s2, state1[ 2 ][ j2 ] ); \
   state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], s2 ); \
   state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3][ j ], state1[ 2 ][ j2 ] ); \
   s2 = _mm256_add_epi8( state1[ 3 ][ j3 ], state1[ 3 ][ j3 ] ); \
@@ -491,8 +489,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
   s2 = _mm256_xor_si256( s2, t2 ); \
   state2[ 0 ][ j ] = _mm256_xor_si256( state2[ 0 ][ j ], state1[ 3 ][ j3 ] ); \
   state2[ 1 ][ j ] = _mm256_xor_si256( state2[ 1 ][ j ], state1[ 3 ][ j3 ] ); \
-   state2[ 2 ][ j ] = _mm256_xor_si256( state2[ 2 ][ j ], \
-                            _mm256_xor_si256( s2, state1[ 3 ][ j3] ) ); \
+   state2[ 2 ][ j ] = mm256_xor3( state2[ 2 ][ j ], s2, state1[ 3 ][ j3] ); \
   state2[ 3 ][ j ] = _mm256_xor_si256( state2[ 3 ][ j ], s2 ); \
 } while(0)

--- a/algo/fugue/fugue-aesni.c
+++ b/algo/fugue/fugue-aesni.c
@@ -33,11 +33,11 @@ MYALIGN const unsigned long long _supermix4b[]	= {0x07020d08080e0d0d, 0x07070908
 MYALIGN const unsigned long long _supermix4c[]	= {0x0706050403020000, 0x0302000007060504};
 MYALIGN const unsigned long long _supermix7a[]	= {0x010c0b060d080702, 0x0904030e03000104};
 MYALIGN const unsigned long long _supermix7b[]	= {0x8080808080808080, 0x0504070605040f06};
-MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
-MYALIGN const unsigned char _shift_one_mask[]   = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
-MYALIGN const unsigned char _shift_four_mask[]  = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
-MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
-MYALIGN const unsigned char _aes_shift_rows[]   = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
+//MYALIGN const unsigned long long _k_n[] = {0x4E4E4E4E4E4E4E4E, 0x1B1B1B1B0E0E0E0E};
+//MYALIGN const unsigned char _shift_one_mask[]   = {7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14, 3, 0, 1, 2};
+//MYALIGN const unsigned char _shift_four_mask[]  = {13, 14, 15, 12, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8};
+//MYALIGN const unsigned char _shift_seven_mask[] = {10, 11, 8, 9, 14, 15, 12, 13, 2, 3, 0, 1, 6, 7, 4, 5};
+//MYALIGN const unsigned char _aes_shift_rows[]   = {0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11};
 MYALIGN const unsigned int _inv_shift_rows[] = {0x070a0d00, 0x0b0e0104, 0x0f020508, 0x0306090c};
 MYALIGN const unsigned int _mul2mask[] = {0x1b1b0000, 0x00000000, 0x00000000, 0x00000000};
 MYALIGN const unsigned int _mul4mask[] = {0x2d361b00, 0x00000000, 0x00000000, 0x00000000};
@@ -131,7 +131,7 @@ MYALIGN const unsigned int _IV512[] = {
   t1 = _mm_srli_epi16(t0, 6);\
   t1 = _mm_and_si128(t1, M128(_lsbmask2));\
   t3 = _mm_xor_si128(t3, _mm_shuffle_epi8(M128(_mul2mask), t1));\
-   t0  = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))
+   t0 = _mm_xor_si128(t4, _mm_shuffle_epi8(M128(_mul4mask), t1))

 /*
 #define PRESUPERMIX(x, t1, s1, s2, t2)\
--- a/algo/groestl/aes_ni/groestl-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl-intr-aes.h
@@ -139,7 +139,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
+  b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
@@ -237,7 +237,7 @@ static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
+  b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
--- a/algo/groestl/aes_ni/groestl256-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl256-intr-aes.h
@@ -128,7 +128,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
+  b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
@@ -226,7 +226,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m128_const1_64( 0x1b1b1b1b1b1b1b1b );\
+  b1 = _mm_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
  MUL2(a0, b0, b1);\
  a0 = _mm_xor_si128(a0, TEMP0);\
  MUL2(a1, b0, b1);\
@@ -275,7 +275,7 @@ static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
 */
 #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = m128_const_64( 0xffffffffffffffff, 0 ); \
+  b1 = _mm_set_epi64x( 0xffffffffffffffff, 0 ); \
  a0 = _mm_xor_si128( a0, casti_m128i( round_const_l0, i ) ); \
  a1 = _mm_xor_si128( a1, b1 ); \
  a2 = _mm_xor_si128( a2, b1 ); \
--- a/algo/groestl/aes_ni/hash-groestl.c
+++ b/algo/groestl/aes_ni/hash-groestl.c
@@ -31,7 +31,7 @@ HashReturn_gr init_groestl( hashState_groestl* ctx, int hashlen )
  }

  // The only non-zero in the IV is len. It can be hard coded.
-  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
+  ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
@@ -48,7 +48,7 @@ HashReturn_gr reinit_groestl( hashState_groestl* ctx )
     ctx->chaining[i] = _mm_setzero_si128();
     ctx->buffer[i]   = _mm_setzero_si128();
  }
-  ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
+  ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

@@ -116,7 +116,7 @@ HashReturn_gr final_groestl( hashState_groestl* ctx, void* output )
   else
   {
       // add first padding
-       ctx->buffer[rem_ptr] = m128_const_64( 0, 0x80 );
+       ctx->buffer[rem_ptr] = _mm_set_epi64x( 0, 0x80 );
       // add zero padding
       for ( i = rem_ptr + 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();
@@ -148,7 +148,7 @@ int groestl512_full( hashState_groestl* ctx, void* output,
      ctx->chaining[i] = _mm_setzero_si128();
      ctx->buffer[i]   = _mm_setzero_si128();
   }
-   ctx->chaining[ 6 ] = m128_const_64( 0x0200000000000000, 0 );
+   ctx->chaining[ 6 ] = _mm_set_epi64x( 0x0200000000000000, 0 );
   ctx->buf_ptr = 0;

   // --- update ---
@@ -182,7 +182,7 @@ int groestl512_full( hashState_groestl* ctx, void* output,
   else
   {
       // add first padding
-       ctx->buffer[i] = m128_const_64( 0, 0x80 );
+       ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();
@@ -239,7 +239,7 @@ HashReturn_gr update_and_final_groestl( hashState_groestl* ctx, void* output,
   else
   {
       // add first padding
-       ctx->buffer[i] = m128_const_64( 0, 0x80 );
+       ctx->buffer[i] = _mm_set_epi64x( 0, 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = _mm_setzero_si128();
--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -46,7 +46,7 @@ HashReturn_gr reinit_groestl256(hashState_groestl256* ctx)
     ctx->buffer[i]   = _mm_setzero_si128();
  }

-  ctx->chaining[ 3 ] = m128_const_64( 0, 0x0100000000000000 );
+  ctx->chaining[ 3 ] = _mm_set_epi64x( 0, 0x0100000000000000 );

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -539,7 +539,7 @@ static const __m256i SUBSH_MASK7_2WAY =
  j = _mm256_cmpgt_epi8(j, i );\
  i = _mm256_add_epi8(i, i);\
  j = _mm256_and_si256(j, k);\
-  i = _mm256_xor_si256(i, j);\
+  i = mm256_xorand( i, j, k );\
 }

 #define MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
@@ -550,7 +550,7 @@ static const __m256i SUBSH_MASK7_2WAY =
  b0 = a2;\
  a1 = _mm256_xor_si256(a1, a2);\
  b1 = a3;\
-  a2 = _mm256_xor_si256(a2, a3);\
+  TEMP2 = _mm256_xor_si256(a2, a3);\
  b2 = a4;\
  a3 = _mm256_xor_si256(a3, a4);\
  b3 = a5;\
@@ -562,34 +562,20 @@ static const __m256i SUBSH_MASK7_2WAY =
  a7 = _mm256_xor_si256(a7, b6);\
  \
  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm256_xor_si256(b0, a4);\
-  b6 = _mm256_xor_si256(b6, a4);\
-  b1 = _mm256_xor_si256(b1, a5);\
-  b7 = _mm256_xor_si256(b7, a5);\
-  b2 = _mm256_xor_si256(b2, a6);\
-  b0 = _mm256_xor_si256(b0, a6);\
-  /* spill values y_4, y_5 to memory */\
-  TEMP0 = b0;\
-  b3 = _mm256_xor_si256(b3, a7);\
-  b1 = _mm256_xor_si256(b1, a7);\
-  TEMP1 = b1;\
-  b4 = _mm256_xor_si256(b4, a0);\
-  b2 = _mm256_xor_si256(b2, a0);\
-  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  b0 = a0;\
-  b5 = _mm256_xor_si256(b5, a1);\
-  b3 = _mm256_xor_si256(b3, a1);\
-  b1 = a1;\
-  b6 = _mm256_xor_si256(b6, a2);\
-  b4 = _mm256_xor_si256(b4, a2);\
-  TEMP2 = a2;\
-  b7 = _mm256_xor_si256(b7, a3);\
-  b5 = _mm256_xor_si256(b5, a3);\
-  \
+  TEMP0 = mm256_xor3( b0, a4, a6 ); \
+  TEMP1 = mm256_xor3( b1, a5, a7 ); \
+  b2 = mm256_xor3( b2, a6, a0 ); \
+  b0 = a0; \
+  b3 = mm256_xor3( b3, a7, a1 ); \
+  b1 = a1; \
+  b6 = mm256_xor3( b6, a4, TEMP2 ); \
+  b4 = mm256_xor3( b4, a0, TEMP2 ); \
+  b7 = mm256_xor3( b7, a5, a3 ); \
+  b5 = mm256_xor3( b5, a1, a3 ); \
  /* compute x_i = t_i + t_{i+3} */\
  a0 = _mm256_xor_si256(a0, a3);\
  a1 = _mm256_xor_si256(a1, a4);\
-  a2 = _mm256_xor_si256(a2, a5);\
+  a2 = _mm256_xor_si256( TEMP2, a5);\
  a3 = _mm256_xor_si256(a3, a6);\
  a4 = _mm256_xor_si256(a4, a7);\
  a5 = _mm256_xor_si256(a5, b0);\
@@ -671,7 +657,6 @@ static const __m256i SUBSH_MASK7_2WAY =
  \
  /* MixBytes */\
  MixBytes_2way(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7);\
-\
 }

 /* 10 rounds, P and Q in parallel */
--- a/algo/groestl/groestl512-intr-4way.h
+++ b/algo/groestl/groestl512-intr-4way.h
@@ -710,7 +710,7 @@ static const __m256i SUBSH_MASK7_2WAY =
  b0 = a2;\
  a1 = _mm256_xor_si256(a1, a2);\
  b1 = a3;\
-  a2 = _mm256_xor_si256(a2, a3);\
+  TEMP2 = _mm256_xor_si256(a2, a3);\
  b2 = a4;\
  a3 = _mm256_xor_si256(a3, a4);\
  b3 = a5;\
@@ -722,34 +722,23 @@ static const __m256i SUBSH_MASK7_2WAY =
  a7 = _mm256_xor_si256(a7, b6);\
  \
  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm256_xor_si256(b0, a4);\
-  b6 = _mm256_xor_si256(b6, a4);\
-  b1 = _mm256_xor_si256(b1, a5);\
-  b7 = _mm256_xor_si256(b7, a5);\
-  b2 = _mm256_xor_si256(b2, a6);\
-  b0 = _mm256_xor_si256(b0, a6);\
+  TEMP0 = mm256_xor3( b0, a4, a6 ); \
  /* spill values y_4, y_5 to memory */\
-  TEMP0 = b0;\
-  b3 = _mm256_xor_si256(b3, a7);\
-  b1 = _mm256_xor_si256(b1, a7);\
-  TEMP1 = b1;\
-  b4 = _mm256_xor_si256(b4, a0);\
-  b2 = _mm256_xor_si256(b2, a0);\
+  TEMP1 = mm256_xor3( b1, a5, a7 ); \
+  b2 = mm256_xor3( b2, a6, a0 ); \
  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  b0 = a0;\
-  b5 = _mm256_xor_si256(b5, a1);\
-  b3 = _mm256_xor_si256(b3, a1);\
-  b1 = a1;\
-  b6 = _mm256_xor_si256(b6, a2);\
-  b4 = _mm256_xor_si256(b4, a2);\
-  TEMP2 = a2;\
-  b7 = _mm256_xor_si256(b7, a3);\
-  b5 = _mm256_xor_si256(b5, a3);\
+  b0 = a0; \
+  b3 = mm256_xor3( b3, a7, a1 ); \
+  b1 = a1; \
+  b6 = mm256_xor3( b6, a4, TEMP2 ); \
+  b4 = mm256_xor3( b4, a0, TEMP2 ); \
+  b7 = mm256_xor3( b7, a5, a3 ); \
+  b5 = mm256_xor3( b5, a1, a3 ); \
  \
  /* compute x_i = t_i + t_{i+3} */\
  a0 = _mm256_xor_si256(a0, a3);\
  a1 = _mm256_xor_si256(a1, a4);\
-  a2 = _mm256_xor_si256(a2, a5);\
+  a2 = _mm256_xor_si256( TEMP2, a5);\
  a3 = _mm256_xor_si256(a3, a6);\
  a4 = _mm256_xor_si256(a4, a7);\
  a5 = _mm256_xor_si256(a5, b0);\
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -562,14 +562,14 @@ do { \
  for ( int u = 0; u < 64; u++ ) \
  { \
     const __mmask8 dm = _mm512_cmplt_epi64_mask( db, zero ); \
-     m0 = _mm512_mask_xor_epi64( m0, dm, m0, m512_const1_64( tp[0] ) ); \
-     m1 = _mm512_mask_xor_epi64( m1, dm, m1, m512_const1_64( tp[1] ) ); \
-     m2 = _mm512_mask_xor_epi64( m2, dm, m2, m512_const1_64( tp[2] ) ); \
-     m3 = _mm512_mask_xor_epi64( m3, dm, m3, m512_const1_64( tp[3] ) ); \
-     m4 = _mm512_mask_xor_epi64( m4, dm, m4, m512_const1_64( tp[4] ) ); \
-     m5 = _mm512_mask_xor_epi64( m5, dm, m5, m512_const1_64( tp[5] ) ); \
-     m6 = _mm512_mask_xor_epi64( m6, dm, m6, m512_const1_64( tp[6] ) ); \
-     m7 = _mm512_mask_xor_epi64( m7, dm, m7, m512_const1_64( tp[7] ) ); \
+     m0 = _mm512_mask_xor_epi64( m0, dm, m0, _mm512_set1_epi64( tp[0] ) ); \
+     m1 = _mm512_mask_xor_epi64( m1, dm, m1, _mm512_set1_epi64( tp[1] ) ); \
+     m2 = _mm512_mask_xor_epi64( m2, dm, m2, _mm512_set1_epi64( tp[2] ) ); \
+     m3 = _mm512_mask_xor_epi64( m3, dm, m3, _mm512_set1_epi64( tp[3] ) ); \
+     m4 = _mm512_mask_xor_epi64( m4, dm, m4, _mm512_set1_epi64( tp[4] ) ); \
+     m5 = _mm512_mask_xor_epi64( m5, dm, m5, _mm512_set1_epi64( tp[5] ) ); \
+     m6 = _mm512_mask_xor_epi64( m6, dm, m6, _mm512_set1_epi64( tp[6] ) ); \
+     m7 = _mm512_mask_xor_epi64( m7, dm, m7, _mm512_set1_epi64( tp[7] ) ); \
     db = _mm512_ror_epi64( db, 1 ); \
     tp += 8; \
  } \
@@ -733,17 +733,17 @@ do { \
   __m512i alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
   for( int i = 0; i < 16; i++ ) \
-      alpha[i] = m512_const1_64( ( (uint64_t*)alpha_n )[i] ); \
+      alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_n )[i] ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( (1ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( (1ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( (2ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( (2ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( (3ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( (3ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( (4ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( (4ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( (5ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( (5ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
 } while (0)

@@ -752,29 +752,29 @@ do { \
   __m512i alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
   for( int i = 0; i < 16; i++ ) \
-      alpha[i] = m512_const1_64( ( (uint64_t*)alpha_f )[i] ); \
+      alpha[i] = _mm512_set1_epi64( ( (uint64_t*)alpha_f )[i] ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 1ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 1ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 2ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 2ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 3ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 3ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 4ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 4ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 5ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 5ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 6ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 6ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 7ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 7ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 8ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 8ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( ( 9ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( ( 9ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( (10ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( (10ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
-   alpha[0] = m512_const1_64( (11ULL << 32) ^ A0 ); \
+   alpha[0] = _mm512_set1_epi64( (11ULL << 32) ^ A0 ); \
   ROUND_BIG8( alpha ); \
 } while (0)

@@ -829,14 +829,14 @@ void hamsi512_8way_init( hamsi_8way_big_context *sc )
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;

-   sc->h[0] = m512_const1_64( 0x6c70617273746565 );
-   sc->h[1] = m512_const1_64( 0x656e62656b204172 );
-   sc->h[2] = m512_const1_64( 0x302c206272672031 );
-   sc->h[3] = m512_const1_64( 0x3434362c75732032 );
-   sc->h[4] = m512_const1_64( 0x3030312020422d33 );
-   sc->h[5] = m512_const1_64( 0x656e2d484c657576 );
-   sc->h[6] = m512_const1_64( 0x6c65652c65766572 );
-   sc->h[7] = m512_const1_64( 0x6769756d2042656c );
+   sc->h[0] = _mm512_set1_epi64( 0x6c70617273746565 );
+   sc->h[1] = _mm512_set1_epi64( 0x656e62656b204172 );
+   sc->h[2] = _mm512_set1_epi64( 0x302c206272672031 );
+   sc->h[3] = _mm512_set1_epi64( 0x3434362c75732032 );
+   sc->h[4] = _mm512_set1_epi64( 0x3030312020422d33 );
+   sc->h[5] = _mm512_set1_epi64( 0x656e2d484c657576 );
+   sc->h[6] = _mm512_set1_epi64( 0x6c65652c65766572 );
+   sc->h[7] = _mm512_set1_epi64( 0x6769756d2042656c );
 }

 void hamsi512_8way_update( hamsi_8way_big_context *sc, const void *data,
@@ -859,7 +859,7 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )
   sph_enc32be( &ch, sc->count_high );
   sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
   pad[0] = _mm512_set1_epi64( ((uint64_t)cl << 32 ) | (uint64_t)ch );
-   sc->buf[0] = m512_const1_64( 0x80 );
+   sc->buf[0] = _mm512_set1_epi64( 0x80 );
   hamsi_8way_big( sc, sc->buf, 1 );
   hamsi_8way_big_final( sc, pad );

@@ -870,6 +870,32 @@ void hamsi512_8way_close( hamsi_8way_big_context *sc, void *dst )

 // Hamsi 4 way AVX2

+#if defined(__AVX512VL__)
+
+#define INPUT_BIG /* AVX512VL form: for every set bit of *buf, XOR 8 T512 qwords into m0..m7 */ \
+do { \
+  __m256i db = _mm256_ror_epi64( *buf, 1 ); /* bit 0 of each 64-bit lane -> sign bit */ \
+  const __m256i zero = m256_zero; \
+  const uint64_t *tp = (const uint64_t*)T512; \
+  m0 = m1 = m2 = m3 = m4 = m5 = m6 = m7 = zero; \
+  for ( int u = 0; u < 64; u++ ) /* pass u examines original bit u of each lane */ \
+  { \
+     const __mmask8 dm = _mm256_cmplt_epi64_mask( db, zero ); /* lanes whose sign bit is set */ \
+     m0 = _mm256_mask_xor_epi64( m0, dm, m0, _mm256_set1_epi64x( tp[0] ) ); \
+     m1 = _mm256_mask_xor_epi64( m1, dm, m1, _mm256_set1_epi64x( tp[1] ) ); \
+     m2 = _mm256_mask_xor_epi64( m2, dm, m2, _mm256_set1_epi64x( tp[2] ) ); \
+     m3 = _mm256_mask_xor_epi64( m3, dm, m3, _mm256_set1_epi64x( tp[3] ) ); \
+     m4 = _mm256_mask_xor_epi64( m4, dm, m4, _mm256_set1_epi64x( tp[4] ) ); \
+     m5 = _mm256_mask_xor_epi64( m5, dm, m5, _mm256_set1_epi64x( tp[5] ) ); \
+     m6 = _mm256_mask_xor_epi64( m6, dm, m6, _mm256_set1_epi64x( tp[6] ) ); \
+     m7 = _mm256_mask_xor_epi64( m7, dm, m7, _mm256_set1_epi64x( tp[7] ) ); \
+     db = _mm256_ror_epi64( db, 1 ); /* next higher bit -> sign bit */ \
+     tp += 8; /* 8 table qwords consumed per bit */ \
+  } \
+} while (0)
+
+#else
+
 #define INPUT_BIG \
 do { \
  __m256i db = *buf; \
@@ -880,25 +906,58 @@ do { \
  { \
     __m256i dm = _mm256_cmpgt_epi64( zero, _mm256_slli_epi64( db, u ) ); \
     m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
-                                          m256_const1_64( tp[0] ) ) ); \
+                                          _mm256_set1_epi64x( tp[0] ) ) ); \
     m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
-                                          m256_const1_64( tp[1] ) ) ); \
+                                          _mm256_set1_epi64x( tp[1] ) ) ); \
     m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
-                                          m256_const1_64( tp[2] ) ) ); \
+                                          _mm256_set1_epi64x( tp[2] ) ) ); \
     m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
-                                          m256_const1_64( tp[3] ) ) ); \
+                                          _mm256_set1_epi64x( tp[3] ) ) ); \
     m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
-                                          m256_const1_64( tp[4] ) ) ); \
+                                          _mm256_set1_epi64x( tp[4] ) ) ); \
     m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
-                                          m256_const1_64( tp[5] ) ) ); \
+                                          _mm256_set1_epi64x( tp[5] ) ) ); \
     m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
-                                          m256_const1_64( tp[6] ) ) ); \
+                                          _mm256_set1_epi64x( tp[6] ) ) ); \
     m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
-                                          m256_const1_64( tp[7] ) ) ); \
+                                          _mm256_set1_epi64x( tp[7] ) ) ); \
     tp += 8; \
  } \
 } while (0)

+#endif
+
+#define SBOX( a, b, c, d ) \
+do { \
+  __m256i t; \
+  t = a; \
+  a = mm256_xorand( d, a, c ); \
+  c = mm256_xor3( a, b, c ); \
+  b = mm256_xoror( b, d, t ); \
+  t = _mm256_xor_si256( t, c ); \
+  d = mm256_xoror( a, b, t ); \
+  t = mm256_xorand( t, a, b ); \
+  a = c; \
+  c = mm256_xor3( b, d, t ); \
+  b = d; \
+  d = mm256_not( t ); \
+} while (0)
+
+#define L( a, b, c, d ) \
+do { \
+   a = mm256_rol_32( a, 13 ); \
+   c = mm256_rol_32( c,  3 ); \
+   b = mm256_xor3( a, b, c ); \
+   d = mm256_xor3( d, c, _mm256_slli_epi32( a, 3 ) ); \
+   b = mm256_rol_32( b, 1 ); \
+   d = mm256_rol_32( d, 7 ); \
+   a = mm256_xor3( a, b, d ); \
+   c = mm256_xor3( c, d, _mm256_slli_epi32( b, 7 ) ); \
+   a = mm256_rol_32( a,  5 ); \
+   c = mm256_rol_32( c, 22 ); \
+} while (0)
+
+/*
 #define SBOX( a, b, c, d ) \
 do { \
  __m256i t; \
@@ -937,6 +996,7 @@ do { \
   a = mm256_rol_32( a,  5 ); \
   c = mm256_rol_32( c, 22 ); \
 } while (0)
+*/

 #define DECL_STATE_BIG \
   __m256i c0, c1, c2, c3, c4, c5, c6, c7; \
@@ -1066,17 +1126,17 @@ do { \
   __m256i alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_n )[0]; \
   for( int i = 0; i < 16; i++ ) \
-      alpha[i] = m256_const1_64( ( (uint64_t*)alpha_n )[i] ); \
+      alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_n )[i] ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( (1ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( (1ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( (2ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( (2ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( (3ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( (3ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( (4ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( (4ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( (5ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( (5ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
 } while (0)

@@ -1085,29 +1145,29 @@ do { \
   __m256i alpha[16]; \
   const uint64_t A0 = ( (uint64_t*)alpha_f )[0]; \
   for( int i = 0; i < 16; i++ ) \
-      alpha[i] = m256_const1_64( ( (uint64_t*)alpha_f )[i] ); \
+      alpha[i] = _mm256_set1_epi64x( ( (uint64_t*)alpha_f )[i] ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 1ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 1ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 2ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 2ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 3ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 3ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 4ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 4ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 5ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 5ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 6ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 6ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 7ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 7ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 8ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 8ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( ( 9ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( ( 9ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( (10ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( (10ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
-   alpha[0] = m256_const1_64( (11ULL << 32) ^ A0 ); \
+   alpha[0] = _mm256_set1_epi64x( (11ULL << 32) ^ A0 ); \
   ROUND_BIG( alpha ); \
 } while (0)

@@ -1163,14 +1223,14 @@ void hamsi512_4way_init( hamsi_4way_big_context *sc )
   sc->partial_len = 0;
   sc->count_high = sc->count_low = 0;

-   sc->h[0] = m256_const1_64( 0x6c70617273746565 );
-   sc->h[1] = m256_const1_64( 0x656e62656b204172 );
-   sc->h[2] = m256_const1_64( 0x302c206272672031 );
-   sc->h[3] = m256_const1_64( 0x3434362c75732032 );
-   sc->h[4] = m256_const1_64( 0x3030312020422d33 );
-   sc->h[5] = m256_const1_64( 0x656e2d484c657576 );
-   sc->h[6] = m256_const1_64( 0x6c65652c65766572 );
-   sc->h[7] = m256_const1_64( 0x6769756d2042656c );
+   sc->h[0] = _mm256_set1_epi64x( 0x6c70617273746565 );
+   sc->h[1] = _mm256_set1_epi64x( 0x656e62656b204172 );
+   sc->h[2] = _mm256_set1_epi64x( 0x302c206272672031 );
+   sc->h[3] = _mm256_set1_epi64x( 0x3434362c75732032 );
+   sc->h[4] = _mm256_set1_epi64x( 0x3030312020422d33 );
+   sc->h[5] = _mm256_set1_epi64x( 0x656e2d484c657576 );
+   sc->h[6] = _mm256_set1_epi64x( 0x6c65652c65766572 );
+   sc->h[7] = _mm256_set1_epi64x( 0x6769756d2042656c );
 }

 void hamsi512_4way_update( hamsi_4way_big_context *sc, const void *data,
@@ -1193,7 +1253,7 @@ void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
   sph_enc32be( &ch, sc->count_high );
   sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
   pad[0] = _mm256_set1_epi64x( ((uint64_t)cl << 32 ) | (uint64_t)ch );
-   sc->buf[0] = m256_const1_64( 0x80 );
+   sc->buf[0] = _mm256_set1_epi64x( 0x80 );
   hamsi_big( sc, sc->buf, 1 );
   hamsi_big_final( sc, pad );

--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -52,6 +52,56 @@ extern "C"{
 #define SPH_SMALL_FOOTPRINT_HAVAL   1
 //#endif

+#if defined(__AVX512VL__)
+
+// ( ~( a ^ b ) ) & c
+#define mm128_andnotxor( a, b, c ) \
+   _mm_ternarylogic_epi32( a, b, c, 0x82  )
+
+#else
+
+#define mm128_andnotxor( a, b, c ) \
+   _mm_andnot_si128( _mm_xor_si128( a, b ), c )
+
+#endif
+
+#define F1(x6, x5, x4, x3, x2, x1, x0) /* mm128_xor3 of x0, the andxor term and ( x2 & x5 ) ^ ( x3 & x6 ); last line must not end in a continuation */ \
+ mm128_xor3( x0, mm128_andxor( x1, x0, x4 ), \
+                 _mm_xor_si128( _mm_and_si128( x2, x5 ), \
+                                _mm_and_si128( x3, x6 ) ) )
+
+#define F2(x6, x5, x4, x3, x2, x1, x0) /* mm128_xor3 (presumably a 3-way XOR) of the three terms below; last line must not end in a continuation */ \
+   mm128_xor3( mm128_andxor( x2, _mm_andnot_si128( x3, x1 ), \
+                       mm128_xor3( _mm_and_si128( x4, x5 ), x6, x0 )  ), \
+               mm128_andxor( x4, x1, x5 ), \
+               mm128_xorand( x0, x3, x5 ) )
+
+#define F3(x6, x5, x4, x3, x2, x1, x0) \
+  mm128_xor3( x0, \
+              _mm_and_si128( x3, \
+                         mm128_xor3( _mm_and_si128( x1, x2 ), x6, x0 ) ), \
+              _mm_xor_si128( _mm_and_si128( x1, x4 ), \
+                             _mm_and_si128( x2, x5 ) ) )
+
+#define F4(x6, x5, x4, x3, x2, x1, x0) \
+  mm128_xor3( \
+      mm128_andxor( x3, x5, \
+                    _mm_xor_si128( _mm_and_si128( x1, x2 ), \
+                                      _mm_or_si128( x4, x6 ) ) ), \
+      _mm_and_si128( x4, \
+                        mm128_xor3( x0, _mm_andnot_si128( x2, x5 ), \
+                                    _mm_xor_si128( x1, x6 ) ) ), \
+      mm128_xorand( x0, x2, x6 ) )
+
+#define F5(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( \
+         mm128_andnotxor( mm128_and3( x1, x2, x3 ), x5, x0 ), \
+         mm128_xor3( _mm_and_si128( x1, x4 ), \
+                     _mm_and_si128( x2, x5 ), \
+                     _mm_and_si128( x3, x6 ) ) )
+  
+
+/*
 #define F1(x6, x5, x4, x3, x2, x1, x0) \
   _mm_xor_si128( x0, \
       _mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
@@ -96,6 +146,7 @@ extern "C"{
      _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
                                    _mm_and_si128( x2, x5 ) ), \
                                    _mm_and_si128( x3, x6 ) ) )
+*/

 /*
 * The macros below integrate the phi() permutations, depending on the
@@ -740,14 +791,14 @@ do { \
 static void
 haval_8way_init( haval_8way_context *sc, unsigned olen, unsigned passes )
 {
-   sc->s0 = m256_const1_32( 0x243F6A88UL );
-   sc->s1 = m256_const1_32( 0x85A308D3UL );
-   sc->s2 = m256_const1_32( 0x13198A2EUL );
-   sc->s3 = m256_const1_32( 0x03707344UL );
-   sc->s4 = m256_const1_32( 0xA4093822UL );
-   sc->s5 = m256_const1_32( 0x299F31D0UL );
-   sc->s6 = m256_const1_32( 0x082EFA98UL );
-   sc->s7 = m256_const1_32( 0xEC4E6C89UL );
+   sc->s0 = _mm256_set1_epi32( 0x243F6A88UL );
+   sc->s1 = _mm256_set1_epi32( 0x85A308D3UL );
+   sc->s2 = _mm256_set1_epi32( 0x13198A2EUL );
+   sc->s3 = _mm256_set1_epi32( 0x03707344UL );
+   sc->s4 = _mm256_set1_epi32( 0xA4093822UL );
+   sc->s5 = _mm256_set1_epi32( 0x299F31D0UL );
+   sc->s6 = _mm256_set1_epi32( 0x082EFA98UL );
+   sc->s7 = _mm256_set1_epi32( 0xEC4E6C89UL );
   sc->olen = olen;
   sc->passes = passes;
   sc->count_high = 0;
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -76,19 +76,31 @@ do { \

 #endif

+#if defined(__AVX512VL__)
+//TODO enable for AVX10_256. NOTE(review): said to be unused in AVX512VL builds, presumably because the 8-way path is taken - confirm
+
+#define notxorandnot( a, b, c ) \
+   _mm256_ternarylogic_epi64( a, b, c, 0x2d ) /* imm8 0x2d encodes ~a ^ ( ~b & c ) */
+
+#else
+
+#define notxorandnot( a, b, c ) \
+   _mm256_xor_si256( mm256_not( a ), _mm256_andnot_si256( b, c ) ) /* ~a ^ ( ~b & c ), two-instruction form */
+
+#endif
+
 #define Sb(x0, x1, x2, x3, c) \
 do { \
-   const __m256i cc = _mm256_set1_epi64x( c ); \
-    x3 = mm256_not( x3 ); \
-    x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
-    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
-    x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
-    x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
-    x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
-    x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
-    x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
-    x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
-    x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
+    const __m256i cc = _mm256_set1_epi64x( c ); \
+    x0 = mm256_xorandnot( x0, x2, cc ); \
+    tmp = mm256_xorand( cc, x0, x1 ); \
+    x0 = mm256_xorandnot( x0, x3, x2 ); \
+    x3 = notxorandnot( x3, x1, x2 ); \
+    x1 = mm256_xorand( x1, x0, x2 ); \
+    x2 = mm256_xorandnot( x2, x3, x0 ); \
+    x0 = mm256_xoror( x0, x1, x3 ); \
+    x3 = mm256_xorand( x3, x1, x2 ); \
+    x1 = mm256_xorand( x1, tmp, x0 ); \
    x2 = _mm256_xor_si256( x2, tmp ); \
 } while (0)

@@ -96,11 +108,11 @@ do { \
 do { \
    x4 = _mm256_xor_si256( x4, x1 ); \
    x5 = _mm256_xor_si256( x5, x2 ); \
-    x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
+    x6 = mm256_xor3( x6, x3, x0 ); \
    x7 = _mm256_xor_si256( x7, x0 ); \
    x0 = _mm256_xor_si256( x0, x5 ); \
    x1 = _mm256_xor_si256( x1, x6 ); \
-    x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
+    x2 = mm256_xor3( x2, x7, x4 ); \
    x3 = _mm256_xor_si256( x3, x4 ); \
 } while (0)

@@ -323,12 +335,12 @@ do { \
 } while (0)


-#define W80(x)   Wz_8W(x, m512_const1_64( 0x5555555555555555 ),  1 )
-#define W81(x)   Wz_8W(x, m512_const1_64( 0x3333333333333333 ),  2 )
-#define W82(x)   Wz_8W(x, m512_const1_64( 0x0F0F0F0F0F0F0F0F ),  4 )
-#define W83(x)   Wz_8W(x, m512_const1_64( 0x00FF00FF00FF00FF ),  8 ) 
-#define W84(x)   Wz_8W(x, m512_const1_64( 0x0000FFFF0000FFFF ), 16 )
-#define W85(x)   Wz_8W(x, m512_const1_64( 0x00000000FFFFFFFF ), 32 )
+#define W80(x)   Wz_8W(x, _mm512_set1_epi64( 0x5555555555555555 ),  1 )
+#define W81(x)   Wz_8W(x, _mm512_set1_epi64( 0x3333333333333333 ),  2 )
+#define W82(x)   Wz_8W(x, _mm512_set1_epi64( 0x0F0F0F0F0F0F0F0F ),  4 )
+#define W83(x)   Wz_8W(x, _mm512_set1_epi64( 0x00FF00FF00FF00FF ),  8 ) 
+#define W84(x)   Wz_8W(x, _mm512_set1_epi64( 0x0000FFFF0000FFFF ), 16 )
+#define W85(x)   Wz_8W(x, _mm512_set1_epi64( 0x00000000FFFFFFFF ), 32 )
 #define W86(x) \
 do { \
   __m512i t = x ## h; \
@@ -352,12 +364,12 @@ do { \
   x ## l = _mm256_or_si256( _mm256_and_si256((x ## l >> (n)), (c)), t ); \
 } while (0)

-#define W0(x)   Wz(x, m256_const1_64( 0x5555555555555555 ),  1 )
-#define W1(x)   Wz(x, m256_const1_64( 0x3333333333333333 ),  2 )
-#define W2(x)   Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ),  4 )
-#define W3(x)   Wz(x, m256_const1_64( 0x00FF00FF00FF00FF ),  8 ) 
-#define W4(x)   Wz(x, m256_const1_64( 0x0000FFFF0000FFFF ), 16 )
-#define W5(x)   Wz(x, m256_const1_64( 0x00000000FFFFFFFF ), 32 )
+#define W0(x)   Wz(x, _mm256_set1_epi64x( 0x5555555555555555 ),  1 )
+#define W1(x)   Wz(x, _mm256_set1_epi64x( 0x3333333333333333 ),  2 )
+#define W2(x)   Wz(x, _mm256_set1_epi64x( 0x0F0F0F0F0F0F0F0F ),  4 )
+#define W3(x)   Wz(x, _mm256_set1_epi64x( 0x00FF00FF00FF00FF ),  8 ) 
+#define W4(x)   Wz(x, _mm256_set1_epi64x( 0x0000FFFF0000FFFF ), 16 )
+#define W5(x)   Wz(x, _mm256_set1_epi64x( 0x00000000FFFFFFFF ), 32 )
 #define W6(x) \
 do { \
   __m256i t = x ## h; \
@@ -624,22 +636,22 @@ static const sph_u64 IV512[] = {
 void jh256_8way_init( jh_8way_context *sc )
 {
    // bswapped IV256
-    sc->H[ 0] = m512_const1_64( 0xebd3202c41a398eb );
-    sc->H[ 1] = m512_const1_64( 0xc145b29c7bbecd92 );
-    sc->H[ 2] = m512_const1_64( 0xfac7d4609151931c );
-    sc->H[ 3] = m512_const1_64( 0x038a507ed6820026 );
-    sc->H[ 4] = m512_const1_64( 0x45b92677269e23a4 );
-    sc->H[ 5] = m512_const1_64( 0x77941ad4481afbe0 );
-    sc->H[ 6] = m512_const1_64( 0x7a176b0226abb5cd );
-    sc->H[ 7] = m512_const1_64( 0xa82fff0f4224f056 );
-    sc->H[ 8] = m512_const1_64( 0x754d2e7f8996a371 );
-    sc->H[ 9] = m512_const1_64( 0x62e27df70849141d );
-    sc->H[10] = m512_const1_64( 0x948f2476f7957627 );
-    sc->H[11] = m512_const1_64( 0x6c29804757b6d587 );
-    sc->H[12] = m512_const1_64( 0x6c0d8eac2d275e5c );
-    sc->H[13] = m512_const1_64( 0x0f7a0557c6508451 );
-    sc->H[14] = m512_const1_64( 0xea12247067d3e47b );
-    sc->H[15] = m512_const1_64( 0x69d71cd313abe389 );
+    sc->H[ 0] = _mm512_set1_epi64( 0xebd3202c41a398eb );
+    sc->H[ 1] = _mm512_set1_epi64( 0xc145b29c7bbecd92 );
+    sc->H[ 2] = _mm512_set1_epi64( 0xfac7d4609151931c );
+    sc->H[ 3] = _mm512_set1_epi64( 0x038a507ed6820026 );
+    sc->H[ 4] = _mm512_set1_epi64( 0x45b92677269e23a4 );
+    sc->H[ 5] = _mm512_set1_epi64( 0x77941ad4481afbe0 );
+    sc->H[ 6] = _mm512_set1_epi64( 0x7a176b0226abb5cd );
+    sc->H[ 7] = _mm512_set1_epi64( 0xa82fff0f4224f056 );
+    sc->H[ 8] = _mm512_set1_epi64( 0x754d2e7f8996a371 );
+    sc->H[ 9] = _mm512_set1_epi64( 0x62e27df70849141d );
+    sc->H[10] = _mm512_set1_epi64( 0x948f2476f7957627 );
+    sc->H[11] = _mm512_set1_epi64( 0x6c29804757b6d587 );
+    sc->H[12] = _mm512_set1_epi64( 0x6c0d8eac2d275e5c );
+    sc->H[13] = _mm512_set1_epi64( 0x0f7a0557c6508451 );
+    sc->H[14] = _mm512_set1_epi64( 0xea12247067d3e47b );
+    sc->H[15] = _mm512_set1_epi64( 0x69d71cd313abe389 );
    sc->ptr = 0;
    sc->block_count = 0;
 }
@@ -647,22 +659,22 @@ void jh256_8way_init( jh_8way_context *sc )
 void jh512_8way_init( jh_8way_context *sc )
 {
    // bswapped IV512
-    sc->H[ 0] = m512_const1_64( 0x17aa003e964bd16f );
-    sc->H[ 1] = m512_const1_64( 0x43d5157a052e6a63 );
-    sc->H[ 2] = m512_const1_64( 0x0bef970c8d5e228a );
-    sc->H[ 3] = m512_const1_64( 0x61c3b3f2591234e9 );
-    sc->H[ 4] = m512_const1_64( 0x1e806f53c1a01d89 );
-    sc->H[ 5] = m512_const1_64( 0x806d2bea6b05a92a );
-    sc->H[ 6] = m512_const1_64( 0xa6ba7520dbcc8e58 );
-    sc->H[ 7] = m512_const1_64( 0xf73bf8ba763a0fa9 );
-    sc->H[ 8] = m512_const1_64( 0x694ae34105e66901 );
-    sc->H[ 9] = m512_const1_64( 0x5ae66f2e8e8ab546 );
-    sc->H[10] = m512_const1_64( 0x243c84c1d0a74710 );
-    sc->H[11] = m512_const1_64( 0x99c15a2db1716e3b );
-    sc->H[12] = m512_const1_64( 0x56f8b19decf657cf );
-    sc->H[13] = m512_const1_64( 0x56b116577c8806a7 );
-    sc->H[14] = m512_const1_64( 0xfb1785e6dffcc2e3 );
-    sc->H[15] = m512_const1_64( 0x4bdd8ccc78465a54 );
+    sc->H[ 0] = _mm512_set1_epi64( 0x17aa003e964bd16f );
+    sc->H[ 1] = _mm512_set1_epi64( 0x43d5157a052e6a63 );
+    sc->H[ 2] = _mm512_set1_epi64( 0x0bef970c8d5e228a );
+    sc->H[ 3] = _mm512_set1_epi64( 0x61c3b3f2591234e9 );
+    sc->H[ 4] = _mm512_set1_epi64( 0x1e806f53c1a01d89 );
+    sc->H[ 5] = _mm512_set1_epi64( 0x806d2bea6b05a92a );
+    sc->H[ 6] = _mm512_set1_epi64( 0xa6ba7520dbcc8e58 );
+    sc->H[ 7] = _mm512_set1_epi64( 0xf73bf8ba763a0fa9 );
+    sc->H[ 8] = _mm512_set1_epi64( 0x694ae34105e66901 );
+    sc->H[ 9] = _mm512_set1_epi64( 0x5ae66f2e8e8ab546 );
+    sc->H[10] = _mm512_set1_epi64( 0x243c84c1d0a74710 );
+    sc->H[11] = _mm512_set1_epi64( 0x99c15a2db1716e3b );
+    sc->H[12] = _mm512_set1_epi64( 0x56f8b19decf657cf );
+    sc->H[13] = _mm512_set1_epi64( 0x56b116577c8806a7 );
+    sc->H[14] = _mm512_set1_epi64( 0xfb1785e6dffcc2e3 );
+    sc->H[15] = _mm512_set1_epi64( 0x4bdd8ccc78465a54 );
    sc->ptr = 0;
    sc->block_count = 0;
 }
@@ -721,7 +733,7 @@ jh_8way_close( jh_8way_context *sc, unsigned ub, unsigned n, void *dst,
   size_t numz, u;
   uint64_t l0, l1;

-   buf[0] = m512_const1_64( 0x80ULL );
+   buf[0] = _mm512_set1_epi64( 0x80ULL );

   if ( sc->ptr == 0 )
       numz = 48;
@@ -772,22 +784,22 @@ jh512_8way_close(void *cc, void *dst)
 void jh256_4way_init( jh_4way_context *sc )
 {
    // bswapped IV256
-    sc->H[ 0] = m256_const1_64( 0xebd3202c41a398eb );
-    sc->H[ 1] = m256_const1_64( 0xc145b29c7bbecd92 );
-    sc->H[ 2] = m256_const1_64( 0xfac7d4609151931c );
-    sc->H[ 3] = m256_const1_64( 0x038a507ed6820026 );
-    sc->H[ 4] = m256_const1_64( 0x45b92677269e23a4 );
-    sc->H[ 5] = m256_const1_64( 0x77941ad4481afbe0 );
-    sc->H[ 6] = m256_const1_64( 0x7a176b0226abb5cd );
-    sc->H[ 7] = m256_const1_64( 0xa82fff0f4224f056 );
-    sc->H[ 8] = m256_const1_64( 0x754d2e7f8996a371 );
-    sc->H[ 9] = m256_const1_64( 0x62e27df70849141d );
-    sc->H[10] = m256_const1_64( 0x948f2476f7957627 );
-    sc->H[11] = m256_const1_64( 0x6c29804757b6d587 );
-    sc->H[12] = m256_const1_64( 0x6c0d8eac2d275e5c );
-    sc->H[13] = m256_const1_64( 0x0f7a0557c6508451 );
-    sc->H[14] = m256_const1_64( 0xea12247067d3e47b );
-    sc->H[15] = m256_const1_64( 0x69d71cd313abe389 );
+    sc->H[ 0] = _mm256_set1_epi64x( 0xebd3202c41a398eb );
+    sc->H[ 1] = _mm256_set1_epi64x( 0xc145b29c7bbecd92 );
+    sc->H[ 2] = _mm256_set1_epi64x( 0xfac7d4609151931c );
+    sc->H[ 3] = _mm256_set1_epi64x( 0x038a507ed6820026 );
+    sc->H[ 4] = _mm256_set1_epi64x( 0x45b92677269e23a4 );
+    sc->H[ 5] = _mm256_set1_epi64x( 0x77941ad4481afbe0 );
+    sc->H[ 6] = _mm256_set1_epi64x( 0x7a176b0226abb5cd );
+    sc->H[ 7] = _mm256_set1_epi64x( 0xa82fff0f4224f056 );
+    sc->H[ 8] = _mm256_set1_epi64x( 0x754d2e7f8996a371 );
+    sc->H[ 9] = _mm256_set1_epi64x( 0x62e27df70849141d );
+    sc->H[10] = _mm256_set1_epi64x( 0x948f2476f7957627 );
+    sc->H[11] = _mm256_set1_epi64x( 0x6c29804757b6d587 );
+    sc->H[12] = _mm256_set1_epi64x( 0x6c0d8eac2d275e5c );
+    sc->H[13] = _mm256_set1_epi64x( 0x0f7a0557c6508451 );
+    sc->H[14] = _mm256_set1_epi64x( 0xea12247067d3e47b );
+    sc->H[15] = _mm256_set1_epi64x( 0x69d71cd313abe389 );
    sc->ptr = 0;
    sc->block_count = 0;
 }
@@ -795,22 +807,22 @@ void jh256_4way_init( jh_4way_context *sc )
 void jh512_4way_init( jh_4way_context *sc )
 {
    // bswapped IV512
-    sc->H[ 0] = m256_const1_64( 0x17aa003e964bd16f );
-    sc->H[ 1] = m256_const1_64( 0x43d5157a052e6a63 );
-    sc->H[ 2] = m256_const1_64( 0x0bef970c8d5e228a );
-    sc->H[ 3] = m256_const1_64( 0x61c3b3f2591234e9 );
-    sc->H[ 4] = m256_const1_64( 0x1e806f53c1a01d89 );
-    sc->H[ 5] = m256_const1_64( 0x806d2bea6b05a92a );
-    sc->H[ 6] = m256_const1_64( 0xa6ba7520dbcc8e58 );
-    sc->H[ 7] = m256_const1_64( 0xf73bf8ba763a0fa9 );
-    sc->H[ 8] = m256_const1_64( 0x694ae34105e66901 );
-    sc->H[ 9] = m256_const1_64( 0x5ae66f2e8e8ab546 );
-    sc->H[10] = m256_const1_64( 0x243c84c1d0a74710 );
-    sc->H[11] = m256_const1_64( 0x99c15a2db1716e3b );
-    sc->H[12] = m256_const1_64( 0x56f8b19decf657cf );
-    sc->H[13] = m256_const1_64( 0x56b116577c8806a7 );
-    sc->H[14] = m256_const1_64( 0xfb1785e6dffcc2e3 );
-    sc->H[15] = m256_const1_64( 0x4bdd8ccc78465a54 );
+    sc->H[ 0] = _mm256_set1_epi64x( 0x17aa003e964bd16f );
+    sc->H[ 1] = _mm256_set1_epi64x( 0x43d5157a052e6a63 );
+    sc->H[ 2] = _mm256_set1_epi64x( 0x0bef970c8d5e228a );
+    sc->H[ 3] = _mm256_set1_epi64x( 0x61c3b3f2591234e9 );
+    sc->H[ 4] = _mm256_set1_epi64x( 0x1e806f53c1a01d89 );
+    sc->H[ 5] = _mm256_set1_epi64x( 0x806d2bea6b05a92a );
+    sc->H[ 6] = _mm256_set1_epi64x( 0xa6ba7520dbcc8e58 );
+    sc->H[ 7] = _mm256_set1_epi64x( 0xf73bf8ba763a0fa9 );
+    sc->H[ 8] = _mm256_set1_epi64x( 0x694ae34105e66901 );
+    sc->H[ 9] = _mm256_set1_epi64x( 0x5ae66f2e8e8ab546 );
+    sc->H[10] = _mm256_set1_epi64x( 0x243c84c1d0a74710 );
+    sc->H[11] = _mm256_set1_epi64x( 0x99c15a2db1716e3b );
+    sc->H[12] = _mm256_set1_epi64x( 0x56f8b19decf657cf );
+    sc->H[13] = _mm256_set1_epi64x( 0x56b116577c8806a7 );
+    sc->H[14] = _mm256_set1_epi64x( 0xfb1785e6dffcc2e3 );
+    sc->H[15] = _mm256_set1_epi64x( 0x4bdd8ccc78465a54 );
    sc->ptr = 0;
    sc->block_count = 0;
 }
@@ -869,7 +881,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
   size_t numz, u;
   uint64_t l0, l1;

-   buf[0] = m256_const1_64( 0x80ULL );
+   buf[0] = _mm256_set1_epi64x( 0x80ULL );

   if ( sc->ptr == 0 )
       numz = 48;
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -49,7 +49,7 @@ int scanhash_keccak_8way( struct work *work, uint32_t max_nonce,
          }
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;

   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
@@ -101,7 +101,7 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
          }
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
   pdata[19] = n;
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -180,15 +180,15 @@ static void keccak64_8way_close( keccak64_ctx_m512i *kc, void *dst,
    if ( kc->ptr == (lim - 8) )
    {
        const uint64_t t = eb | 0x8000000000000000;
-        u.tmp[0] = m512_const1_64( t );
+        u.tmp[0] = _mm512_set1_epi64( t );
        j = 8;
    }
    else
    {
        j = lim - kc->ptr;
-        u.tmp[0] = m512_const1_64( eb );
+        u.tmp[0] = _mm512_set1_epi64( eb );
        memset_zero_512( u.tmp + 1, (j>>3) - 2 );
-        u.tmp[ (j>>3) - 1] = m512_const1_64( 0x8000000000000000 );
+        u.tmp[ (j>>3) - 1] = _mm512_set1_epi64( 0x8000000000000000 );
    }
    keccak64_8way_core( kc, u.tmp, j, lim );
    /* Finalize the "lane complement" */
@@ -264,8 +264,8 @@ keccak512_8way_close(void *cc, void *dst)
 #define OR64(d, a, b)      (d = _mm256_or_si256(a,b))
 #define NOT64(d, s)        (d = mm256_not( s ) )
 #define ROL64(d, v, n)     (d = mm256_rol_64(v, n))
-#define XOROR(d, a, b, c)  (d = _mm256_xor_si256(a, _mm256_or_si256(b, c)))
-#define XORAND(d, a, b, c) (d = _mm256_xor_si256(a, _mm256_and_si256(b, c)))
+#define XOROR(d, a, b, c)  (d = mm256_xoror( a, b, c ) )
+#define XORAND(d, a, b, c) (d = mm256_xorand( a, b, c ) )
 #define XOR3( d, a, b, c ) (d = mm256_xor3( a, b, c ))

 #include "keccak-macros.c"
@@ -368,15 +368,15 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    if ( kc->ptr == (lim - 8) )
    {
        const uint64_t t = eb | 0x8000000000000000;
-        u.tmp[0] = m256_const1_64( t );
+        u.tmp[0] = _mm256_set1_epi64x( t );
        j = 8;
    }
    else
    {
        j = lim - kc->ptr;
-        u.tmp[0] = m256_const1_64( eb );
+        u.tmp[0] = _mm256_set1_epi64x( eb );
        memset_zero_256( u.tmp + 1, (j>>3) - 2 );
-        u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
+        u.tmp[ (j>>3) - 1] = _mm256_set1_epi64x( 0x8000000000000000 );
    }
    keccak64_core( kc, u.tmp, j, lim );
    /* Finalize the "lane complement" */
--- a/algo/keccak/sha3d-4way.c
+++ b/algo/keccak/sha3d-4way.c
@@ -56,7 +56,7 @@ int scanhash_sha3d_8way( struct work *work, uint32_t max_nonce,
          }
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;

   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
@@ -115,7 +115,7 @@ int scanhash_sha3d_4way( struct work *work, uint32_t max_nonce,
          }
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -69,7 +69,7 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
 #define MULT24W( a0, a1 ) \
 { \
  __m512i b = _mm512_xor_si512( a0, \
-                     _mm512_maskz_shuffle_epi32( 0xbbbb, a1, 16 ) ); \
+                     _mm512_maskz_shuffle_epi32( 0xbbbb, a1, 0x10 ) ); \
  a0 = _mm512_alignr_epi8( a1,  b, 4 ); \
  a1 = _mm512_alignr_epi8(  b, a1, 4 ); \
 }
@@ -107,49 +107,37 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
    ADD_CONSTANT4W( x0, x4, c0, c1 );

 #define STEP_PART24W( a0, a1, t0, t1, c0, c1 ) \
-    a1 = _mm512_shuffle_epi32( a1, 147 ); \
-    t0 = _mm512_load_si512( &a1 ); \
-    a1 = _mm512_unpacklo_epi32( a1, a0 ); \
+    t0 = _mm512_shuffle_epi32( a1, 147 ); \
+    a1 = _mm512_unpacklo_epi32( t0, a0 ); \
    t0 = _mm512_unpackhi_epi32( t0, a0 ); \
    t1 = _mm512_shuffle_epi32( t0, 78 ); \
    a0 = _mm512_shuffle_epi32( a1, 78 ); \
    SUBCRUMB4W( t1, t0, a0, a1 ); \
    t0 = _mm512_unpacklo_epi32( t0, t1 ); \
    a1 = _mm512_unpacklo_epi32( a1, a0 ); \
-    a0 = _mm512_load_si512( &a1 ); \
-    a0 = _mm512_unpackhi_epi64( a0, t0 ); \
+    a0 = _mm512_unpackhi_epi64( a1, t0 ); \
    a1 = _mm512_unpacklo_epi64( a1, t0 ); \
    a1 = _mm512_shuffle_epi32( a1, 57 ); \
    MIXWORD4W( a0, a1 ); \
    ADD_CONSTANT4W( a0, a1, c0, c1 );

 #define NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
-    s1 = _mm512_load_si512(&r3);\
-    q1 = _mm512_load_si512(&p3);\
-    s3 = _mm512_load_si512(&r3);\
-    q3 = _mm512_load_si512(&p3);\
-    s1 = _mm512_unpackhi_epi32(s1,r2);\
-    q1 = _mm512_unpackhi_epi32(q1,p2);\
-    s3 = _mm512_unpacklo_epi32(s3,r2);\
-    q3 = _mm512_unpacklo_epi32(q3,p2);\
-    s0 = _mm512_load_si512(&s1);\
-    q0 = _mm512_load_si512(&q1);\
-    s2 = _mm512_load_si512(&s3);\
-    q2 = _mm512_load_si512(&q3);\
-    r3 = _mm512_load_si512(&r1);\
-    p3 = _mm512_load_si512(&p1);\
-    r1 = _mm512_unpacklo_epi32(r1,r0);\
-    p1 = _mm512_unpacklo_epi32(p1,p0);\
-    r3 = _mm512_unpackhi_epi32(r3,r0);\
-    p3 = _mm512_unpackhi_epi32(p3,p0);\
-    s0 = _mm512_unpackhi_epi64(s0,r3);\
-    q0 = _mm512_unpackhi_epi64(q0,p3);\
-    s1 = _mm512_unpacklo_epi64(s1,r3);\
-    q1 = _mm512_unpacklo_epi64(q1,p3);\
-    s2 = _mm512_unpackhi_epi64(s2,r1);\
-    q2 = _mm512_unpackhi_epi64(q2,p1);\
-    s3 = _mm512_unpacklo_epi64(s3,r1);\
-    q3 = _mm512_unpacklo_epi64(q3,p1);
+    s1 = _mm512_unpackhi_epi32( r3, r2 ); \
+    q1 = _mm512_unpackhi_epi32( p3, p2 ); \
+    s3 = _mm512_unpacklo_epi32( r3, r2 ); \
+    q3 = _mm512_unpacklo_epi32( p3, p2 ); \
+    r3 = _mm512_unpackhi_epi32( r1, r0 ); \
+    r1 = _mm512_unpacklo_epi32( r1, r0 ); \
+    p3 = _mm512_unpackhi_epi32( p1, p0 ); \
+    p1 = _mm512_unpacklo_epi32( p1, p0 ); \
+    s0 = _mm512_unpackhi_epi64( s1, r3 ); \
+    q0 = _mm512_unpackhi_epi64( q1 ,p3 ); \
+    s1 = _mm512_unpacklo_epi64( s1, r3 ); \
+    q1 = _mm512_unpacklo_epi64( q1, p3 ); \
+    s2 = _mm512_unpackhi_epi64( s3, r1 ); \
+    q2 = _mm512_unpackhi_epi64( q3, p1 ); \
+    s3 = _mm512_unpacklo_epi64( s3, r1 ); \
+    q3 = _mm512_unpacklo_epi64( q3, p1 );

 #define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -198,11 +186,8 @@ void rnd512_4way( luffa_4way_context *state, const __m512i *msg )
    chainv[7] = _mm512_xor_si512(chainv[7], chainv[9]);

    MULT24W( chainv[8], chainv[9] );
-    chainv[8] = _mm512_xor_si512( chainv[8], t0 );
-    chainv[9] = _mm512_xor_si512( chainv[9], t1 );
-
-    t0 = chainv[8];
-    t1 = chainv[9];
+    t0 = chainv[8] = _mm512_xor_si512( chainv[8], t0 );
+    t1 = chainv[9] = _mm512_xor_si512( chainv[9], t1 );

    MULT24W( chainv[8], chainv[9] );
    chainv[8] = _mm512_xor_si512( chainv[8], chainv[6] );
@@ -538,10 +523,39 @@ int luffa_4way_update_close( luffa_4way_context *state,
    a = _mm256_xor_si256( a, c0 ); \
    b = _mm256_xor_si256( b, c1 );

+//TODO Enable for AVX10_256, not used with AVX512 or AVX10_512
+#if defined(__AVX512VL__) 
+
+#define MULT2( a0, a1 ) \
+{ \
+  __m256i b = _mm256_xor_si256( a0, \
+                     _mm256_maskz_shuffle_epi32( 0xbb, a1, 0x10 ) ); \
+  a0 = _mm256_alignr_epi8( a1,  b, 4 ); \
+  a1 = _mm256_alignr_epi8(  b, a1, 4 ); \
+}
+
+#define SUBCRUMB( a0, a1, a2, a3 ) \
+{ \
+    __m256i t = a0; \
+    a0 = mm256_xoror( a3, a0, a1 ); \
+    a2 = _mm256_xor_si256( a2, a3 ); \
+    a1 = _mm256_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
+    a3 = mm256_xorand( a2, a3, t ); \
+    a2 = mm256_xorand( a1, a2, a0); \
+    a1 = _mm256_or_si256( a1, a3 ); \
+    a3 = _mm256_xor_si256( a3, a2 ); \
+    t  = _mm256_xor_si256( t, a1 ); \
+    a2 = _mm256_and_si256( a2, a1 ); \
+    a1 = mm256_xnor( a1, a0 ); \
+    a0 = t; \
+}
+
+#else
+
 #define MULT2( a0, a1 ) \
 { \
  __m256i b = _mm256_xor_si256( a0, _mm256_shuffle_epi32( \
-                         _mm256_blend_epi32( a1, m256_zero, 0xee ), 16 ) ); \
+                         _mm256_blend_epi32( a1, m256_zero, 0xee ), 0x10 ) ); \
  a0 = _mm256_alignr_epi8( a1,  b, 4 ); \
  a1 = _mm256_alignr_epi8(  b, a1, 4 ); \
 }
@@ -567,26 +581,14 @@ int luffa_4way_update_close( luffa_4way_context *state,
    a0 = t; \
 }

+#endif
+
 #define MIXWORD( a, b ) \
-{ \
-    __m256i t1, t2; \
-    b  = _mm256_xor_si256( a,b ); \
-    t1 = _mm256_slli_epi32( a,  2 ); \
-    t2 = _mm256_srli_epi32( a, 30 ); \
-    a  = _mm256_or_si256( t1, t2 ); \
-    a  = _mm256_xor_si256( a, b ); \
-    t1 = _mm256_slli_epi32( b, 14 ); \
-    t2 = _mm256_srli_epi32( b, 18 ); \
-    b  = _mm256_or_si256( t1, t2 ); \
-    b  = _mm256_xor_si256( a, b ); \
-    t1 = _mm256_slli_epi32( a, 10 ); \
-    t2 = _mm256_srli_epi32( a, 22 ); \
-    a  = _mm256_or_si256( t1,t2 ); \
-    a  = _mm256_xor_si256( a,b ); \
-    t1 = _mm256_slli_epi32( b,1 ); \
-    t2 = _mm256_srli_epi32( b,31 ); \
-    b  = _mm256_or_si256( t1, t2 ); \
-}
+    b = _mm256_xor_si256( a, b ); \
+    a = _mm256_xor_si256( b, mm256_rol_32( a,  2 ) ); \
+    b = _mm256_xor_si256( a, mm256_rol_32( b, 14 ) ); \
+    a = _mm256_xor_si256( b, mm256_rol_32( a, 10 ) ); \
+    b = mm256_rol_32( b, 1 );

 #define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
    SUBCRUMB( x0, x1, x2, x3 ); \
@@ -598,49 +600,37 @@ int luffa_4way_update_close( luffa_4way_context *state,
    ADD_CONSTANT( x0, x4, c0, c1 );

 #define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
-    a1 = _mm256_shuffle_epi32( a1, 147); \
-    t0 = _mm256_load_si256( &a1 ); \
-    a1 = _mm256_unpacklo_epi32( a1, a0 ); \
+    t0 = _mm256_shuffle_epi32( a1, 147 ); \
+    a1 = _mm256_unpacklo_epi32( t0, a0 ); \
    t0 = _mm256_unpackhi_epi32( t0, a0 ); \
    t1 = _mm256_shuffle_epi32( t0, 78 ); \
    a0 = _mm256_shuffle_epi32( a1, 78 ); \
-    SUBCRUMB( t1, t0, a0, a1 );\
+    SUBCRUMB( t1, t0, a0, a1 ); \
    t0 = _mm256_unpacklo_epi32( t0, t1 ); \
    a1 = _mm256_unpacklo_epi32( a1, a0 ); \
-    a0 = _mm256_load_si256( &a1 ); \
-    a0 = _mm256_unpackhi_epi64( a0, t0 ); \
+    a0 = _mm256_unpackhi_epi64( a1, t0 ); \
    a1 = _mm256_unpacklo_epi64( a1, t0 ); \
    a1 = _mm256_shuffle_epi32( a1, 57 ); \
    MIXWORD( a0, a1 ); \
    ADD_CONSTANT( a0, a1, c0, c1 );

 #define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
-    s1 = _mm256_load_si256(&r3);\
-    q1 = _mm256_load_si256(&p3);\
-    s3 = _mm256_load_si256(&r3);\
-    q3 = _mm256_load_si256(&p3);\
-    s1 = _mm256_unpackhi_epi32(s1,r2);\
-    q1 = _mm256_unpackhi_epi32(q1,p2);\
-    s3 = _mm256_unpacklo_epi32(s3,r2);\
-    q3 = _mm256_unpacklo_epi32(q3,p2);\
-    s0 = _mm256_load_si256(&s1);\
-    q0 = _mm256_load_si256(&q1);\
-    s2 = _mm256_load_si256(&s3);\
-    q2 = _mm256_load_si256(&q3);\
-    r3 = _mm256_load_si256(&r1);\
-    p3 = _mm256_load_si256(&p1);\
-    r1 = _mm256_unpacklo_epi32(r1,r0);\
-    p1 = _mm256_unpacklo_epi32(p1,p0);\
-    r3 = _mm256_unpackhi_epi32(r3,r0);\
-    p3 = _mm256_unpackhi_epi32(p3,p0);\
-    s0 = _mm256_unpackhi_epi64(s0,r3);\
-    q0 = _mm256_unpackhi_epi64(q0,p3);\
-    s1 = _mm256_unpacklo_epi64(s1,r3);\
-    q1 = _mm256_unpacklo_epi64(q1,p3);\
-    s2 = _mm256_unpackhi_epi64(s2,r1);\
-    q2 = _mm256_unpackhi_epi64(q2,p1);\
-    s3 = _mm256_unpacklo_epi64(s3,r1);\
-    q3 = _mm256_unpacklo_epi64(q3,p1);
+    s1 = _mm256_unpackhi_epi32( r3, r2 ); \
+    q1 = _mm256_unpackhi_epi32( p3, p2 ); \
+    s3 = _mm256_unpacklo_epi32( r3, r2 ); \
+    q3 = _mm256_unpacklo_epi32( p3, p2 ); \
+    r3 = _mm256_unpackhi_epi32( r1, r0 ); \
+    r1 = _mm256_unpacklo_epi32( r1, r0 ); \
+    p3 = _mm256_unpackhi_epi32( p1, p0 ); \
+    p1 = _mm256_unpacklo_epi32( p1, p0 ); \
+    s0 = _mm256_unpackhi_epi64( s1, r3 ); \
+    q0 = _mm256_unpackhi_epi64( q1 ,p3 ); \
+    s1 = _mm256_unpacklo_epi64( s1, r3 ); \
+    q1 = _mm256_unpacklo_epi64( q1, p3 ); \
+    s2 = _mm256_unpackhi_epi64( s3, r1 ); \
+    q2 = _mm256_unpackhi_epi64( q3, p1 ); \
+    s3 = _mm256_unpacklo_epi64( s3, r1 ); \
+    q3 = _mm256_unpacklo_epi64( q3, p1 );

 #define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -656,17 +646,10 @@ void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
    __m256i *chainv = state->chainv;
    __m256i x0, x1, x2, x3, x4, x5, x6, x7;

-    t0 = chainv[0];
-    t1 = chainv[1];
-
-    t0 = _mm256_xor_si256( t0, chainv[2] );
-    t1 = _mm256_xor_si256( t1, chainv[3] );
-    t0 = _mm256_xor_si256( t0, chainv[4] );
-    t1 = _mm256_xor_si256( t1, chainv[5] );
-    t0 = _mm256_xor_si256( t0, chainv[6] );
-    t1 = _mm256_xor_si256( t1, chainv[7] );
-    t0 = _mm256_xor_si256( t0, chainv[8] );
-    t1 = _mm256_xor_si256( t1, chainv[9] );
+    t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
+    t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
+    t0 = mm256_xor3( t0, chainv[6], chainv[8] );
+    t1 = mm256_xor3( t1, chainv[7], chainv[9] );

    MULT2( t0, t1 );

@@ -701,11 +684,8 @@ void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
    chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);

    MULT2( chainv[8], chainv[9] );
-    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
-    chainv[9] = _mm256_xor_si256( chainv[9], t1 );
-
-    t0 = chainv[8];
-    t1 = chainv[9];
+    t0 = chainv[8] = _mm256_xor_si256( chainv[8], t0 );
+    t1 = chainv[9] = _mm256_xor_si256( chainv[9], t1 );

    MULT2( chainv[8], chainv[9] );
    chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
@@ -794,29 +774,22 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
 {
    uint32 hash[8*2] __attribute((aligned(64)));
    __m256i* chainv = state->chainv;
-    __m256i t[2];
+    __m256i t0, t1;
    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );
    /*---- blank round with m=0 ----*/
    rnd512_2way( state, NULL );

-    t[0] = chainv[0];
-    t[1] = chainv[1];
+    t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
+    t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
+    t0 = mm256_xor3( t0, chainv[6], chainv[8] );
+    t1 = mm256_xor3( t1, chainv[7], chainv[9] );

-    t[0] = _mm256_xor_si256( t[0], chainv[2] );
-    t[1] = _mm256_xor_si256( t[1], chainv[3] );
-    t[0] = _mm256_xor_si256( t[0], chainv[4] );
-    t[1] = _mm256_xor_si256( t[1], chainv[5] );
-    t[0] = _mm256_xor_si256( t[0], chainv[6] );
-    t[1] = _mm256_xor_si256( t[1], chainv[7] );
-    t[0] = _mm256_xor_si256( t[0], chainv[8] );
-    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+    t0 = _mm256_shuffle_epi32( t0, 27 );
+    t1 = _mm256_shuffle_epi32( t1, 27 );

-    t[0] = _mm256_shuffle_epi32( t[0], 27 );
-    t[1] = _mm256_shuffle_epi32( t[1], 27 );
-
-    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
-    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
+    _mm256_store_si256( (__m256i*)&hash[0], t0 );
+    _mm256_store_si256( (__m256i*)&hash[8], t1 );

    casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
                                  casti_m256i( hash, 0 ), shuff_bswap32 );
@@ -825,22 +798,16 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )

    rnd512_2way( state, NULL );

-    t[0] = chainv[0];
-    t[1] = chainv[1];
-    t[0] = _mm256_xor_si256( t[0], chainv[2] );
-    t[1] = _mm256_xor_si256( t[1], chainv[3] );
-    t[0] = _mm256_xor_si256( t[0], chainv[4] );
-    t[1] = _mm256_xor_si256( t[1], chainv[5] );
-    t[0] = _mm256_xor_si256( t[0], chainv[6] );
-    t[1] = _mm256_xor_si256( t[1], chainv[7] );
-    t[0] = _mm256_xor_si256( t[0], chainv[8] );
-    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+    t0 = mm256_xor3( chainv[0], chainv[2], chainv[4] );
+    t1 = mm256_xor3( chainv[1], chainv[3], chainv[5] );
+    t0 = mm256_xor3( t0, chainv[6], chainv[8] );
+    t1 = mm256_xor3( t1, chainv[7], chainv[9] );
+    
+    t0 = _mm256_shuffle_epi32( t0, 27 );
+    t1 = _mm256_shuffle_epi32( t1, 27 );

-    t[0] = _mm256_shuffle_epi32( t[0], 27 );
-    t[1] = _mm256_shuffle_epi32( t[1], 27 );
-
-    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
-    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
+    _mm256_store_si256( (__m256i*)&hash[0], t0 );
+    _mm256_store_si256( (__m256i*)&hash[8], t1 );

    casti_m256i( b, 2 ) = _mm256_shuffle_epi8( 
                                  casti_m256i( hash, 0 ), shuff_bswap32 );
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -22,20 +22,29 @@
 #include "simd-utils.h"
 #include "luffa_for_sse2.h"

+#define cns(i)  ( ( (__m128i*)CNS_INIT)[i] )
+
+#define ADD_CONSTANT( a, b, c0 ,c1 ) \
+    a = _mm_xor_si128( a, c0 ); \
+    b = _mm_xor_si128( b, c1 ); \
+
 #if defined(__AVX512VL__)
+//TODO enable for AVX10_512 AVX10_256

 #define MULT2( a0, a1 ) \
 { \
-  __m128i b = _mm_xor_si128( a0, _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
-  a0 = _mm_alignr_epi32( a1, b, 1 ); \
-  a1 = _mm_alignr_epi32( b, a1, 1 ); \
+  __m128i b = _mm_xor_si128( a0, \
+                      _mm_maskz_shuffle_epi32( 0xb, a1, 0x10 ) ); \
+  a0 = _mm_alignr_epi8( a1, b, 4 ); \
+  a1 = _mm_alignr_epi8( b, a1, 4 ); \
 }

 #elif defined(__SSE4_1__)

 #define MULT2( a0, a1 ) do \
 { \
-  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
+  __m128i b = _mm_xor_si128( a0, \
+                      _mm_shuffle_epi32( mm128_mask_32( a1, 0xe ), 0x10 ) ); \
  a0 = _mm_alignr_epi8( a1, b, 4 ); \
  a1 = _mm_alignr_epi8( b, a1, 4 ); \
 } while(0)
@@ -44,79 +53,88 @@

 #define MULT2( a0, a1 ) do \
 { \
-  __m128i b = _mm_xor_si128( a0, _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
-  a0 = _mm_or_si128( _mm_srli_si128( b, 4 ), _mm_slli_si128( a1, 12 ) ); \
-  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128( b, 12 ) ); \
+  __m128i b = _mm_xor_si128( a0, \
+                      _mm_shuffle_epi32( _mm_and_si128( a1, MASK ), 0x10 ) ); \
+  a0 = _mm_or_si128( _mm_srli_si128(  b, 4 ), _mm_slli_si128( a1, 12 ) ); \
+  a1 = _mm_or_si128( _mm_srli_si128( a1, 4 ), _mm_slli_si128(  b, 12 ) ); \
 } while(0)

 #endif

-#define STEP_PART(x,c,t)\
-    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
-    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
-    MIXWORD(*x,*(x+4),*t,*(t+1));\
-    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
-    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
-    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
-    ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
+#if defined(__AVX512VL__)
+//TODO enable for AVX10_512 AVX10_256

-#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
-    a1 = _mm_shuffle_epi32(a1,147);\
-    t0 = _mm_load_si128(&a1);\
-    a1 = _mm_unpacklo_epi32(a1,a0);\
-    t0 = _mm_unpackhi_epi32(t0,a0);\
-    t1 = _mm_shuffle_epi32(t0,78);\
-    a0 = _mm_shuffle_epi32(a1,78);\
-    SUBCRUMB(t1,t0,a0,a1,tmp0);\
-    t0 = _mm_unpacklo_epi32(t0,t1);\
-    a1 = _mm_unpacklo_epi32(a1,a0);\
-    a0 = _mm_load_si128(&a1);\
-    a0 = _mm_unpackhi_epi64(a0,t0);\
-    a1 = _mm_unpacklo_epi64(a1,t0);\
-    a1 = _mm_shuffle_epi32(a1,57);\
-    MIXWORD(a0,a1,tmp0,tmp1);\
-    ADD_CONSTANT(a0,a1,c0,c1);
+#define SUBCRUMB( a0, a1, a2, a3 ) \
+{ \
+    __m128i t = a0; \
+    a0 = mm128_xoror( a3, a0, a1 ); \
+    a2 = _mm_xor_si128( a2, a3 ); \
+    a1 = _mm_ternarylogic_epi64( a1, a3, t, 0x87 ); /* a1 xnor (a3 & t) */ \
+    a3 = mm128_xorand( a2, a3, t ); \
+    a2 = mm128_xorand( a1, a2, a0 ); \
+    a1 = _mm_or_si128( a1, a3 ); \
+    a3 = _mm_xor_si128( a3, a2 ); \
+    t  = _mm_xor_si128( t, a1 ); \
+    a2 = _mm_and_si128( a2, a1 ); \
+    a1 = mm128_xnor( a1, a0 ); \
+    a0 = t; \
+}

-#define SUBCRUMB(a0,a1,a2,a3,t)\
-    t  = _mm_load_si128(&a0);\
-    a0 = _mm_or_si128(a0,a1);\
-    a2 = _mm_xor_si128(a2,a3);\
-    a1 = mm128_not( a1 );\
-    a0 = _mm_xor_si128(a0,a3);\
-    a3 = _mm_and_si128(a3,t);\
-    a1 = _mm_xor_si128(a1,a3);\
-    a3 = _mm_xor_si128(a3,a2);\
-    a2 = _mm_and_si128(a2,a0);\
-    a0 = mm128_not( a0 );\
-    a2 = _mm_xor_si128(a2,a1);\
-    a1 = _mm_or_si128(a1,a3);\
-    t  = _mm_xor_si128(t,a1);\
-    a3 = _mm_xor_si128(a3,a2);\
-    a2 = _mm_and_si128(a2,a1);\
-    a1 = _mm_xor_si128(a1,a0);\
-    a0 = _mm_load_si128(&t);\
+#else

-#define MIXWORD(a,b,t1,t2)\
-    b = _mm_xor_si128(a,b);\
-    t1 = _mm_slli_epi32(a,2);\
-    t2 = _mm_srli_epi32(a,30);\
-    a = _mm_or_si128(t1,t2);\
-    a = _mm_xor_si128(a,b);\
-    t1 = _mm_slli_epi32(b,14);\
-    t2 = _mm_srli_epi32(b,18);\
-    b = _mm_or_si128(t1,t2);\
-    b = _mm_xor_si128(a,b);\
-    t1 = _mm_slli_epi32(a,10);\
-    t2 = _mm_srli_epi32(a,22);\
-    a = _mm_or_si128(t1,t2);\
-    a = _mm_xor_si128(a,b);\
-    t1 = _mm_slli_epi32(b,1);\
-    t2 = _mm_srli_epi32(b,31);\
-    b = _mm_or_si128(t1,t2);
+#define SUBCRUMB( a0, a1, a2, a3 ) \
+{ \
+    __m128i t = a0; \
+    a0 = _mm_or_si128( a0, a1 ); \
+    a2 = _mm_xor_si128( a2, a3 ); \
+    a1 = mm128_not( a1 ); \
+    a0 = _mm_xor_si128( a0, a3 ); \
+    a3 = _mm_and_si128( a3, t ); \
+    a1 = _mm_xor_si128( a1, a3 ); \
+    a3 = _mm_xor_si128( a3, a2 ); \
+    a2 = _mm_and_si128( a2, a0 ); \
+    a0 = mm128_not( a0 ); \
+    a2 = _mm_xor_si128( a2, a1 ); \
+    a1 = _mm_or_si128(  a1, a3 ); \
+    t  = _mm_xor_si128( t , a1 ); \
+    a3 = _mm_xor_si128( a3, a2 ); \
+    a2 = _mm_and_si128( a2, a1 ); \
+    a1 = _mm_xor_si128( a1, a0 ); \
+    a0 = t; \
+}

-#define ADD_CONSTANT(a,b,c0,c1)\
-    a = _mm_xor_si128(a,c0);\
-    b = _mm_xor_si128(b,c1);\
+#endif
+
+#define MIXWORD( a, b ) \
+    b = _mm_xor_si128( a, b ); \
+    a = _mm_xor_si128( b, mm128_rol_32( a, 2 ) ); \
+    b = _mm_xor_si128( a, mm128_rol_32( b, 14 ) ); \
+    a = _mm_xor_si128( b, mm128_rol_32( a, 10 ) ); \
+    b = mm128_rol_32( b, 1 );
+
+#define STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, c0, c1 ) \
+    SUBCRUMB( x0, x1, x2, x3 ); \
+    SUBCRUMB( x5, x6, x7, x4 ); \
+    MIXWORD( x0, x4 ); \
+    MIXWORD( x1, x5 ); \
+    MIXWORD( x2, x6 ); \
+    MIXWORD( x3, x7 ); \
+    ADD_CONSTANT( x0, x4, c0, c1 );
+
+#define STEP_PART2( a0, a1, t0, t1, c0, c1 ) \
+    t0 = _mm_shuffle_epi32( a1, 147 ); \
+    a1 = _mm_unpacklo_epi32( t0, a0 ); \
+    t0 = _mm_unpackhi_epi32( t0, a0 ); \
+    t1 = _mm_shuffle_epi32( t0, 78 ); \
+    a0 = _mm_shuffle_epi32( a1, 78 ); \
+    SUBCRUMB( t1, t0, a0, a1 ); \
+    t0 = _mm_unpacklo_epi32( t0, t1 ); \
+    a1 = _mm_unpacklo_epi32( a1, a0 ); \
+    a0 = _mm_unpackhi_epi64( a1, t0 ); \
+    a1 = _mm_unpacklo_epi64( a1, t0 ); \
+    a1 = _mm_shuffle_epi32( a1, 57 ); \
+    MIXWORD( a0, a1 ); \
+    ADD_CONSTANT( a0, a1, c0, c1 );

 #define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
    s2 = _mm_load_si128(&r1);\
@@ -177,32 +195,22 @@
    q1 = _mm_load_si128(&p1);\

 #define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
-    s1 = _mm_load_si128(&r3);\
-    q1 = _mm_load_si128(&p3);\
-    s3 = _mm_load_si128(&r3);\
-    q3 = _mm_load_si128(&p3);\
-    s1 = _mm_unpackhi_epi32(s1,r2);\
-    q1 = _mm_unpackhi_epi32(q1,p2);\
-    s3 = _mm_unpacklo_epi32(s3,r2);\
-    q3 = _mm_unpacklo_epi32(q3,p2);\
-    s0 = _mm_load_si128(&s1);\
-    q0 = _mm_load_si128(&q1);\
-    s2 = _mm_load_si128(&s3);\
-    q2 = _mm_load_si128(&q3);\
-    r3 = _mm_load_si128(&r1);\
-    p3 = _mm_load_si128(&p1);\
-    r1 = _mm_unpacklo_epi32(r1,r0);\
-    p1 = _mm_unpacklo_epi32(p1,p0);\
-    r3 = _mm_unpackhi_epi32(r3,r0);\
-    p3 = _mm_unpackhi_epi32(p3,p0);\
-    s0 = _mm_unpackhi_epi64(s0,r3);\
-    q0 = _mm_unpackhi_epi64(q0,p3);\
-    s1 = _mm_unpacklo_epi64(s1,r3);\
-    q1 = _mm_unpacklo_epi64(q1,p3);\
-    s2 = _mm_unpackhi_epi64(s2,r1);\
-    q2 = _mm_unpackhi_epi64(q2,p1);\
-    s3 = _mm_unpacklo_epi64(s3,r1);\
-    q3 = _mm_unpacklo_epi64(q3,p1);
+    s1 = _mm_unpackhi_epi32( r3, r2 ); \
+    q1 = _mm_unpackhi_epi32( p3, p2 ); \
+    s3 = _mm_unpacklo_epi32( r3, r2 ); \
+    q3 = _mm_unpacklo_epi32( p3, p2 ); \
+    r3 = _mm_unpackhi_epi32( r1, r0 ); \
+    r1 = _mm_unpacklo_epi32( r1, r0 ); \
+    p3 = _mm_unpackhi_epi32( p1, p0 ); \
+    p1 = _mm_unpacklo_epi32( p1, p0 ); \
+    s0 = _mm_unpackhi_epi64( s1, r3 ); \
+    q0 = _mm_unpackhi_epi64( q1 ,p3 ); \
+    s1 = _mm_unpacklo_epi64( s1, r3 ); \
+    q1 = _mm_unpacklo_epi64( q1, p3 ); \
+    s2 = _mm_unpackhi_epi64( s3, r1 ); \
+    q2 = _mm_unpackhi_epi64( q3, p1 ); \
+    s3 = _mm_unpacklo_epi64( s3, r1 ); \
+    q3 = _mm_unpacklo_epi64( q3, p1 );

 #define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
@@ -306,8 +314,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
      // remaining data bytes
      casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) );
      // padding of partial block
-      casti_m128i( state->buffer, 1 ) =
-            _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+      casti_m128i( state->buffer, 1 ) =  _mm_set_epi32( 0, 0, 0, 0x80000000 );
    }

    return SUCCESS;
@@ -325,8 +332,7 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval)
    else
    {
      // empty pad block, constant data
-     rnd512( state, _mm_setzero_si128(),
-                       _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
+     rnd512( state, _mm_setzero_si128(), _mm_set_epi32( 0, 0, 0, 0x80000000 ) );
    }

    finalization512(state, (uint32*) hashval);
@@ -423,163 +429,119 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,

 static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
 {
-    __m128i t[2];
+    __m128i t0, t1;
    __m128i *chainv = state->chainv;
-    __m128i tmp[2];
-    __m128i x[8];
+    __m128i x0, x1, x2, x3, x4, x5, x6, x7; 

-    t[0] = chainv[0];
-    t[1] = chainv[1];
+    t0 = mm128_xor3( chainv[0], chainv[2], chainv[4] );
+    t1 = mm128_xor3( chainv[1], chainv[3], chainv[5] );
+    t0 = mm128_xor3( t0, chainv[6], chainv[8] );
+    t1 = mm128_xor3( t1, chainv[7], chainv[9] );

-    t[0] = _mm_xor_si128( t[0], chainv[2] );
-    t[1] = _mm_xor_si128( t[1], chainv[3] );
-    t[0] = _mm_xor_si128( t[0], chainv[4] );
-    t[1] = _mm_xor_si128( t[1], chainv[5] );
-    t[0] = _mm_xor_si128( t[0], chainv[6] );
-    t[1] = _mm_xor_si128( t[1], chainv[7] );
-    t[0] = _mm_xor_si128( t[0], chainv[8] );
-    t[1] = _mm_xor_si128( t[1], chainv[9] );
-
-    MULT2( t[0], t[1] );
+    MULT2( t0, t1 );

    msg0 = _mm_shuffle_epi32( msg0, 27 );
    msg1 = _mm_shuffle_epi32( msg1, 27 );

-    chainv[0] = _mm_xor_si128( chainv[0], t[0] );
-    chainv[1] = _mm_xor_si128( chainv[1], t[1] );
-    chainv[2] = _mm_xor_si128( chainv[2], t[0] );
-    chainv[3] = _mm_xor_si128( chainv[3], t[1] );
-    chainv[4] = _mm_xor_si128( chainv[4], t[0] );
-    chainv[5] = _mm_xor_si128( chainv[5], t[1] );
-    chainv[6] = _mm_xor_si128( chainv[6], t[0] );
-    chainv[7] = _mm_xor_si128( chainv[7], t[1] );
-    chainv[8] = _mm_xor_si128( chainv[8], t[0] );
-    chainv[9] = _mm_xor_si128( chainv[9], t[1] );
+    chainv[0] = _mm_xor_si128( chainv[0], t0 );
+    chainv[1] = _mm_xor_si128( chainv[1], t1 );
+    chainv[2] = _mm_xor_si128( chainv[2], t0 );
+    chainv[3] = _mm_xor_si128( chainv[3], t1 );
+    chainv[4] = _mm_xor_si128( chainv[4], t0 );
+    chainv[5] = _mm_xor_si128( chainv[5], t1 );
+    chainv[6] = _mm_xor_si128( chainv[6], t0 );
+    chainv[7] = _mm_xor_si128( chainv[7], t1 );
+    chainv[8] = _mm_xor_si128( chainv[8], t0 );
+    chainv[9] = _mm_xor_si128( chainv[9], t1 );

-    t[0] = chainv[0];
-    t[1] = chainv[1];
+    t0 = chainv[0];
+    t1 = chainv[1];

    MULT2( chainv[0], chainv[1]);
-
    chainv[0] = _mm_xor_si128( chainv[0], chainv[2] );
    chainv[1] = _mm_xor_si128( chainv[1], chainv[3] );

    MULT2( chainv[2], chainv[3]);
-
    chainv[2] = _mm_xor_si128(chainv[2], chainv[4]);
    chainv[3] = _mm_xor_si128(chainv[3], chainv[5]);

    MULT2( chainv[4], chainv[5]);
-
    chainv[4] = _mm_xor_si128(chainv[4], chainv[6]);
    chainv[5] = _mm_xor_si128(chainv[5], chainv[7]);

    MULT2( chainv[6], chainv[7]);
-
    chainv[6] = _mm_xor_si128(chainv[6], chainv[8]);
    chainv[7] = _mm_xor_si128(chainv[7], chainv[9]);

    MULT2( chainv[8], chainv[9]);
-
-    chainv[8] = _mm_xor_si128( chainv[8], t[0] );
-    chainv[9] = _mm_xor_si128( chainv[9], t[1] );
-
-    t[0] = chainv[8];
-    t[1] = chainv[9];
+    t0 = chainv[8] = _mm_xor_si128( chainv[8], t0 );
+    t1 = chainv[9] = _mm_xor_si128( chainv[9], t1 );

    MULT2( chainv[8], chainv[9]);
-
    chainv[8] = _mm_xor_si128( chainv[8], chainv[6] );
    chainv[9] = _mm_xor_si128( chainv[9], chainv[7] );

    MULT2( chainv[6], chainv[7]);
-
    chainv[6] = _mm_xor_si128( chainv[6], chainv[4] );
    chainv[7] = _mm_xor_si128( chainv[7], chainv[5] );

    MULT2( chainv[4], chainv[5]);
-
    chainv[4] = _mm_xor_si128( chainv[4], chainv[2] );
    chainv[5] = _mm_xor_si128( chainv[5], chainv[3] );

    MULT2( chainv[2], chainv[3] );
-
    chainv[2] = _mm_xor_si128( chainv[2], chainv[0] );
    chainv[3] = _mm_xor_si128( chainv[3], chainv[1] );

    MULT2( chainv[0], chainv[1] );
-
-    chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg0 );
-    chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg1 );
+    chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t0 ), msg0 );
+    chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t1 ), msg1 );

    MULT2( msg0, msg1);
-
    chainv[2] = _mm_xor_si128( chainv[2], msg0 );
    chainv[3] = _mm_xor_si128( chainv[3], msg1 );

    MULT2( msg0, msg1);
-
    chainv[4] = _mm_xor_si128( chainv[4], msg0 );
    chainv[5] = _mm_xor_si128( chainv[5], msg1 );

    MULT2( msg0, msg1);
-
    chainv[6] = _mm_xor_si128( chainv[6], msg0 );
    chainv[7] = _mm_xor_si128( chainv[7], msg1 );

    MULT2( msg0, msg1);
-
    chainv[8] = _mm_xor_si128( chainv[8], msg0 );
    chainv[9] = _mm_xor_si128( chainv[9], msg1 );

    MULT2( msg0, msg1);
+    chainv[3] = mm128_rol_32( chainv[3], 1 );    
+    chainv[5] = mm128_rol_32( chainv[5], 2 );
+    chainv[7] = mm128_rol_32( chainv[7], 3 );
+    chainv[9] = mm128_rol_32( chainv[9], 4 );
+    
+    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], x0, x1, x2, x3,
+                chainv[1], chainv[3], chainv[5], chainv[7], x4, x5, x6, x7 );

-    chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1),
-                              _mm_srli_epi32(chainv[3], 31) );
-    chainv[5] = _mm_or_si128( _mm_slli_epi32(chainv[5], 2),
-                              _mm_srli_epi32(chainv[5], 30) );
-    chainv[7] = _mm_or_si128( _mm_slli_epi32(chainv[7], 3),
-                              _mm_srli_epi32(chainv[7], 29) );
-    chainv[9] = _mm_or_si128( _mm_slli_epi32(chainv[9], 4),
-                              _mm_srli_epi32(chainv[9], 28) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 0), cns( 1) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 2), cns( 3) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 4), cns( 5) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 6), cns( 7) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns( 8), cns( 9) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(10), cns(11) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(12), cns(13) );
+    STEP_PART( x0, x1, x2, x3, x4, x5, x6, x7, cns(14), cns(15) );
+    
+    MIXTON1024( x0, x1, x2, x3, chainv[0], chainv[2], chainv[4], chainv[6],
+                x4, x5, x6, x7, chainv[1], chainv[3], chainv[5], chainv[7]);

-
-    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
-                x[0], x[1], x[2], x[3],
-                chainv[1],chainv[3],chainv[5],chainv[7],
-                x[4], x[5], x[6], x[7] );
-
-    STEP_PART( &x[0], &CNS128[ 0], &tmp[0] );
-    STEP_PART( &x[0], &CNS128[ 2], &tmp[0] );
-    STEP_PART( &x[0], &CNS128[ 4], &tmp[0] );
-    STEP_PART( &x[0], &CNS128[ 6], &tmp[0] );
-    STEP_PART( &x[0], &CNS128[ 8], &tmp[0] );
-    STEP_PART( &x[0], &CNS128[10], &tmp[0] );
-    STEP_PART( &x[0], &CNS128[12], &tmp[0] );
-    STEP_PART( &x[0], &CNS128[14], &tmp[0] );
-
-    MIXTON1024( x[0], x[1], x[2], x[3],
-                chainv[0], chainv[2], chainv[4],chainv[6],
-                x[4], x[5], x[6], x[7],
-                chainv[1],chainv[3],chainv[5],chainv[7]);
-
-    /* Process last 256-bit block */
-    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[16], CNS128[17],
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[18], CNS128[19],
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[20], CNS128[21],
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[22], CNS128[23],
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[24], CNS128[25],
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[26], CNS128[27],
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[28], CNS128[29],
-                tmp[0], tmp[1] );
-    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS128[30], CNS128[31],
-                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(16), cns(17) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(18), cns(19) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(20), cns(21) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(22), cns(23) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(24), cns(25) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(26), cns(27) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(28), cns(29) );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, cns(30), cns(31) );
 }


@@ -588,51 +550,6 @@ static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 )
 /* state: hash context    */
 /* b[8]: hash values      */

-#if defined (__AVX2__)
-
-static void finalization512( hashState_luffa *state, uint32 *b )
-{
-    uint32   hash[8] __attribute((aligned(64)));
-    __m256i* chainv = (__m256i*)state->chainv;
-    __m256i  t;
-    const __m128i zero = m128_zero;
-    const __m256i shuff_bswap32 = _mm256_set_epi64x( 0x1c1d1e1f18191a1b,
-                                                     0x1415161710111213,
-                                                     0x0c0d0e0f08090a0b,
-                                                     0x0405060700010203 );
-
-    rnd512( state, zero, zero );
-
-    t = chainv[0];
-    t = _mm256_xor_si256( t, chainv[1] );
-    t = _mm256_xor_si256( t, chainv[2] );
-    t = _mm256_xor_si256( t, chainv[3] );
-    t = _mm256_xor_si256( t, chainv[4] );
-
-    t = _mm256_shuffle_epi32( t, 27 );
-
-    _mm256_store_si256( (__m256i*)hash, t );
-
-    casti_m256i( b, 0 ) = _mm256_shuffle_epi8(
-                                 casti_m256i( hash, 0 ), shuff_bswap32 );
-
-    rnd512( state, zero, zero );
-
-    t = chainv[0];
-    t = _mm256_xor_si256( t, chainv[1] );
-    t = _mm256_xor_si256( t, chainv[2] );
-    t = _mm256_xor_si256( t, chainv[3] );
-    t = _mm256_xor_si256( t, chainv[4] );
-    t = _mm256_shuffle_epi32( t, 27 );
-
-    _mm256_store_si256( (__m256i*)hash, t );
-
-    casti_m256i( b, 1 ) = _mm256_shuffle_epi8( 
-                                 casti_m256i( hash, 0 ), shuff_bswap32 );
-}
-
-#else
-
 static void finalization512( hashState_luffa *state, uint32 *b )
 {
    uint32 hash[8] __attribute((aligned(64)));
@@ -685,6 +602,5 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
    casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
 }
-#endif

 /***************************************************/
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -212,7 +212,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
   const uint32_t last_nonce = max_nonce - 16;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m512i sixteen = m512_const1_32( 16 );
+   const __m512i sixteen = _mm512_set1_epi32( 16 );

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

@@ -398,7 +398,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;  
   const bool bench = opt_benchmark;
-   const __m256i eight = m256_const1_32( 8 );
+   const __m256i eight = _mm256_set1_epi32( 8 );

   // Prehash first block
   blake256_transform_le( phash, pdata, 512, 0 );
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -203,7 +203,7 @@ int scanhash_lyra2rev2_16way( struct work *work, const uint32_t max_nonce,
             submit_solution( work, lane_hash, mythr );
         }
      }
-      *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
+      *noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
      n += 16;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -345,7 +345,7 @@ int scanhash_lyra2rev2_8way( struct work *work, const uint32_t max_nonce,
             submit_solution( work, lane_hash, mythr );
         }
      }
-      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+      *noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
      n += 8;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -287,7 +287,7 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
             submit_solution( work, lane_hash, mythr );
         }
      }
-      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+      *noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
      n += 8;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -389,7 +389,7 @@ int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
              submit_solution( work, lane_hash, mythr );
 	      }
      }
-      *noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
+      *noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
      n += 4;
   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
   pdata[19] = n;
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -103,7 +103,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
   const uint32_t last_nonce = max_nonce - 16;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m512i sixteen = m512_const1_32( 16 );
+   const __m512i sixteen = _mm512_set1_epi32( 16 );

   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

@@ -213,7 +213,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m256i eight = m256_const1_32( 8 );
+   const __m256i eight = _mm256_set1_epi32( 8 );

   // Prehash first block
   blake256_transform_le( phash, pdata, 512, 0 );
@@ -328,7 +328,7 @@ int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
           submit_solution( work, lane_hash, mythr );
        }
      }
-      *noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
+      *noncev = _mm_add_epi32( *noncev, _mm_set1_epi32( 4 ) );
      n += 4;
   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -62,10 +62,10 @@ inline void initState( uint64_t State[/*16*/] )
  state[1] = zero;
  state[2] = zero;
  state[3] = zero;
-  state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
-  state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
-  state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
-  state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
+  state[4] = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state[5] = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
+  state[6] = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state[7] = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );

 #else
    //First 512 bis are zeros
@@ -299,10 +299,10 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
  state1 =
  state2 =
  state3 = m128_zero;
-  state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
-  state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
-  state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
-  state7 = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
+  state4 = _mm_set_epi64x( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state5 = _mm_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
+  state6 = _mm_set_epi64x( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state7 = _mm_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );

  for ( int i = 0; i < nBlocks; i++ )
  { 
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -43,27 +43,29 @@ static const uint64_t blake2b_IV[8] =
  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };

-/*Blake2b's rotation*/
-static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
-    return ( w >> c ) | ( w << ( 64 - c ) );
-}
-
-// serial data is only 32 bytes so AVX2 is the limit for that dimension.
-// However, 2 way parallel looks trivial to code for AVX512 except for
-// a data dependency with rowa.
-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define G2W_4X64(a,b,c,d) \
   a = _mm512_add_epi64( a, b ); \
-   d = mm512_ror_64( _mm512_xor_si512( d, a ), 32 ); \
+   d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 32 ); \
   c = _mm512_add_epi64( c, d ); \
-   b = mm512_ror_64( _mm512_xor_si512( b, c ), 24 ); \
+   b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 24 ); \
   a = _mm512_add_epi64( a, b ); \
-   d = mm512_ror_64( _mm512_xor_si512( d, a ), 16 ); \
+   d = _mm512_ror_epi64( _mm512_xor_si512( d, a ), 16 ); \
   c = _mm512_add_epi64( c, d ); \
-   b = mm512_ror_64( _mm512_xor_si512( b, c ), 63 );
+   b = _mm512_ror_epi64( _mm512_xor_si512( b, c ), 63 );

+#define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s0 = mm512_shufll256_64( s0 ); \
+   s3 = mm512_swap256_128( s3); \
+   s2 = mm512_shuflr256_64( s2 ); \
+   G2W_4X64( s0, s1, s2, s3 ); \
+   s0 = mm512_shuflr256_64( s0 ); \
+   s3 = mm512_swap256_128( s3 ); \
+   s2 = mm512_shufll256_64( s2 ); 
+
+/*
 #define LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
   G2W_4X64( s0, s1, s2, s3 ); \
   s3 = mm512_shufll256_64( s3 ); \
@@ -73,6 +75,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   s3 = mm512_shuflr256_64( s3 ); \
   s1 = mm512_shufll256_64( s1 ); \
   s2 = mm512_swap256_128( s2 ); 
+*/

 #define LYRA_12_ROUNDS_2WAY_AVX512( s0, s1, s2, s3 ) \
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
@@ -88,13 +91,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 ) \
   LYRA_ROUND_2WAY_AVX512( s0, s1, s2, s3 )

-
 #endif  // AVX512

-#if defined __AVX2__
+#if defined(__AVX2__)

-// process 4 columns in parallel
-// returns void, updates all args
 #define G_4X64(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \
   d = mm256_swap64_32( _mm256_xor_si256( d, a ) ); \
@@ -105,6 +105,18 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );

+// Pivoting the diagonal shuffle about s1 instead of s0 reduces latency.
+#define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
+   G_4X64( s0, s1, s2, s3 ); \
+   s0 = mm256_shufll_64( s0 ); \
+   s3 = mm256_swap_128( s3); \
+   s2 = mm256_shuflr_64( s2 ); \
+   G_4X64( s0, s1, s2, s3 ); \
+   s0 = mm256_shuflr_64( s0 ); \
+   s3 = mm256_swap_128( s3 ); \
+   s2 = mm256_shufll_64( s2 );
+
+/*
 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   G_4X64( s0, s1, s2, s3 ); \
   s3 = mm256_shufll_64( s3 ); \
@@ -114,6 +126,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   s3 = mm256_shuflr_64( s3 ); \
   s1 = mm256_shufll_64( s1 ); \
   s2 = mm256_swap_128( s2 );
+*/

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -182,8 +195,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #endif // AVX2 else SSE2

-// Scalar
-//Blake2b's G function
+/*
+// Scalar, not used.
+
+static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
+    return ( w >> c ) | ( w << ( 64 - c ) );
+}
+
 #define G(r,i,a,b,c,d) \
  do { \
    a = a + b; \
@@ -196,8 +214,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    b = rotr64(b ^ c, 63); \
  } while(0)

-
-/*One Round of the Blake2b's compression function*/
 #define ROUND_LYRA(r)  \
    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -207,6 +223,7 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
+*/

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -51,7 +51,7 @@ void anime_8way_hash( void *state, const void *input )
    __m512i* vhA = (__m512i*)vhashA;
    __m512i* vhB = (__m512i*)vhashB;
    __m512i* vhC = (__m512i*)vhashC;
-    const __m512i bit3_mask = m512_const1_64( 8 );
+    const __m512i bit3_mask = _mm512_set1_epi64( 8 );
    __mmask8 vh_mask;
    anime_8way_context_overlay ctx __attribute__ ((aligned (64)));

@@ -209,7 +209,7 @@ int scanhash_anime_8way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm512_add_epi32( *noncev,
-                                   m512_const1_64( 0x0000000800000000 ) );
+                                   _mm512_set1_epi64( 0x0000000800000000 ) );
       n += 8;
    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
    pdata[19] = n;
@@ -248,7 +248,7 @@ void anime_4way_hash( void *state, const void *input )
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    int h_mask;
-    const __m256i bit3_mask = m256_const1_64( 8 );
+    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
    const __m256i zero = _mm256_setzero_si256();
    anime_4way_context_overlay ctx __attribute__ ((aligned (64)));

@@ -388,7 +388,7 @@ int scanhash_anime_4way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm256_add_epi32( *noncev,
-                                   m256_const1_64( 0x0000000400000000 ) );
+                                   _mm256_set1_epi64x( 0x0000000400000000 ) );
       n += 4;
    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
    pdata[19] = n;
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -75,7 +75,7 @@ extern void hmq1725_8way_hash(void *state, const void *input)
   uint32_t hash7 [16]    __attribute__ ((aligned (32)));
   hmq1725_8way_context_overlay ctx __attribute__ ((aligned (64)));
   __mmask8 vh_mask;
-   const __m512i vmask = m512_const1_64( 24 );
+   const __m512i vmask = _mm512_set1_epi64( 24 );
   const uint32_t mask = 24;
   __m512i* vh  = (__m512i*)vhash;
   __m512i* vhA = (__m512i*)vhashA;
@@ -593,7 +593,7 @@ int scanhash_hmq1725_8way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm512_add_epi32( *noncev,
-                                   m512_const1_64( 0x0000000800000000 ) );
+                                   _mm512_set1_epi64( 0x0000000800000000 ) );
       n += 8;
    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

@@ -647,7 +647,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
   hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
   __m256i vh_mask;     
   int h_mask;
-   const __m256i vmask = m256_const1_64( 24 );
+   const __m256i vmask = _mm256_set1_epi64x( 24 );
   const uint32_t mask = 24;
   __m256i* vh  = (__m256i*)vhash;
   __m256i* vhA = (__m256i*)vhashA;
@@ -1041,7 +1041,7 @@ int scanhash_hmq1725_4way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm256_add_epi32( *noncev,
-                                   m256_const1_64( 0x0000000400000000 ) );
+                                   _mm256_set1_epi64x( 0x0000000400000000 ) );
       n += 4;
    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
    pdata[19] = n;
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -67,7 +67,7 @@ void quark_8way_hash( void *state, const void *input )
    __mmask8 vh_mask;
    quark_8way_ctx_holder ctx;
    const uint32_t mask = 8;
-    const __m512i bit3_mask = m512_const1_64( mask );
+    const __m512i bit3_mask = _mm512_set1_epi64( mask );

    memcpy( &ctx, &quark_8way_ctx, sizeof(quark_8way_ctx) );

@@ -224,7 +224,7 @@ int scanhash_quark_8way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
       n += 8;
    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

@@ -271,7 +271,7 @@ void quark_4way_hash( void *state, const void *input )
    __m256i vh_mask;
    int h_mask;
    quark_4way_ctx_holder ctx;
-    const __m256i bit3_mask = m256_const1_64( 8 );
+    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
    const __m256i zero = _mm256_setzero_si256();

    memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
@@ -397,7 +397,7 @@ int scanhash_quark_4way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
       n += 4;
    } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );

--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -47,7 +47,7 @@ static const uint32_t IV[5] =
 do{ \
   a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
                _mm_add_epi32( a, f( b ,c, d ) ), r ), \
-                                 m128_const1_64( k ) ), s ), e ); \
+                                 _mm_set1_epi64x( k ) ), s ), e ); \
   c = mm128_rol_32( c, 10 );\
 } while (0)

@@ -251,11 +251,11 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )

 void ripemd160_4way_init( ripemd160_4way_context *sc )
 {
-   sc->val[0] = m128_const1_64( 0x6745230167452301 );
-   sc->val[1] = m128_const1_64( 0xEFCDAB89EFCDAB89 );
-   sc->val[2] = m128_const1_64( 0x98BADCFE98BADCFE );
-   sc->val[3] = m128_const1_64( 0x1032547610325476 );
-   sc->val[4] = m128_const1_64( 0xC3D2E1F0C3D2E1F0 );
+   sc->val[0] = _mm_set1_epi64x( 0x6745230167452301 );
+   sc->val[1] = _mm_set1_epi64x( 0xEFCDAB89EFCDAB89 );
+   sc->val[2] = _mm_set1_epi64x( 0x98BADCFE98BADCFE );
+   sc->val[3] = _mm_set1_epi64x( 0x1032547610325476 );
+   sc->val[4] = _mm_set1_epi64x( 0xC3D2E1F0C3D2E1F0 );
   sc->count_high = sc->count_low = 0;
 }

@@ -347,7 +347,7 @@ void ripemd160_4way_close( ripemd160_4way_context  *sc, void *dst )
 do{ \
   a = _mm256_add_epi32( mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( \
                _mm256_add_epi32( a, f( b ,c, d ) ), r ), \
-                                 m256_const1_64( k ) ), s ), e ); \
+                                 _mm256_set1_epi64x( k ) ), s ), e ); \
   c = mm256_rol_32( c, 10 );\
 } while (0)
    
@@ -552,11 +552,11 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )

 void ripemd160_8way_init( ripemd160_8way_context *sc )
 {
-   sc->val[0] = m256_const1_64( 0x6745230167452301 );
-   sc->val[1] = m256_const1_64( 0xEFCDAB89EFCDAB89 );
-   sc->val[2] = m256_const1_64( 0x98BADCFE98BADCFE );
-   sc->val[3] = m256_const1_64( 0x1032547610325476 );
-   sc->val[4] = m256_const1_64( 0xC3D2E1F0C3D2E1F0 );
+   sc->val[0] = _mm256_set1_epi64x( 0x6745230167452301 );
+   sc->val[1] = _mm256_set1_epi64x( 0xEFCDAB89EFCDAB89 );
+   sc->val[2] = _mm256_set1_epi64x( 0x98BADCFE98BADCFE );
+   sc->val[3] = _mm256_set1_epi64x( 0x1032547610325476 );
+   sc->val[4] = _mm256_set1_epi64x( 0xC3D2E1F0C3D2E1F0 );
   sc->count_high = sc->count_low = 0;
 }

@@ -649,7 +649,7 @@ void ripemd160_8way_close( ripemd160_8way_context  *sc, void *dst )
 do{ \
   a = _mm512_add_epi32( mm512_rol_32( _mm512_add_epi32( _mm512_add_epi32( \
                _mm512_add_epi32( a, f( b ,c, d ) ), r ), \
-                                 m512_const1_64( k ) ), s ), e ); \
+                                 _mm512_set1_epi64( k ) ), s ), e ); \
   c = mm512_rol_32( c, 10 );\
 } while (0)

@@ -853,11 +853,11 @@ static void ripemd160_16way_round( ripemd160_16way_context *sc )

 void ripemd160_16way_init( ripemd160_16way_context *sc )
 {
-   sc->val[0] = m512_const1_64( 0x6745230167452301 );
-   sc->val[1] = m512_const1_64( 0xEFCDAB89EFCDAB89 );
-   sc->val[2] = m512_const1_64( 0x98BADCFE98BADCFE );
-   sc->val[3] = m512_const1_64( 0x1032547610325476 );
-   sc->val[4] = m512_const1_64( 0xC3D2E1F0C3D2E1F0 );
+   sc->val[0] = _mm512_set1_epi64( 0x6745230167452301 );
+   sc->val[1] = _mm512_set1_epi64( 0xEFCDAB89EFCDAB89 );
+   sc->val[2] = _mm512_set1_epi64( 0x98BADCFE98BADCFE );
+   sc->val[3] = _mm512_set1_epi64( 0x1032547610325476 );
+   sc->val[4] = _mm512_set1_epi64( 0xC3D2E1F0C3D2E1F0 );
   sc->count_high = sc->count_low = 0;
 }

@@ -902,7 +902,7 @@ void ripemd160_16way_close( ripemd160_16way_context  *sc, void *dst )
   const int pad = block_size - 8;

   ptr = (unsigned)sc->count_low & ( block_size - 1U);
-   sc->buf[ ptr>>2 ] = m512_const1_32( 0x80 );
+   sc->buf[ ptr>>2 ] = _mm512_set1_epi32( 0x80 );
   ptr += 4;

   if ( ptr > pad )
--- a/algo/sha/sha256-hash-4way.c
+++ b/algo/sha/sha256-hash-4way.c
@@ -311,7 +311,7 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
   __m128i A, B, C, D, E, F, G, H;
   __m128i W[16];      memcpy_128( W, data, 16 );
   // Value required by H after round 60 to produce valid final hash
-   const __m128i H_ = m128_const1_32( 0x136032ED );
+   const __m128i H_ = _mm_set1_epi32( 0x136032ED );

   A = _mm_load_si128( state_in   );
   B = _mm_load_si128( state_in+1 );
@@ -408,14 +408,14 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
 void sha256_4way_init( sha256_4way_context *sc )
 {
   sc->count_high = sc->count_low = 0;
-   sc->val[0] = m128_const1_64( 0x6A09E6676A09E667 );
-   sc->val[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
-   sc->val[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
-   sc->val[3] = m128_const1_64( 0xA54FF53AA54FF53A );
-   sc->val[4] = m128_const1_64( 0x510E527F510E527F );
-   sc->val[5] = m128_const1_64( 0x9B05688C9B05688C );
-   sc->val[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
-   sc->val[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+   sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
+   sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
+   sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
+   sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
+   sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
+   sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
+   sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
 }

 void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
@@ -458,7 +458,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
    const int pad = buf_size - 8;

    ptr = (unsigned)sc->count_low & (buf_size - 1U);
-    sc->buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
+    sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
    ptr += 4;

    if ( ptr > pad )
@@ -474,8 +474,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
    high = (sc->count_high << 3) | (low >> 29);
    low = low << 3;

-    sc->buf[  pad     >> 2 ] = m128_const1_32( bswap_32( high ) );
-    sc->buf[( pad+4 ) >> 2 ] = m128_const1_32( bswap_32( low ) );
+    sc->buf[  pad     >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
+    sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
    sha256_4way_transform_be( sc->val, sc->buf, sc->val );

    mm128_block_bswap_32( dst, sc->val );
@@ -589,7 +589,6 @@ do { \
  _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
                                         Y_xor_Z ) )

-
 #define SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H, i, j ) \
 do { \
  __m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
@@ -863,7 +862,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
 {
   __m256i A, B, C, D, E, F, G, H;
   __m256i W[16];  memcpy_256( W, data, 16 );
-   const __m256i H_ = m256_const1_32( 0x136032ED );
+   const __m256i H_ = _mm256_set1_epi32( 0x136032ED );

   A = _mm256_load_si256( state_in   );
   B = _mm256_load_si256( state_in+1 );
@@ -979,14 +978,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
 void sha256_8way_init( sha256_8way_context *sc )
 {
   sc->count_high = sc->count_low = 0;
-   sc->val[0] = m256_const1_64( 0x6A09E6676A09E667 );
-   sc->val[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
-   sc->val[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
-   sc->val[3] = m256_const1_64( 0xA54FF53AA54FF53A );
-   sc->val[4] = m256_const1_64( 0x510E527F510E527F );
-   sc->val[5] = m256_const1_64( 0x9B05688C9B05688C );
-   sc->val[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
-   sc->val[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
+   sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
+   sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
+   sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
+   sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
+   sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
+   sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
+   sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
 }

 // need to handle odd byte length for yespower.
@@ -1032,7 +1031,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
    const int pad = buf_size - 8;

    ptr = (unsigned)sc->count_low & (buf_size - 1U);
-    sc->buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
+    sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
    ptr += 4;

    if ( ptr > pad )
@@ -1048,8 +1047,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
    high = (sc->count_high << 3) | (low >> 29);
    low = low << 3;

-    sc->buf[   pad     >> 2 ] = m256_const1_32( bswap_32( high ) );
-    sc->buf[ ( pad+4 ) >> 2 ] = m256_const1_32( bswap_32( low ) );
+    sc->buf[   pad     >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
+    sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );

    sha256_8way_transform_be( sc->val, sc->buf, sc->val );

@@ -1360,7 +1359,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
   // Value for H at round 60, before adding K, needed to produce valid final
   // hash where H == 0.
   // H_ =  -( H256[7] + K256[60] );
-   const __m512i H_ = m512_const1_32( 0x136032ED );
+   const __m512i H_ = _mm512_set1_epi32( 0x136032ED );

   A = _mm512_load_si512( state_in   );
   B = _mm512_load_si512( state_in+1 );
@@ -1453,14 +1452,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
 void sha256_16way_init( sha256_16way_context *sc )
 {
   sc->count_high = sc->count_low = 0;
-   sc->val[0] = m512_const1_64( 0x6A09E6676A09E667 );
-   sc->val[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
-   sc->val[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
-   sc->val[3] = m512_const1_64( 0xA54FF53AA54FF53A );
-   sc->val[4] = m512_const1_64( 0x510E527F510E527F );
-   sc->val[5] = m512_const1_64( 0x9B05688C9B05688C );
-   sc->val[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
-   sc->val[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
+   sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
+   sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
+   sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
+   sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
+   sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
+   sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
+   sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
+   sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
 }

 void sha256_16way_update( sha256_16way_context *sc, const void *data,
@@ -1504,7 +1503,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
    const int pad = buf_size - 8;

    ptr = (unsigned)sc->count_low & (buf_size - 1U);
-    sc->buf[ ptr>>2 ] = m512_const1_64( 0x0000008000000080 );
+    sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
    ptr += 4;

    if ( ptr > pad )
@@ -1520,8 +1519,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
    high = (sc->count_high << 3) | (low >> 29);
    low = low << 3;

-    sc->buf[   pad     >> 2 ] = m512_const1_32( bswap_32( high ) );
-    sc->buf[ ( pad+4 ) >> 2 ] = m512_const1_32( bswap_32( low ) );
+    sc->buf[   pad     >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
+    sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );

    sha256_16way_transform_be( sc->val, sc->buf, sc->val );

--- a/algo/sha/sha256d-4way.c
+++ b/algo/sha/sha256d-4way.c
@@ -28,32 +28,32 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
   __m512i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m512i last_byte = m512_const1_32( 0x80000000 );
-   const __m512i sixteen = m512_const1_32( 16 );
+   const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
+   const __m512i sixteen = _mm512_set1_epi32( 16 );

   for ( int i = 0; i < 19; i++ )
-       vdata[i] = m512_const1_32( pdata[i] );
+       vdata[i] = _mm512_set1_epi32( pdata[i] );

   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_512( vdata+16 + 5, 10 );
-   vdata[16+15] = m512_const1_32( 80*8 ); // bit count
+   vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count

   block[ 8] = last_byte;
   memset_zero_512( block + 9, 6 );
-   block[15] = m512_const1_32( 32*8 ); // bit count
+   block[15] = _mm512_set1_epi32( 32*8 ); // bit count
   
   // initialize state
-   initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
-   initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
-   initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
-   initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
-   initstate[4] = m512_const1_64( 0x510E527F510E527F );
-   initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
-   initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
-   initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
+   initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
+   initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
+   initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
+   initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
+   initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
+   initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
+   initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
+   initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );

   sha256_16way_transform_le( midstate1, vdata, initstate );

@@ -116,31 +116,31 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
   __m256i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m256i last_byte = m256_const1_32( 0x80000000 );
-   const __m256i eight = m256_const1_32( 8 );
+   const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
+   const __m256i eight = _mm256_set1_epi32( 8 );

   for ( int i = 0; i < 19; i++ )
-      vdata[i] = m256_const1_32( pdata[i] );
+      vdata[i] = _mm256_set1_epi32( pdata[i] );

   *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_256( vdata+16 + 5, 10 );
-   vdata[16+15] = m256_const1_32( 80*8 ); // bit count
+   vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count

   block[ 8] = last_byte;
   memset_zero_256( block + 9, 6 );
-   block[15] = m256_const1_32( 32*8 ); // bit count
+   block[15] = _mm256_set1_epi32( 32*8 ); // bit count
   
   // initialize state
-   initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
-   initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
-   initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
-   initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
-   initstate[4] = m256_const1_64( 0x510E527F510E527F );
-   initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
-   initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
-   initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
+   initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
+   initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
+   initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
+   initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
+   initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
+   initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
+   initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );

   sha256_8way_transform_le( midstate1, vdata, initstate );
   
@@ -204,31 +204,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
   __m128i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m128i last_byte = m128_const1_32( 0x80000000 );
-   const __m128i four = m128_const1_32( 4 );
+   const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
+   const __m128i four = _mm_set1_epi32( 4 );

   for ( int i = 0; i < 19; i++ )
-       vdata[i] = m128_const1_32( pdata[i] );
+       vdata[i] = _mm_set1_epi32( pdata[i] );

   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_128( vdata+16 + 5, 10 );
-   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+   vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count

   block[ 8] = last_byte;
   memset_zero_128( block + 9, 6 );
-   block[15] = m128_const1_32( 32*8 ); // bit count
+   block[15] = _mm_set1_epi32( 32*8 ); // bit count

   // initialize state
-   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
-   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
-   initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
-   initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
-   initstate[4] = m128_const1_64( 0x510E527F510E527F );
-   initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
-   initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
-   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+   initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
+   initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
+   initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
+   initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
+   initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
+   initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
+   initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );

   // hash first 64 bytes of data
   sha256_4way_transform_le( midstate1, vdata, initstate );
--- a/algo/sha/sha256q-4way.c
+++ b/algo/sha/sha256q-4way.c
@@ -68,7 +68,7 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
           submit_solution( work, lane_hash, mythr );
        }
      }
-      *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
+      *noncev = _mm512_add_epi32( *noncev, _mm512_set1_epi32( 16 ) );
      n += 16;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
   pdata[19] = n;
@@ -140,7 +140,7 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
           submit_solution( work, lane_hash, mythr );
        }
      }
-      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
+      *noncev = _mm256_add_epi32( *noncev, _mm256_set1_epi32( 8 ) );
      n += 8;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
   pdata[19] = n;
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -28,31 +28,31 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
   __m512i *noncev = vdata + 19; 
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m512i last_byte = m512_const1_32( 0x80000000 );
-   const __m512i sixteen = m512_const1_32( 16 );
+   const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
+   const __m512i sixteen = _mm512_set1_epi32( 16 );

   for ( int i = 0; i < 19; i++ )
-      vdata[i] = m512_const1_32( pdata[i] );
+      vdata[i] = _mm512_set1_epi32( pdata[i] );

   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_512( vdata+16 + 5, 10 );
-   vdata[16+15] = m512_const1_32( 80*8 ); // bit count
+   vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
   
   block[ 8] = last_byte;
   memset_zero_512( block + 9, 6 );
-   block[15] = m512_const1_32( 32*8 ); // bit count
+   block[15] = _mm512_set1_epi32( 32*8 ); // bit count
   
-   initstate[0] = m512_const1_64( 0x6A09E6676A09E667 );
-   initstate[1] = m512_const1_64( 0xBB67AE85BB67AE85 );
-   initstate[2] = m512_const1_64( 0x3C6EF3723C6EF372 );
-   initstate[3] = m512_const1_64( 0xA54FF53AA54FF53A );
-   initstate[4] = m512_const1_64( 0x510E527F510E527F );
-   initstate[5] = m512_const1_64( 0x9B05688C9B05688C );
-   initstate[6] = m512_const1_64( 0x1F83D9AB1F83D9AB );
-   initstate[7] = m512_const1_64( 0x5BE0CD195BE0CD19 );
+   initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
+   initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
+   initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
+   initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
+   initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
+   initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
+   initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
+   initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );

   sha256_16way_transform_le( midstate1, vdata, initstate );
   
@@ -120,31 +120,31 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
   __m256i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m256i last_byte = m256_const1_32( 0x80000000 );
-   const __m256i eight = m256_const1_32( 8 );
+   const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
+   const __m256i eight = _mm256_set1_epi32( 8 );

   for ( int i = 0; i < 19; i++ )
-      vdata[i] = m256_const1_32( pdata[i] );
+      vdata[i] = _mm256_set1_epi32( pdata[i] );

   *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_256( vdata+16 + 5, 10 );
-   vdata[16+15] = m256_const1_32( 80*8 ); // bit count
+   vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count

   block[ 8] = last_byte;
   memset_zero_256( block + 9, 6 );
-   block[15] = m256_const1_32( 32*8 ); // bit count
+   block[15] = _mm256_set1_epi32( 32*8 ); // bit count
   
   // initialize state
-   initstate[0] = m256_const1_64( 0x6A09E6676A09E667 );
-   initstate[1] = m256_const1_64( 0xBB67AE85BB67AE85 );
-   initstate[2] = m256_const1_64( 0x3C6EF3723C6EF372 );
-   initstate[3] = m256_const1_64( 0xA54FF53AA54FF53A );
-   initstate[4] = m256_const1_64( 0x510E527F510E527F );
-   initstate[5] = m256_const1_64( 0x9B05688C9B05688C );
-   initstate[6] = m256_const1_64( 0x1F83D9AB1F83D9AB );
-   initstate[7] = m256_const1_64( 0x5BE0CD195BE0CD19 );
+   initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
+   initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
+   initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
+   initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
+   initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
+   initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
+   initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );

   sha256_8way_transform_le( midstate1, vdata, initstate );

@@ -215,31 +215,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
   __m128i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m128i last_byte = m128_const1_32( 0x80000000 );
-   const __m128i four = m128_const1_32( 4 );
+   const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
+   const __m128i four = _mm_set1_epi32( 4 );

   for ( int i = 0; i < 19; i++ )
-       vdata[i] = m128_const1_32( pdata[i] );
+       vdata[i] = _mm_set1_epi32( pdata[i] );

   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_128( vdata+16 + 5, 10 );
-   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+   vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count

   block[ 8] = last_byte;
   memset_zero_128( block + 9, 6 );
-   block[15] = m128_const1_32( 32*8 ); // bit count
+   block[15] = _mm_set1_epi32( 32*8 ); // bit count
   
   // initialize state
-   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
-   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
-   initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
-   initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
-   initstate[4] = m128_const1_64( 0x510E527F510E527F );
-   initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
-   initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
-   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+   initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
+   initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
+   initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
+   initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
+   initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
+   initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
+   initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );

   // hash first 64 bytes of data
   sha256_4way_transform_le( midstate1, vdata, initstate );
@@ -302,31 +302,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
   __m128i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m128i last_byte = m128_const1_32( 0x80000000 );
-   const __m128i four = m128_const1_32( 4 );
+   const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
+   const __m128i four = _mm_set1_epi32( 4 );

   for ( int i = 0; i < 19; i++ )
-       vdata[i] = m128_const1_32( pdata[i] );
+       vdata[i] = _mm_set1_epi32( pdata[i] );

   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_128( vdata+16 + 5, 10 );
-   vdata[16+15] = m128_const1_32( 80*8 ); // bit count
+   vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count

   block[ 8] = last_byte;
   memset_zero_128( block + 9, 6 );
-   block[15] = m128_const1_32( 32*8 ); // bit count
+   block[15] = _mm_set1_epi32( 32*8 ); // bit count
   
   // initialize state
-   initstate[0] = m128_const1_64( 0x6A09E6676A09E667 );
-   initstate[1] = m128_const1_64( 0xBB67AE85BB67AE85 );
-   initstate[2] = m128_const1_64( 0x3C6EF3723C6EF372 );
-   initstate[3] = m128_const1_64( 0xA54FF53AA54FF53A );
-   initstate[4] = m128_const1_64( 0x510E527F510E527F );
-   initstate[5] = m128_const1_64( 0x9B05688C9B05688C );
-   initstate[6] = m128_const1_64( 0x1F83D9AB1F83D9AB );
-   initstate[7] = m128_const1_64( 0x5BE0CD195BE0CD19 );
+   initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
+   initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
+   initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
+   initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
+   initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
+   initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
+   initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );

   // hash first 64 bytes of data
   sha256_4way_transform_le( midstate, vdata, initstate );
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -243,7 +243,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

    ptr = (unsigned)sc->count & (buf_size - 1U);
-    sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
+    sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
    ptr += 8;
    if ( ptr > pad )
    {
@@ -268,51 +268,56 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )

 // SHA-512 4 way 64 bit

+#define BSG5_0( x )     mm256_xor3( mm256_ror_64( x, 28 ), \
+                                    mm256_ror_64( x, 34 ), \
+                                    mm256_ror_64( x, 39 ) )
+
+#define BSG5_1( x )     mm256_xor3( mm256_ror_64( x, 14 ), \
+                                    mm256_ror_64( x, 18 ), \
+                                    mm256_ror_64( x, 41 ) )
+
+#define SSG5_0( x )     mm256_xor3( mm256_ror_64( x,  1 ), \
+                                    mm256_ror_64( x,  8 ), \
+                                    _mm256_srli_epi64( x, 7 ) ) 
+
+#define SSG5_1( x )     mm256_xor3( mm256_ror_64( x, 19 ), \
+                                    mm256_ror_64( x, 61 ), \
+                                    _mm256_srli_epi64( x, 6 ) )
+
+#if defined(__AVX512VL__)
+//TODO Enable for AVX10_256
+// 4 way is not used with AVX512 but will be with AVX10_256 when it
+// becomes available.
+
+#define CH( X, Y, Z )    _mm256_ternarylogic_epi64( X, Y, Z, 0xca )
+
+#define MAJ( X, Y, Z )   _mm256_ternarylogic_epi64( X, Y, Z, 0xe8 )
+   
+#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
+do { \
+  __m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
+  __m256i T1 = BSG5_1( E ); \
+  __m256i T2 = BSG5_0( A ); \
+  T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
+  T1 = _mm256_add_epi64( T1, H ); \
+  T2 = _mm256_add_epi64( T2, MAJ( A, B, C ) ); \
+  T1 = _mm256_add_epi64( T1, T0 ); \
+  D  = _mm256_add_epi64( D,  T1 ); \
+  H  = _mm256_add_epi64( T1, T2 ); \
+} while (0)
+
+#else   // AVX2 only
+
 #define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 

 #define MAJ(X, Y, Z) \
  _mm256_xor_si256( Y, _mm256_and_si256( X_xor_Y = _mm256_xor_si256( X, Y ), \
                                         Y_xor_Z ) )
-                    
-#define BSG5_0(x) \
-  mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
-                   _mm256_xor_si256( mm256_ror_64( x,  5 ), x ), 6 ), x ), 28 )
-
-#define BSG5_1(x) \
-  mm256_ror_64( _mm256_xor_si256( mm256_ror_64( \
-                   _mm256_xor_si256( mm256_ror_64( x, 23 ), x ), 4 ), x ), 14 )
-
-/*
-#define SSG5_0(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-        mm256_ror_64(x,  1), mm256_ror_64(x,  8) ), _mm256_srli_epi64(x, 7) ) 
-
-#define SSG5_1(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-        mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
-*/
-// Interleave SSG0 & SSG1 for better throughput.
-// return ssg0(w0) + ssg1(w1)
-static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
-{
-   __m256i w0a, w1a, w0b, w1b;
-   w0a = mm256_ror_64( w0, 1 );
-   w1a = mm256_ror_64( w1,19 );
-   w0b = mm256_ror_64( w0, 8 );
-   w1b = mm256_ror_64( w1,61 );
-   w0a = _mm256_xor_si256( w0a, w0b );
-   w1a = _mm256_xor_si256( w1a, w1b );
-   w0b = _mm256_srli_epi64( w0, 7 );
-   w1b = _mm256_srli_epi64( w1, 6 );
-   w0a = _mm256_xor_si256( w0a, w0b );
-   w1a = _mm256_xor_si256( w1a, w1b );
-   return _mm256_add_epi64( w0a, w1a );
-}

 #define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
 do { \
-  __m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[ i ] ); \
+  __m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
  __m256i T1 = BSG5_1( E ); \
  __m256i T2 = BSG5_0( A ); \
  T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -324,19 +329,27 @@ do { \
  H  = _mm256_add_epi64( T1, T2 ); \
 } while (0)

+#endif  // AVX512VL AVX10_256
+
 static void
 sha512_4way_round( sha512_4way_context *ctx,  __m256i *in, __m256i r[8] )
 {
   int i;
-   register __m256i A, B, C, D, E, F, G, H, X_xor_Y, Y_xor_Z;
+   register __m256i A, B, C, D, E, F, G, H;
+
+#if !defined(__AVX512VL__)
+// Disable for AVX10_256
+   __m256i X_xor_Y, Y_xor_Z;
+#endif
+
   __m256i W[80];

   mm256_block_bswap_64( W  , in );
   mm256_block_bswap_64( W+8, in+8 );

   for ( i = 16; i < 80; i++ )
-      W[i] = _mm256_add_epi64( ssg512_add( W[i-15], W[i-2] ),
-                               _mm256_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
+       W[i] = mm256_add4_64( SSG5_0( W[i-15] ), SSG5_1( W[i-2] ),
+                             W[ i- 7 ], W[ i-16 ] );

   if ( ctx->initialized )
   {
@@ -351,17 +364,20 @@ sha512_4way_round( sha512_4way_context *ctx,  __m256i *in, __m256i r[8] )
   }
   else
   {
-      A = m256_const1_64( 0x6A09E667F3BCC908 );
-      B = m256_const1_64( 0xBB67AE8584CAA73B );
-      C = m256_const1_64( 0x3C6EF372FE94F82B );
-      D = m256_const1_64( 0xA54FF53A5F1D36F1 );
-      E = m256_const1_64( 0x510E527FADE682D1 );
-      F = m256_const1_64( 0x9B05688C2B3E6C1F );
-      G = m256_const1_64( 0x1F83D9ABFB41BD6B );
-      H = m256_const1_64( 0x5BE0CD19137E2179 );
+      A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
+      B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
+      C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
+      D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
+      E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
+      F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
+      G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
+      H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
   }

+#if !defined(__AVX512VL__)
+// Disable for AVX10_256
   Y_xor_Z = _mm256_xor_si256( B, C );
+#endif

   for ( i = 0; i < 80; i += 8 )
   {
@@ -389,14 +405,14 @@ sha512_4way_round( sha512_4way_context *ctx,  __m256i *in, __m256i r[8] )
   else
   {
      ctx->initialized = true;
-      r[0] = _mm256_add_epi64( A, m256_const1_64( 0x6A09E667F3BCC908 ) );
-      r[1] = _mm256_add_epi64( B, m256_const1_64( 0xBB67AE8584CAA73B ) );
-      r[2] = _mm256_add_epi64( C, m256_const1_64( 0x3C6EF372FE94F82B ) );
-      r[3] = _mm256_add_epi64( D, m256_const1_64( 0xA54FF53A5F1D36F1 ) );
-      r[4] = _mm256_add_epi64( E, m256_const1_64( 0x510E527FADE682D1 ) );
-      r[5] = _mm256_add_epi64( F, m256_const1_64( 0x9B05688C2B3E6C1F ) );
-      r[6] = _mm256_add_epi64( G, m256_const1_64( 0x1F83D9ABFB41BD6B ) );
-      r[7] = _mm256_add_epi64( H, m256_const1_64( 0x5BE0CD19137E2179 ) );
+      r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
+      r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
+      r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
+      r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
+      r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
+      r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
+      r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
+      r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
   }
 }

@@ -441,7 +457,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

    ptr = (unsigned)sc->count & (buf_size - 1U);
-    sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
+    sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
    ptr += 8;
    if ( ptr > pad )
    {
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -112,50 +112,50 @@ extern "C"{
   else \
   { \
       (state)->state_loaded = true; \
-       A0 = m256_const1_64( 0x20728DFD20728DFD ); \
-       A1 = m256_const1_64( 0x46C0BD5346C0BD53 ); \
-       A2 = m256_const1_64( 0xE782B699E782B699 ); \
-       A3 = m256_const1_64( 0x5530463255304632 ); \
-       A4 = m256_const1_64( 0x71B4EF9071B4EF90 ); \
-       A5 = m256_const1_64( 0x0EA9E82C0EA9E82C ); \
-       A6 = m256_const1_64( 0xDBB930F1DBB930F1 ); \
-       A7 = m256_const1_64( 0xFAD06B8BFAD06B8B ); \
-       A8 = m256_const1_64( 0xBE0CAE40BE0CAE40 ); \
-       A9 = m256_const1_64( 0x8BD144108BD14410 ); \
-       AA = m256_const1_64( 0x76D2ADAC76D2ADAC ); \
-       AB = m256_const1_64( 0x28ACAB7F28ACAB7F ); \
-       B0 = m256_const1_64( 0xC1099CB7C1099CB7 ); \
-       B1 = m256_const1_64( 0x07B385F307B385F3 ); \
-       B2 = m256_const1_64( 0xE7442C26E7442C26 ); \
-       B3 = m256_const1_64( 0xCC8AD640CC8AD640 ); \
-       B4 = m256_const1_64( 0xEB6F56C7EB6F56C7 ); \
-       B5 = m256_const1_64( 0x1EA81AA91EA81AA9 ); \
-       B6 = m256_const1_64( 0x73B9D31473B9D314 ); \
-       B7 = m256_const1_64( 0x1DE85D081DE85D08 ); \
-       B8 = m256_const1_64( 0x48910A5A48910A5A ); \
-       B9 = m256_const1_64( 0x893B22DB893B22DB ); \
-       BA = m256_const1_64( 0xC5A0DF44C5A0DF44 ); \
-       BB = m256_const1_64( 0xBBC4324EBBC4324E ); \
-       BC = m256_const1_64( 0x72D2F24072D2F240 ); \
-       BD = m256_const1_64( 0x75941D9975941D99 ); \
-       BE = m256_const1_64( 0x6D8BDE826D8BDE82 ); \
-       BF = m256_const1_64( 0xA1A7502BA1A7502B ); \
-       C0 = m256_const1_64( 0xD9BF68D1D9BF68D1 ); \
-       C1 = m256_const1_64( 0x58BAD75058BAD750 ); \
-       C2 = m256_const1_64( 0x56028CB256028CB2 ); \
-       C3 = m256_const1_64( 0x8134F3598134F359 ); \
-       C4 = m256_const1_64( 0xB5D469D8B5D469D8 ); \
-       C5 = m256_const1_64( 0x941A8CC2941A8CC2 ); \
-       C6 = m256_const1_64( 0x418B2A6E418B2A6E ); \
-       C7 = m256_const1_64( 0x0405278004052780 ); \
-       C8 = m256_const1_64( 0x7F07D7877F07D787 ); \
-       C9 = m256_const1_64( 0x5194358F5194358F ); \
-       CA = m256_const1_64( 0x3C60D6653C60D665 ); \
-       CB = m256_const1_64( 0xBE97D79ABE97D79A ); \
-       CC = m256_const1_64( 0x950C3434950C3434 ); \
-       CD = m256_const1_64( 0xAED9A06DAED9A06D ); \
-       CE = m256_const1_64( 0x2537DC8D2537DC8D ); \
-       CF = m256_const1_64( 0x7CDB59697CDB5969 ); \
+       A0 = _mm256_set1_epi64x( 0x20728DFD20728DFD ); \
+       A1 = _mm256_set1_epi64x( 0x46C0BD5346C0BD53 ); \
+       A2 = _mm256_set1_epi64x( 0xE782B699E782B699 ); \
+       A3 = _mm256_set1_epi64x( 0x5530463255304632 ); \
+       A4 = _mm256_set1_epi64x( 0x71B4EF9071B4EF90 ); \
+       A5 = _mm256_set1_epi64x( 0x0EA9E82C0EA9E82C ); \
+       A6 = _mm256_set1_epi64x( 0xDBB930F1DBB930F1 ); \
+       A7 = _mm256_set1_epi64x( 0xFAD06B8BFAD06B8B ); \
+       A8 = _mm256_set1_epi64x( 0xBE0CAE40BE0CAE40 ); \
+       A9 = _mm256_set1_epi64x( 0x8BD144108BD14410 ); \
+       AA = _mm256_set1_epi64x( 0x76D2ADAC76D2ADAC ); \
+       AB = _mm256_set1_epi64x( 0x28ACAB7F28ACAB7F ); \
+       B0 = _mm256_set1_epi64x( 0xC1099CB7C1099CB7 ); \
+       B1 = _mm256_set1_epi64x( 0x07B385F307B385F3 ); \
+       B2 = _mm256_set1_epi64x( 0xE7442C26E7442C26 ); \
+       B3 = _mm256_set1_epi64x( 0xCC8AD640CC8AD640 ); \
+       B4 = _mm256_set1_epi64x( 0xEB6F56C7EB6F56C7 ); \
+       B5 = _mm256_set1_epi64x( 0x1EA81AA91EA81AA9 ); \
+       B6 = _mm256_set1_epi64x( 0x73B9D31473B9D314 ); \
+       B7 = _mm256_set1_epi64x( 0x1DE85D081DE85D08 ); \
+       B8 = _mm256_set1_epi64x( 0x48910A5A48910A5A ); \
+       B9 = _mm256_set1_epi64x( 0x893B22DB893B22DB ); \
+       BA = _mm256_set1_epi64x( 0xC5A0DF44C5A0DF44 ); \
+       BB = _mm256_set1_epi64x( 0xBBC4324EBBC4324E ); \
+       BC = _mm256_set1_epi64x( 0x72D2F24072D2F240 ); \
+       BD = _mm256_set1_epi64x( 0x75941D9975941D99 ); \
+       BE = _mm256_set1_epi64x( 0x6D8BDE826D8BDE82 ); \
+       BF = _mm256_set1_epi64x( 0xA1A7502BA1A7502B ); \
+       C0 = _mm256_set1_epi64x( 0xD9BF68D1D9BF68D1 ); \
+       C1 = _mm256_set1_epi64x( 0x58BAD75058BAD750 ); \
+       C2 = _mm256_set1_epi64x( 0x56028CB256028CB2 ); \
+       C3 = _mm256_set1_epi64x( 0x8134F3598134F359 ); \
+       C4 = _mm256_set1_epi64x( 0xB5D469D8B5D469D8 ); \
+       C5 = _mm256_set1_epi64x( 0x941A8CC2941A8CC2 ); \
+       C6 = _mm256_set1_epi64x( 0x418B2A6E418B2A6E ); \
+       C7 = _mm256_set1_epi64x( 0x0405278004052780 ); \
+       C8 = _mm256_set1_epi64x( 0x7F07D7877F07D787 ); \
+       C9 = _mm256_set1_epi64x( 0x5194358F5194358F ); \
+       CA = _mm256_set1_epi64x( 0x3C60D6653C60D665 ); \
+       CB = _mm256_set1_epi64x( 0xBE97D79ABE97D79A ); \
+       CC = _mm256_set1_epi64x( 0x950C3434950C3434 ); \
+       CD = _mm256_set1_epi64x( 0xAED9A06DAED9A06D ); \
+       CE = _mm256_set1_epi64x( 0x2537DC8D2537DC8D ); \
+       CF = _mm256_set1_epi64x( 0x7CDB59697CDB5969 ); \
   } \
   Wlow = (state)->Wlow; \
   Whigh = (state)->Whigh; \
@@ -303,7 +303,7 @@ do { \

 #define PERM_ELT8( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
 do { \
-   xa0 = mm256_xor3( xm, xb1, mm256_xorandnot(  \
+   xa0 = mm256_xor3( xm, xb1, mm256_xorandnot( \
           _mm256_mullo_epi32( mm256_xor3( xa0, xc, \
              _mm256_mullo_epi32( mm256_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
           xb3, xb2 ) ); \
@@ -443,52 +443,52 @@ shabal_8way_init( void *cc, unsigned size )
   else
   {  // No users
       sc->state_loaded = true;
-       sc->A[ 0] = m256_const1_64( 0x52F8455252F84552 );
-       sc->A[ 1] = m256_const1_64( 0xE54B7999E54B7999 );
-       sc->A[ 2] = m256_const1_64( 0x2D8EE3EC2D8EE3EC );
-       sc->A[ 3] = m256_const1_64( 0xB9645191B9645191 );
-       sc->A[ 4] = m256_const1_64( 0xE0078B86E0078B86 );
-       sc->A[ 5] = m256_const1_64( 0xBB7C44C9BB7C44C9 );
-       sc->A[ 6] = m256_const1_64( 0xD2B5C1CAD2B5C1CA );
-       sc->A[ 7] = m256_const1_64( 0xB0D2EB8CB0D2EB8C );
-       sc->A[ 8] = m256_const1_64( 0x14CE5A4514CE5A45 );
-       sc->A[ 9] = m256_const1_64( 0x22AF50DC22AF50DC );
-       sc->A[10] = m256_const1_64( 0xEFFDBC6BEFFDBC6B );
-       sc->A[11] = m256_const1_64( 0xEB21B74AEB21B74A );
+       sc->A[ 0] = _mm256_set1_epi64x( 0x52F8455252F84552 );
+       sc->A[ 1] = _mm256_set1_epi64x( 0xE54B7999E54B7999 );
+       sc->A[ 2] = _mm256_set1_epi64x( 0x2D8EE3EC2D8EE3EC );
+       sc->A[ 3] = _mm256_set1_epi64x( 0xB9645191B9645191 );
+       sc->A[ 4] = _mm256_set1_epi64x( 0xE0078B86E0078B86 );
+       sc->A[ 5] = _mm256_set1_epi64x( 0xBB7C44C9BB7C44C9 );
+       sc->A[ 6] = _mm256_set1_epi64x( 0xD2B5C1CAD2B5C1CA );
+       sc->A[ 7] = _mm256_set1_epi64x( 0xB0D2EB8CB0D2EB8C );
+       sc->A[ 8] = _mm256_set1_epi64x( 0x14CE5A4514CE5A45 );
+       sc->A[ 9] = _mm256_set1_epi64x( 0x22AF50DC22AF50DC );
+       sc->A[10] = _mm256_set1_epi64x( 0xEFFDBC6BEFFDBC6B );
+       sc->A[11] = _mm256_set1_epi64x( 0xEB21B74AEB21B74A );

-       sc->B[ 0] = m256_const1_64( 0xB555C6EEB555C6EE );
-       sc->B[ 1] = m256_const1_64( 0x3E7105963E710596 );
-       sc->B[ 2] = m256_const1_64( 0xA72A652FA72A652F );
-       sc->B[ 3] = m256_const1_64( 0x9301515F9301515F );
-       sc->B[ 4] = m256_const1_64( 0xDA28C1FADA28C1FA );
-       sc->B[ 5] = m256_const1_64( 0x696FD868696FD868 );
-       sc->B[ 6] = m256_const1_64( 0x9CB6BF729CB6BF72 );
-       sc->B[ 7] = m256_const1_64( 0x0AFE40020AFE4002 );
-       sc->B[ 8] = m256_const1_64( 0xA6E03615A6E03615 );
-       sc->B[ 9] = m256_const1_64( 0x5138C1D45138C1D4 );
-       sc->B[10] = m256_const1_64( 0xBE216306BE216306 );
-       sc->B[11] = m256_const1_64( 0xB38B8890B38B8890 );
-       sc->B[12] = m256_const1_64( 0x3EA8B96B3EA8B96B );
-       sc->B[13] = m256_const1_64( 0x3299ACE43299ACE4 );
-       sc->B[14] = m256_const1_64( 0x30924DD430924DD4 );
-       sc->B[15] = m256_const1_64( 0x55CB34A555CB34A5 );
+       sc->B[ 0] = _mm256_set1_epi64x( 0xB555C6EEB555C6EE );
+       sc->B[ 1] = _mm256_set1_epi64x( 0x3E7105963E710596 );
+       sc->B[ 2] = _mm256_set1_epi64x( 0xA72A652FA72A652F );
+       sc->B[ 3] = _mm256_set1_epi64x( 0x9301515F9301515F );
+       sc->B[ 4] = _mm256_set1_epi64x( 0xDA28C1FADA28C1FA );
+       sc->B[ 5] = _mm256_set1_epi64x( 0x696FD868696FD868 );
+       sc->B[ 6] = _mm256_set1_epi64x( 0x9CB6BF729CB6BF72 );
+       sc->B[ 7] = _mm256_set1_epi64x( 0x0AFE40020AFE4002 );
+       sc->B[ 8] = _mm256_set1_epi64x( 0xA6E03615A6E03615 );
+       sc->B[ 9] = _mm256_set1_epi64x( 0x5138C1D45138C1D4 );
+       sc->B[10] = _mm256_set1_epi64x( 0xBE216306BE216306 );
+       sc->B[11] = _mm256_set1_epi64x( 0xB38B8890B38B8890 );
+       sc->B[12] = _mm256_set1_epi64x( 0x3EA8B96B3EA8B96B );
+       sc->B[13] = _mm256_set1_epi64x( 0x3299ACE43299ACE4 );
+       sc->B[14] = _mm256_set1_epi64x( 0x30924DD430924DD4 );
+       sc->B[15] = _mm256_set1_epi64x( 0x55CB34A555CB34A5 );

-       sc->C[ 0] = m256_const1_64( 0xB405F031B405F031 );
-       sc->C[ 1] = m256_const1_64( 0xC4233EBAC4233EBA );
-       sc->C[ 2] = m256_const1_64( 0xB3733979B3733979 );
-       sc->C[ 3] = m256_const1_64( 0xC0DD9D55C0DD9D55 );
-       sc->C[ 4] = m256_const1_64( 0xC51C28AEC51C28AE );
-       sc->C[ 5] = m256_const1_64( 0xA327B8E1A327B8E1 );
-       sc->C[ 6] = m256_const1_64( 0x56C5616756C56167 );
-       sc->C[ 7] = m256_const1_64( 0xED614433ED614433 );
-       sc->C[ 8] = m256_const1_64( 0x88B59D6088B59D60 );
-       sc->C[ 9] = m256_const1_64( 0x60E2CEBA60E2CEBA );
-       sc->C[10] = m256_const1_64( 0x758B4B8B758B4B8B );
-       sc->C[11] = m256_const1_64( 0x83E82A7F83E82A7F );
-       sc->C[12] = m256_const1_64( 0xBC968828BC968828 );
-       sc->C[13] = m256_const1_64( 0xE6E00BF7E6E00BF7 );
-       sc->C[14] = m256_const1_64( 0xBA839E55BA839E55 );
-       sc->C[15] = m256_const1_64( 0x9B491C609B491C60 );
+       sc->C[ 0] = _mm256_set1_epi64x( 0xB405F031B405F031 );
+       sc->C[ 1] = _mm256_set1_epi64x( 0xC4233EBAC4233EBA );
+       sc->C[ 2] = _mm256_set1_epi64x( 0xB3733979B3733979 );
+       sc->C[ 3] = _mm256_set1_epi64x( 0xC0DD9D55C0DD9D55 );
+       sc->C[ 4] = _mm256_set1_epi64x( 0xC51C28AEC51C28AE );
+       sc->C[ 5] = _mm256_set1_epi64x( 0xA327B8E1A327B8E1 );
+       sc->C[ 6] = _mm256_set1_epi64x( 0x56C5616756C56167 );
+       sc->C[ 7] = _mm256_set1_epi64x( 0xED614433ED614433 );
+       sc->C[ 8] = _mm256_set1_epi64x( 0x88B59D6088B59D60 );
+       sc->C[ 9] = _mm256_set1_epi64x( 0x60E2CEBA60E2CEBA );
+       sc->C[10] = _mm256_set1_epi64x( 0x758B4B8B758B4B8B );
+       sc->C[11] = _mm256_set1_epi64x( 0x83E82A7F83E82A7F );
+       sc->C[12] = _mm256_set1_epi64x( 0xBC968828BC968828 );
+       sc->C[13] = _mm256_set1_epi64x( 0xE6E00BF7E6E00BF7 );
+       sc->C[14] = _mm256_set1_epi64x( 0xBA839E55BA839E55 );
+       sc->C[15] = _mm256_set1_epi64x( 0x9B491C609B491C60 );
   }
    sc->Wlow = 1;
    sc->Whigh = 0;
@@ -707,50 +707,50 @@ shabal512_8way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
   else \
   { \
       (state)->state_loaded = true; \
-       A0 = m128_const1_64( 0x20728DFD20728DFD ); \
-       A1 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
-       A2 = m128_const1_64( 0xE782B699E782B699 ); \
-       A3 = m128_const1_64( 0x5530463255304632 ); \
-       A4 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
-       A5 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
-       A6 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
-       A7 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
-       A8 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
-       A9 = m128_const1_64( 0x8BD144108BD14410 ); \
-       AA = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
-       AB = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
-       B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
-       B1 = m128_const1_64( 0x07B385F307B385F3 ); \
-       B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
-       B3 = m128_const1_64( 0xCC8AD640CC8AD640 ); \
-       B4 = m128_const1_64( 0xEB6F56C7EB6F56C7 ); \
-       B5 = m128_const1_64( 0x1EA81AA91EA81AA9 ); \
-       B6 = m128_const1_64( 0x73B9D31473B9D314 ); \
-       B7 = m128_const1_64( 0x1DE85D081DE85D08 ); \
-       B8 = m128_const1_64( 0x48910A5A48910A5A ); \
-       B9 = m128_const1_64( 0x893B22DB893B22DB ); \
-       BA = m128_const1_64( 0xC5A0DF44C5A0DF44 ); \
-       BB = m128_const1_64( 0xBBC4324EBBC4324E ); \
-       BC = m128_const1_64( 0x72D2F24072D2F240 ); \
-       BD = m128_const1_64( 0x75941D9975941D99 ); \
-       BE = m128_const1_64( 0x6D8BDE826D8BDE82 ); \
-       BF = m128_const1_64( 0xA1A7502BA1A7502B ); \
-       C0 = m128_const1_64( 0xD9BF68D1D9BF68D1 ); \
-       C1 = m128_const1_64( 0x58BAD75058BAD750 ); \
-       C2 = m128_const1_64( 0x56028CB256028CB2 ); \
-       C3 = m128_const1_64( 0x8134F3598134F359 ); \
-       C4 = m128_const1_64( 0xB5D469D8B5D469D8 ); \
-       C5 = m128_const1_64( 0x941A8CC2941A8CC2 ); \
-       C6 = m128_const1_64( 0x418B2A6E418B2A6E ); \
-       C7 = m128_const1_64( 0x0405278004052780 ); \
-       C8 = m128_const1_64( 0x7F07D7877F07D787 ); \
-       C9 = m128_const1_64( 0x5194358F5194358F ); \
-       CA = m128_const1_64( 0x3C60D6653C60D665 ); \
-       CB = m128_const1_64( 0xBE97D79ABE97D79A ); \
-       CC = m128_const1_64( 0x950C3434950C3434 ); \
-       CD = m128_const1_64( 0xAED9A06DAED9A06D ); \
-       CE = m128_const1_64( 0x2537DC8D2537DC8D ); \
-       CF = m128_const1_64( 0x7CDB59697CDB5969 ); \
+       A0 = _mm_set1_epi64x( 0x20728DFD20728DFD ); \
+       A1 = _mm_set1_epi64x( 0x46C0BD5346C0BD53 ); \
+       A2 = _mm_set1_epi64x( 0xE782B699E782B699 ); \
+       A3 = _mm_set1_epi64x( 0x5530463255304632 ); \
+       A4 = _mm_set1_epi64x( 0x71B4EF9071B4EF90 ); \
+       A5 = _mm_set1_epi64x( 0x0EA9E82C0EA9E82C ); \
+       A6 = _mm_set1_epi64x( 0xDBB930F1DBB930F1 ); \
+       A7 = _mm_set1_epi64x( 0xFAD06B8BFAD06B8B ); \
+       A8 = _mm_set1_epi64x( 0xBE0CAE40BE0CAE40 ); \
+       A9 = _mm_set1_epi64x( 0x8BD144108BD14410 ); \
+       AA = _mm_set1_epi64x( 0x76D2ADAC76D2ADAC ); \
+       AB = _mm_set1_epi64x( 0x28ACAB7F28ACAB7F ); \
+       B0 = _mm_set1_epi64x( 0xC1099CB7C1099CB7 ); \
+       B1 = _mm_set1_epi64x( 0x07B385F307B385F3 ); \
+       B2 = _mm_set1_epi64x( 0xE7442C26E7442C26 ); \
+       B3 = _mm_set1_epi64x( 0xCC8AD640CC8AD640 ); \
+       B4 = _mm_set1_epi64x( 0xEB6F56C7EB6F56C7 ); \
+       B5 = _mm_set1_epi64x( 0x1EA81AA91EA81AA9 ); \
+       B6 = _mm_set1_epi64x( 0x73B9D31473B9D314 ); \
+       B7 = _mm_set1_epi64x( 0x1DE85D081DE85D08 ); \
+       B8 = _mm_set1_epi64x( 0x48910A5A48910A5A ); \
+       B9 = _mm_set1_epi64x( 0x893B22DB893B22DB ); \
+       BA = _mm_set1_epi64x( 0xC5A0DF44C5A0DF44 ); \
+       BB = _mm_set1_epi64x( 0xBBC4324EBBC4324E ); \
+       BC = _mm_set1_epi64x( 0x72D2F24072D2F240 ); \
+       BD = _mm_set1_epi64x( 0x75941D9975941D99 ); \
+       BE = _mm_set1_epi64x( 0x6D8BDE826D8BDE82 ); \
+       BF = _mm_set1_epi64x( 0xA1A7502BA1A7502B ); \
+       C0 = _mm_set1_epi64x( 0xD9BF68D1D9BF68D1 ); \
+       C1 = _mm_set1_epi64x( 0x58BAD75058BAD750 ); \
+       C2 = _mm_set1_epi64x( 0x56028CB256028CB2 ); \
+       C3 = _mm_set1_epi64x( 0x8134F3598134F359 ); \
+       C4 = _mm_set1_epi64x( 0xB5D469D8B5D469D8 ); \
+       C5 = _mm_set1_epi64x( 0x941A8CC2941A8CC2 ); \
+       C6 = _mm_set1_epi64x( 0x418B2A6E418B2A6E ); \
+       C7 = _mm_set1_epi64x( 0x0405278004052780 ); \
+       C8 = _mm_set1_epi64x( 0x7F07D7877F07D787 ); \
+       C9 = _mm_set1_epi64x( 0x5194358F5194358F ); \
+       CA = _mm_set1_epi64x( 0x3C60D6653C60D665 ); \
+       CB = _mm_set1_epi64x( 0xBE97D79ABE97D79A ); \
+       CC = _mm_set1_epi64x( 0x950C3434950C3434 ); \
+       CD = _mm_set1_epi64x( 0xAED9A06DAED9A06D ); \
+       CE = _mm_set1_epi64x( 0x2537DC8D2537DC8D ); \
+       CF = _mm_set1_epi64x( 0x7CDB59697CDB5969 ); \
   } \
   Wlow = (state)->Wlow; \
   Whigh = (state)->Whigh; \
@@ -896,6 +896,16 @@ do { \
    mm128_swap256_128( BF, CF ); \
 } while (0)

+#define PERM_ELT( xa0, xa1, xb0, xb1, xb2, xb3, xc, xm ) \
+do { \
+   xa0 = mm128_xor3( xm, xb1, mm128_xorandnot( \
+           _mm_mullo_epi32( mm128_xor3( xa0, xc, \
+              _mm_mullo_epi32( mm128_rol_32( xa1, 15 ), FIVE ) ), THREE ), \
+           xb3, xb2 ) ); \
+   xb0 = mm128_xnor( xa0, mm128_rol_32( xb0, 1 ) ); \
+} while (0)
+
+/*
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
 do { \
   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
@@ -905,6 +915,7 @@ do { \
                   ) ), THREE ) ) ) ); \
   xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
 } while (0)
+*/

 #define PERM_STEP_0   do { \
 		PERM_ELT(A0, AB, B0, BD, B9, B6, C8, M0); \
@@ -1078,103 +1089,103 @@ shabal_4way_init( void *cc, unsigned size )
   { // copy immediate constants directly to working registers later.
       sc->state_loaded = false;
 /*
-       sc->A[ 0] = m128_const1_64( 0x20728DFD20728DFD );
-       sc->A[ 1] = m128_const1_64( 0x46C0BD5346C0BD53 );
-       sc->A[ 2] = m128_const1_64( 0xE782B699E782B699 );
-       sc->A[ 3] = m128_const1_64( 0x5530463255304632 );
-       sc->A[ 4] = m128_const1_64( 0x71B4EF9071B4EF90 );
-       sc->A[ 5] = m128_const1_64( 0x0EA9E82C0EA9E82C );
-       sc->A[ 6] = m128_const1_64( 0xDBB930F1DBB930F1 );
-       sc->A[ 7] = m128_const1_64( 0xFAD06B8BFAD06B8B );
-       sc->A[ 8] = m128_const1_64( 0xBE0CAE40BE0CAE40 );
-       sc->A[ 9] = m128_const1_64( 0x8BD144108BD14410 );
-       sc->A[10] = m128_const1_64( 0x76D2ADAC76D2ADAC );
-       sc->A[11] = m128_const1_64( 0x28ACAB7F28ACAB7F );
+       sc->A[ 0] = _mm_set1_epi64x( 0x20728DFD20728DFD );
+       sc->A[ 1] = _mm_set1_epi64x( 0x46C0BD5346C0BD53 );
+       sc->A[ 2] = _mm_set1_epi64x( 0xE782B699E782B699 );
+       sc->A[ 3] = _mm_set1_epi64x( 0x5530463255304632 );
+       sc->A[ 4] = _mm_set1_epi64x( 0x71B4EF9071B4EF90 );
+       sc->A[ 5] = _mm_set1_epi64x( 0x0EA9E82C0EA9E82C );
+       sc->A[ 6] = _mm_set1_epi64x( 0xDBB930F1DBB930F1 );
+       sc->A[ 7] = _mm_set1_epi64x( 0xFAD06B8BFAD06B8B );
+       sc->A[ 8] = _mm_set1_epi64x( 0xBE0CAE40BE0CAE40 );
+       sc->A[ 9] = _mm_set1_epi64x( 0x8BD144108BD14410 );
+       sc->A[10] = _mm_set1_epi64x( 0x76D2ADAC76D2ADAC );
+       sc->A[11] = _mm_set1_epi64x( 0x28ACAB7F28ACAB7F );

-       sc->B[ 0] = m128_const1_64( 0xC1099CB7C1099CB7 );
-       sc->B[ 1] = m128_const1_64( 0x07B385F307B385F3 );
-       sc->B[ 2] = m128_const1_64( 0xE7442C26E7442C26 );
-       sc->B[ 3] = m128_const1_64( 0xCC8AD640CC8AD640 );
-       sc->B[ 4] = m128_const1_64( 0xEB6F56C7EB6F56C7 );
-       sc->B[ 5] = m128_const1_64( 0x1EA81AA91EA81AA9 );
-       sc->B[ 6] = m128_const1_64( 0x73B9D31473B9D314 );
-       sc->B[ 7] = m128_const1_64( 0x1DE85D081DE85D08 );
-       sc->B[ 8] = m128_const1_64( 0x48910A5A48910A5A );
-       sc->B[ 9] = m128_const1_64( 0x893B22DB893B22DB );
-       sc->B[10] = m128_const1_64( 0xC5A0DF44C5A0DF44 );
-       sc->B[11] = m128_const1_64( 0xBBC4324EBBC4324E );
-       sc->B[12] = m128_const1_64( 0x72D2F24072D2F240 );
-       sc->B[13] = m128_const1_64( 0x75941D9975941D99 );
-       sc->B[14] = m128_const1_64( 0x6D8BDE826D8BDE82 );
-       sc->B[15] = m128_const1_64( 0xA1A7502BA1A7502B );
+       sc->B[ 0] = _mm_set1_epi64x( 0xC1099CB7C1099CB7 );
+       sc->B[ 1] = _mm_set1_epi64x( 0x07B385F307B385F3 );
+       sc->B[ 2] = _mm_set1_epi64x( 0xE7442C26E7442C26 );
+       sc->B[ 3] = _mm_set1_epi64x( 0xCC8AD640CC8AD640 );
+       sc->B[ 4] = _mm_set1_epi64x( 0xEB6F56C7EB6F56C7 );
+       sc->B[ 5] = _mm_set1_epi64x( 0x1EA81AA91EA81AA9 );
+       sc->B[ 6] = _mm_set1_epi64x( 0x73B9D31473B9D314 );
+       sc->B[ 7] = _mm_set1_epi64x( 0x1DE85D081DE85D08 );
+       sc->B[ 8] = _mm_set1_epi64x( 0x48910A5A48910A5A );
+       sc->B[ 9] = _mm_set1_epi64x( 0x893B22DB893B22DB );
+       sc->B[10] = _mm_set1_epi64x( 0xC5A0DF44C5A0DF44 );
+       sc->B[11] = _mm_set1_epi64x( 0xBBC4324EBBC4324E );
+       sc->B[12] = _mm_set1_epi64x( 0x72D2F24072D2F240 );
+       sc->B[13] = _mm_set1_epi64x( 0x75941D9975941D99 );
+       sc->B[14] = _mm_set1_epi64x( 0x6D8BDE826D8BDE82 );
+       sc->B[15] = _mm_set1_epi64x( 0xA1A7502BA1A7502B );

-       sc->C[ 0] = m128_const1_64( 0xD9BF68D1D9BF68D1 );
-       sc->C[ 1] = m128_const1_64( 0x58BAD75058BAD750 );
-       sc->C[ 2] = m128_const1_64( 0x56028CB256028CB2 );
-       sc->C[ 3] = m128_const1_64( 0x8134F3598134F359 );
-       sc->C[ 4] = m128_const1_64( 0xB5D469D8B5D469D8 );
-       sc->C[ 5] = m128_const1_64( 0x941A8CC2941A8CC2 );
-       sc->C[ 6] = m128_const1_64( 0x418B2A6E418B2A6E );
-       sc->C[ 7] = m128_const1_64( 0x0405278004052780 );
-       sc->C[ 8] = m128_const1_64( 0x7F07D7877F07D787 );
-       sc->C[ 9] = m128_const1_64( 0x5194358F5194358F );
-       sc->C[10] = m128_const1_64( 0x3C60D6653C60D665 );
-       sc->C[11] = m128_const1_64( 0xBE97D79ABE97D79A );
-       sc->C[12] = m128_const1_64( 0x950C3434950C3434 );
-       sc->C[13] = m128_const1_64( 0xAED9A06DAED9A06D );
-       sc->C[14] = m128_const1_64( 0x2537DC8D2537DC8D );
-       sc->C[15] = m128_const1_64( 0x7CDB59697CDB5969 );
+       sc->C[ 0] = _mm_set1_epi64x( 0xD9BF68D1D9BF68D1 );
+       sc->C[ 1] = _mm_set1_epi64x( 0x58BAD75058BAD750 );
+       sc->C[ 2] = _mm_set1_epi64x( 0x56028CB256028CB2 );
+       sc->C[ 3] = _mm_set1_epi64x( 0x8134F3598134F359 );
+       sc->C[ 4] = _mm_set1_epi64x( 0xB5D469D8B5D469D8 );
+       sc->C[ 5] = _mm_set1_epi64x( 0x941A8CC2941A8CC2 );
+       sc->C[ 6] = _mm_set1_epi64x( 0x418B2A6E418B2A6E );
+       sc->C[ 7] = _mm_set1_epi64x( 0x0405278004052780 );
+       sc->C[ 8] = _mm_set1_epi64x( 0x7F07D7877F07D787 );
+       sc->C[ 9] = _mm_set1_epi64x( 0x5194358F5194358F );
+       sc->C[10] = _mm_set1_epi64x( 0x3C60D6653C60D665 );
+       sc->C[11] = _mm_set1_epi64x( 0xBE97D79ABE97D79A );
+       sc->C[12] = _mm_set1_epi64x( 0x950C3434950C3434 );
+       sc->C[13] = _mm_set1_epi64x( 0xAED9A06DAED9A06D );
+       sc->C[14] = _mm_set1_epi64x( 0x2537DC8D2537DC8D );
+       sc->C[15] = _mm_set1_epi64x( 0x7CDB59697CDB5969 );
 */
   }
   else
   {  // No users
       sc->state_loaded = true;
-       sc->A[ 0] = m128_const1_64( 0x52F8455252F84552 );
-       sc->A[ 1] = m128_const1_64( 0xE54B7999E54B7999 );
-       sc->A[ 2] = m128_const1_64( 0x2D8EE3EC2D8EE3EC );
-       sc->A[ 3] = m128_const1_64( 0xB9645191B9645191 );
-       sc->A[ 4] = m128_const1_64( 0xE0078B86E0078B86 );
-       sc->A[ 5] = m128_const1_64( 0xBB7C44C9BB7C44C9 );
-       sc->A[ 6] = m128_const1_64( 0xD2B5C1CAD2B5C1CA );
-       sc->A[ 7] = m128_const1_64( 0xB0D2EB8CB0D2EB8C );
-       sc->A[ 8] = m128_const1_64( 0x14CE5A4514CE5A45 );
-       sc->A[ 9] = m128_const1_64( 0x22AF50DC22AF50DC );
-       sc->A[10] = m128_const1_64( 0xEFFDBC6BEFFDBC6B );
-       sc->A[11] = m128_const1_64( 0xEB21B74AEB21B74A );
+       sc->A[ 0] = _mm_set1_epi64x( 0x52F8455252F84552 );
+       sc->A[ 1] = _mm_set1_epi64x( 0xE54B7999E54B7999 );
+       sc->A[ 2] = _mm_set1_epi64x( 0x2D8EE3EC2D8EE3EC );
+       sc->A[ 3] = _mm_set1_epi64x( 0xB9645191B9645191 );
+       sc->A[ 4] = _mm_set1_epi64x( 0xE0078B86E0078B86 );
+       sc->A[ 5] = _mm_set1_epi64x( 0xBB7C44C9BB7C44C9 );
+       sc->A[ 6] = _mm_set1_epi64x( 0xD2B5C1CAD2B5C1CA );
+       sc->A[ 7] = _mm_set1_epi64x( 0xB0D2EB8CB0D2EB8C );
+       sc->A[ 8] = _mm_set1_epi64x( 0x14CE5A4514CE5A45 );
+       sc->A[ 9] = _mm_set1_epi64x( 0x22AF50DC22AF50DC );
+       sc->A[10] = _mm_set1_epi64x( 0xEFFDBC6BEFFDBC6B );
+       sc->A[11] = _mm_set1_epi64x( 0xEB21B74AEB21B74A );

-       sc->B[ 0] = m128_const1_64( 0xB555C6EEB555C6EE );
-       sc->B[ 1] = m128_const1_64( 0x3E7105963E710596 );
-       sc->B[ 2] = m128_const1_64( 0xA72A652FA72A652F );
-       sc->B[ 3] = m128_const1_64( 0x9301515F9301515F );
-       sc->B[ 4] = m128_const1_64( 0xDA28C1FADA28C1FA );
-       sc->B[ 5] = m128_const1_64( 0x696FD868696FD868 );
-       sc->B[ 6] = m128_const1_64( 0x9CB6BF729CB6BF72 );
-       sc->B[ 7] = m128_const1_64( 0x0AFE40020AFE4002 );
-       sc->B[ 8] = m128_const1_64( 0xA6E03615A6E03615 );
-       sc->B[ 9] = m128_const1_64( 0x5138C1D45138C1D4 );
-       sc->B[10] = m128_const1_64( 0xBE216306BE216306 );
-       sc->B[11] = m128_const1_64( 0xB38B8890B38B8890 );
-       sc->B[12] = m128_const1_64( 0x3EA8B96B3EA8B96B );
-       sc->B[13] = m128_const1_64( 0x3299ACE43299ACE4 );
-       sc->B[14] = m128_const1_64( 0x30924DD430924DD4 );
-       sc->B[15] = m128_const1_64( 0x55CB34A555CB34A5 );
+       sc->B[ 0] = _mm_set1_epi64x( 0xB555C6EEB555C6EE );
+       sc->B[ 1] = _mm_set1_epi64x( 0x3E7105963E710596 );
+       sc->B[ 2] = _mm_set1_epi64x( 0xA72A652FA72A652F );
+       sc->B[ 3] = _mm_set1_epi64x( 0x9301515F9301515F );
+       sc->B[ 4] = _mm_set1_epi64x( 0xDA28C1FADA28C1FA );
+       sc->B[ 5] = _mm_set1_epi64x( 0x696FD868696FD868 );
+       sc->B[ 6] = _mm_set1_epi64x( 0x9CB6BF729CB6BF72 );
+       sc->B[ 7] = _mm_set1_epi64x( 0x0AFE40020AFE4002 );
+       sc->B[ 8] = _mm_set1_epi64x( 0xA6E03615A6E03615 );
+       sc->B[ 9] = _mm_set1_epi64x( 0x5138C1D45138C1D4 );
+       sc->B[10] = _mm_set1_epi64x( 0xBE216306BE216306 );
+       sc->B[11] = _mm_set1_epi64x( 0xB38B8890B38B8890 );
+       sc->B[12] = _mm_set1_epi64x( 0x3EA8B96B3EA8B96B );
+       sc->B[13] = _mm_set1_epi64x( 0x3299ACE43299ACE4 );
+       sc->B[14] = _mm_set1_epi64x( 0x30924DD430924DD4 );
+       sc->B[15] = _mm_set1_epi64x( 0x55CB34A555CB34A5 );

-       sc->C[ 0] = m128_const1_64( 0xB405F031B405F031 );
-       sc->C[ 1] = m128_const1_64( 0xC4233EBAC4233EBA );
-       sc->C[ 2] = m128_const1_64( 0xB3733979B3733979 );
-       sc->C[ 3] = m128_const1_64( 0xC0DD9D55C0DD9D55 );
-       sc->C[ 4] = m128_const1_64( 0xC51C28AEC51C28AE );
-       sc->C[ 5] = m128_const1_64( 0xA327B8E1A327B8E1 );
-       sc->C[ 6] = m128_const1_64( 0x56C5616756C56167 );
-       sc->C[ 7] = m128_const1_64( 0xED614433ED614433 );
-       sc->C[ 8] = m128_const1_64( 0x88B59D6088B59D60 );
-       sc->C[ 9] = m128_const1_64( 0x60E2CEBA60E2CEBA );
-       sc->C[10] = m128_const1_64( 0x758B4B8B758B4B8B );
-       sc->C[11] = m128_const1_64( 0x83E82A7F83E82A7F );
-       sc->C[12] = m128_const1_64( 0xBC968828BC968828 );
-       sc->C[13] = m128_const1_64( 0xE6E00BF7E6E00BF7 );
-       sc->C[14] = m128_const1_64( 0xBA839E55BA839E55 );
-       sc->C[15] = m128_const1_64( 0x9B491C609B491C60 );
+       sc->C[ 0] = _mm_set1_epi64x( 0xB405F031B405F031 );
+       sc->C[ 1] = _mm_set1_epi64x( 0xC4233EBAC4233EBA );
+       sc->C[ 2] = _mm_set1_epi64x( 0xB3733979B3733979 );
+       sc->C[ 3] = _mm_set1_epi64x( 0xC0DD9D55C0DD9D55 );
+       sc->C[ 4] = _mm_set1_epi64x( 0xC51C28AEC51C28AE );
+       sc->C[ 5] = _mm_set1_epi64x( 0xA327B8E1A327B8E1 );
+       sc->C[ 6] = _mm_set1_epi64x( 0x56C5616756C56167 );
+       sc->C[ 7] = _mm_set1_epi64x( 0xED614433ED614433 );
+       sc->C[ 8] = _mm_set1_epi64x( 0x88B59D6088B59D60 );
+       sc->C[ 9] = _mm_set1_epi64x( 0x60E2CEBA60E2CEBA );
+       sc->C[10] = _mm_set1_epi64x( 0x758B4B8B758B4B8B );
+       sc->C[11] = _mm_set1_epi64x( 0x83E82A7F83E82A7F );
+       sc->C[12] = _mm_set1_epi64x( 0xBC968828BC968828 );
+       sc->C[13] = _mm_set1_epi64x( 0xE6E00BF7E6E00BF7 );
+       sc->C[14] = _mm_set1_epi64x( 0xBA839E55BA839E55 );
+       sc->C[15] = _mm_set1_epi64x( 0x9B491C609B491C60 );
   }
    sc->Wlow = 1;
    sc->Whigh = 0;
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -32,6 +32,44 @@ static const uint32_t IV512[] =

 #endif

+#if defined (__AVX512VL__)
+//TODO Enable for AVX10_256
+
+#define DECL_m256i_count \
+   const __m256i count = \
+          mm256_set4_32( ctx->count3, ctx->count2, ctx->count1, ctx->count0 );
+
+#define COUNT_R0 \
+  _mm256_mask_xor_epi32( count, 0x88, count, m256_neg1 )
+
+#define COUNT_R1 \
+  mm256_shuflr128_32( _mm256_mask_xor_epi32( count, 0x11, count, m256_neg1 ) )
+
+#define COUNT_R2 \
+  mm256_swap128_64( _mm256_mask_xor_epi32( count, 0x22, count, m256_neg1 ) )
+
+#define COUNT_R13 \
+  mm256_swap64_32( _mm256_mask_xor_epi32( count, 0x44, count, m256_neg1 ) )
+
+#else
+
+#define DECL_m256i_count
+
+// R matches the loop index, not the round number; should change that.
+#define COUNT_R0 \
+  mm256_set4_32( ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 )
+
+#define COUNT_R1 \
+  mm256_set4_32( ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) 
+
+#define COUNT_R2 \
+  mm256_set4_32( ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 )
+
+#define COUNT_R13 \
+  mm256_set4_32( ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 )
+
+#endif
+
 static void
 c512_2way( shavite512_2way_context *ctx, const void *msg )
 {
@@ -40,6 +78,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   __m256i k00, k01, k02, k03, k10, k11, k12, k13;
   __m256i *m = (__m256i*)msg;
   __m256i *h = (__m256i*)ctx->h;
+   DECL_m256i_count;
   int r;

   p0 = h[0];
@@ -47,7 +86,8 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   p2 = h[2];
   p3 = h[3];

-   // round
+   // round 0
+
   k00 = m[0];
   x = mm256_aesenc_2x128( _mm256_xor_si256( p1, k00 ), zero );
   k01 = m[1];
@@ -78,18 +118,14 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
                                  mm256_aesenc_2x128( k00, zero ) ) );

     if ( r == 0 )
-        k00 = _mm256_xor_si256( k00, _mm256_set_epi32( 
-		      ~ctx->count3, ctx->count2, ctx->count1, ctx->count0,
-                      ~ctx->count3, ctx->count2, ctx->count1, ctx->count0 ) );
+        k00 = _mm256_xor_si256( k00, COUNT_R0 );

     x = mm256_aesenc_2x128( _mm256_xor_si256( p0, k00 ), zero );
     k01 = _mm256_xor_si256( k00,
 		     mm256_shuflr128_32( mm256_aesenc_2x128( k01, zero ) ) );

     if ( r == 1 )
-        k01 = _mm256_xor_si256( k01, _mm256_set_epi32(
-	               ~ctx->count0, ctx->count1, ctx->count2, ctx->count3,
-                       ~ctx->count0, ctx->count1, ctx->count2, ctx->count3 ) );
+        k01 = _mm256_xor_si256( k01, COUNT_R1 );

     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k01 ), zero );
     k02 = _mm256_xor_si256( k01,
@@ -114,9 +150,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
 		     mm256_shuflr128_32( mm256_aesenc_2x128( k13, zero ) ) );

     if ( r == 2 )
-        k13 = _mm256_xor_si256( k13, _mm256_set_epi32(
-                  ~ctx->count1, ctx->count0, ctx->count3, ctx->count2,
-                  ~ctx->count1, ctx->count0, ctx->count3, ctx->count2 ) );
+        k13 = _mm256_xor_si256( k13, COUNT_R2 );
 
     x = mm256_aesenc_2x128( _mm256_xor_si256( x, k13 ), zero );
     p1 = _mm256_xor_si256( p1, x );
@@ -228,9 +262,7 @@ c512_2way( shavite512_2way_context *ctx, const void *msg )
   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k11 ), zero );

   k12 = mm256_shuflr128_32( mm256_aesenc_2x128( k12, zero ) );
-   k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, _mm256_set_epi32(
-	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1,
-	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
+   k12 = _mm256_xor_si256( k12, _mm256_xor_si256( k11, COUNT_R13 ) );

   x = mm256_aesenc_2x128( _mm256_xor_si256( x, k12 ), zero );
   k13 = _mm256_xor_si256( mm256_shuflr128_32(
--- a/algo/shavite/shavite-hash-4way.c
+++ b/algo/shavite/shavite-hash-4way.c
@@ -204,11 +204,9 @@ c512_4way( shavite512_4way_context *ctx, const void *msg )
   K5 = _mm512_xor_si512( mm512_shuflr128_32(
 			             _mm512_aesenc_epi128( K5, m512_zero ) ), K4 );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K5 ), m512_zero );
-
   K6 = mm512_shuflr128_32( _mm512_aesenc_epi128( K6, m512_zero ) );
-   K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5, _mm512_set4_epi32(
-	       ~ctx->count2, ctx->count3, ctx->count0, ctx->count1 ) ) );
-
+   K6 = _mm512_xor_si512( K6, _mm512_xor_si512( K5,  mm512_swap64_32( 
+              _mm512_mask_xor_epi32( count, 0x4444, count, m512_neg1 ) ) ) );
   X = _mm512_aesenc_epi128( _mm512_xor_si512( X, K6 ), m512_zero );
   K7= _mm512_xor_si512( mm512_shuflr128_32(
 			             _mm512_aesenc_epi128( K7, m512_zero ) ), K6 );
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -212,14 +212,24 @@ do { \
 // targetted
 #define shufxor2w(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))

+#if defined(__AVX512VL__)
+//TODO Enable for AVX10_256
+
 #define REDUCE(x) \
-  _mm256_sub_epi16( _mm256_and_si256( x, m256_const1_64( \
+  _mm256_sub_epi16( _mm256_maskz_mov_epi8( 0x55555555, x ), \
+                    _mm256_srai_epi16( x, 8 ) )
+#else
+
+#define REDUCE(x) \
+  _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi64x( \
                         0x00ff00ff00ff00ff ) ), _mm256_srai_epi16( x, 8 ) )

+#endif
+
 #define EXTRA_REDUCE_S(x)\
  _mm256_sub_epi16( x, _mm256_and_si256( \
-             m256_const1_64( 0x0101010101010101 ), \
-             _mm256_cmpgt_epi16( x, m256_const1_64( 0x0080008000800080 ) ) ) )
+          _mm256_set1_epi64x( 0x0101010101010101 ), \
+          _mm256_cmpgt_epi16( x, _mm256_set1_epi64x( 0x0080008000800080 ) ) ) )

 #define REDUCE_FULL_S( x )  EXTRA_REDUCE_S( REDUCE (x ) )

@@ -387,17 +397,11 @@ static const m512_v16 FFT256_Twiddle4w[] =
  _mm512_sub_epi16( _mm512_maskz_mov_epi8( 0x5555555555555555, x ), \
                    _mm512_srai_epi16( x, 8 ) )

-/*
-#define REDUCE4w(x) \
-  _mm512_sub_epi16( _mm512_and_si512( x, m512_const1_64( \
-                         0x00ff00ff00ff00ff ) ), _mm512_srai_epi16( x, 8 ) )
-*/
-
 #define EXTRA_REDUCE_S4w(x) \
  _mm512_sub_epi16( x, _mm512_and_si512( \
-             m512_const1_64( 0x0101010101010101 ), \
+             _mm512_set1_epi64( 0x0101010101010101 ), \
             _mm512_movm_epi16( _mm512_cmpgt_epi16_mask( \
-                               x, m512_const1_64( 0x0080008000800080 ) ) ) ) )
+                             x, _mm512_set1_epi64( 0x0080008000800080 ) ) ) ) )

 // generic, except it calls targetted macros
 #define REDUCE_FULL_S4w( x )  EXTRA_REDUCE_S4w( REDUCE4w (x ) )
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -63,7 +63,7 @@ int scanhash_skein_8way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
       n += 8;
    } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

@@ -151,7 +151,7 @@ int scanhash_skein_4way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
       n += 4;
    } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -285,7 +285,7 @@ static const uint64_t IV512[] = {
 #define SKBI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
 #define SKBT(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))

-#define READ_STATE_BIG(sc)   do { \
+#define READ_STATE_BIG(sc) \
      h0 = (sc)->h0; \
      h1 = (sc)->h1; \
      h2 = (sc)->h2; \
@@ -294,10 +294,9 @@ static const uint64_t IV512[] = {
      h5 = (sc)->h5; \
      h6 = (sc)->h6; \
      h7 = (sc)->h7; \
-      bcount = sc->bcount; \
-   } while (0)
+      bcount = sc->bcount;

-#define WRITE_STATE_BIG(sc)   do { \
+#define WRITE_STATE_BIG(sc) \
      (sc)->h0 = h0; \
      (sc)->h1 = h1; \
      (sc)->h2 = h2; \
@@ -306,62 +305,54 @@ static const uint64_t IV512[] = {
      (sc)->h5 = h5; \
      (sc)->h6 = h6; \
      (sc)->h7 = h7; \
-      sc->bcount = bcount; \
-   } while (0)
+      sc->bcount = bcount;
   

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #define TFBIG_KINIT_8WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
-do { \
-  k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), mm512_xor3( k3, k4, k5 ), \
-                   mm512_xor3( k6, k7, m512_const1_64( 0x1BD11BDAA9FC1A22) ));\
-  t2 = t0 ^ t1; \
-} while (0)
+  k8 = mm512_xor3( mm512_xor3( k0, k1, k2 ), \
+                   mm512_xor3( k3, k4, k5 ), \
+                   mm512_xor3( k6, k7, \
+                              _mm512_set1_epi64( 0x1BD11BDAA9FC1A22) ) ); \
+  t2 = t0 ^ t1;

 #define TFBIG_ADDKEY_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
-do { \
  w0 = _mm512_add_epi64( w0, SKBI(k,s,0) ); \
  w1 = _mm512_add_epi64( w1, SKBI(k,s,1) ); \
  w2 = _mm512_add_epi64( w2, SKBI(k,s,2) ); \
  w3 = _mm512_add_epi64( w3, SKBI(k,s,3) ); \
  w4 = _mm512_add_epi64( w4, SKBI(k,s,4) ); \
  w5 = _mm512_add_epi64( w5, _mm512_add_epi64( SKBI(k,s,5), \
-                                         m512_const1_64( SKBT(t,s,0) ) ) ); \
+                                       _mm512_set1_epi64( SKBT(t,s,0) ) ) ); \
  w6 = _mm512_add_epi64( w6, _mm512_add_epi64( SKBI(k,s,6), \
-                                         m512_const1_64( SKBT(t,s,1) ) ) ); \
+                                       _mm512_set1_epi64( SKBT(t,s,1) ) ) ); \
  w7 = _mm512_add_epi64( w7, _mm512_add_epi64( SKBI(k,s,7), \
-                                         m512_const1_64( s ) ) ); \
-} while (0)
+                                        _mm512_set1_epi64( s ) ) );

 #define TFBIG_MIX_8WAY(x0, x1, rc) \
-do { \
     x0 = _mm512_add_epi64( x0, x1 ); \
-     x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 ); \
-} while (0)
+     x1 = _mm512_xor_si512( mm512_rol_64( x1, rc ), x0 );

-#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { \
+#define TFBIG_MIX8_8WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) \
      TFBIG_MIX_8WAY(w0, w1, rc0); \
      TFBIG_MIX_8WAY(w2, w3, rc1); \
      TFBIG_MIX_8WAY(w4, w5, rc2); \
-      TFBIG_MIX_8WAY(w6, w7, rc3); \
-   } while (0)
+      TFBIG_MIX_8WAY(w6, w7, rc3);

-#define TFBIG_8WAY_4e(s)   do { \
+#define TFBIG_8WAY_4e(s) \
      TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
      TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
      TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
      TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
-      TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
-   } while (0)
+      TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56);

-#define TFBIG_8WAY_4o(s)   do { \
+#define TFBIG_8WAY_4o(s) \
      TFBIG_ADDKEY_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
      TFBIG_MIX8_8WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
      TFBIG_MIX8_8WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
      TFBIG_MIX8_8WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
-      TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
-   } while (0)
+      TFBIG_MIX8_8WAY(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22);

 #define UBI_BIG_8WAY(etype, extra) \
 do { \
@@ -424,59 +415,48 @@ do { \
 #endif // AVX512

 #define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
-do { \
-  k8 = _mm256_xor_si256( _mm256_xor_si256( \
-                            _mm256_xor_si256( _mm256_xor_si256( k0, k1 ), \
-                                              _mm256_xor_si256( k2, k3 ) ), \
-                            _mm256_xor_si256( _mm256_xor_si256( k4, k5 ), \
-                                              _mm256_xor_si256( k6, k7 ) ) ), \
-                         m256_const1_64( 0x1BD11BDAA9FC1A22) ); \
-  t2 = t0 ^ t1; \
-} while (0)
+  k8 = mm256_xor3( mm256_xor3( k0, k1, k2 ), \
+                   mm256_xor3( k3, k4, k5 ), \
+                   mm256_xor3( k6, k7, \
+                               _mm256_set1_epi64x( 0x1BD11BDAA9FC1A22) ) ); \
+  t2 = t0 ^ t1;

 #define TFBIG_ADDKEY_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
-do { \
  w0 = _mm256_add_epi64( w0, SKBI(k,s,0) ); \
  w1 = _mm256_add_epi64( w1, SKBI(k,s,1) ); \
  w2 = _mm256_add_epi64( w2, SKBI(k,s,2) ); \
  w3 = _mm256_add_epi64( w3, SKBI(k,s,3) ); \
  w4 = _mm256_add_epi64( w4, SKBI(k,s,4) ); \
  w5 = _mm256_add_epi64( w5, _mm256_add_epi64( SKBI(k,s,5), \
-                                         m256_const1_64( SKBT(t,s,0) ) ) ); \
+                                       _mm256_set1_epi64x( SKBT(t,s,0) ) ) ); \
  w6 = _mm256_add_epi64( w6, _mm256_add_epi64( SKBI(k,s,6), \
-                                         m256_const1_64( SKBT(t,s,1) ) ) ); \
+                                       _mm256_set1_epi64x( SKBT(t,s,1) ) ) ); \
  w7 = _mm256_add_epi64( w7, _mm256_add_epi64( SKBI(k,s,7), \
-                                         m256_const1_64( s ) ) ); \
-} while (0)
+                                       _mm256_set1_epi64x( s ) ) );

 #define TFBIG_MIX_4WAY(x0, x1, rc) \
-do { \
     x0 = _mm256_add_epi64( x0, x1 ); \
-     x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 ); \
-} while (0)
+     x1 = _mm256_xor_si256( mm256_rol_64( x1, rc ), x0 );

-#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3)  do { \
+#define TFBIG_MIX8_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) \
      TFBIG_MIX_4WAY(w0, w1, rc0); \
      TFBIG_MIX_4WAY(w2, w3, rc1); \
      TFBIG_MIX_4WAY(w4, w5, rc2); \
-      TFBIG_MIX_4WAY(w6, w7, rc3); \
-   } while (0)
+      TFBIG_MIX_4WAY(w6, w7, rc3);

-#define TFBIG_4WAY_4e(s)   do { \
+#define TFBIG_4WAY_4e(s) \
      TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
      TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
      TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
      TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
-      TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
-   } while (0)
+      TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56);

-#define TFBIG_4WAY_4o(s)   do { \
+#define TFBIG_4WAY_4o(s) \
      TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
      TFBIG_MIX8_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
      TFBIG_MIX8_4WAY(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
      TFBIG_MIX8_4WAY(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
-      TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
-   } while (0)
+      TFBIG_MIX8_4WAY(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22);

 // scale buf offset by 4
 #define UBI_BIG_4WAY(etype, extra) \
@@ -541,28 +521,28 @@ do { \

 void skein256_8way_init( skein256_8way_context *sc )
 {
-        sc->h0 = m512_const1_64( 0xCCD044A12FDB3E13 );
-        sc->h1 = m512_const1_64( 0xE83590301A79A9EB );
-        sc->h2 = m512_const1_64( 0x55AEA0614F816E6F );
-        sc->h3 = m512_const1_64( 0x2A2767A4AE9B94DB );
-        sc->h4 = m512_const1_64( 0xEC06025E74DD7683 );
-        sc->h5 = m512_const1_64( 0xE7A436CDC4746251 );
-        sc->h6 = m512_const1_64( 0xC36FBAF9393AD185 );
-        sc->h7 = m512_const1_64( 0x3EEDBA1833EDFC13 );
+        sc->h0 = _mm512_set1_epi64( 0xCCD044A12FDB3E13 );
+        sc->h1 = _mm512_set1_epi64( 0xE83590301A79A9EB );
+        sc->h2 = _mm512_set1_epi64( 0x55AEA0614F816E6F );
+        sc->h3 = _mm512_set1_epi64( 0x2A2767A4AE9B94DB );
+        sc->h4 = _mm512_set1_epi64( 0xEC06025E74DD7683 );
+        sc->h5 = _mm512_set1_epi64( 0xE7A436CDC4746251 );
+        sc->h6 = _mm512_set1_epi64( 0xC36FBAF9393AD185 );
+        sc->h7 = _mm512_set1_epi64( 0x3EEDBA1833EDFC13 );
        sc->bcount = 0;
        sc->ptr = 0;
 }

 void skein512_8way_init( skein512_8way_context *sc )
 {
-        sc->h0 = m512_const1_64( 0x4903ADFF749C51CE );
-        sc->h1 = m512_const1_64( 0x0D95DE399746DF03 );
-        sc->h2 = m512_const1_64( 0x8FD1934127C79BCE );
-        sc->h3 = m512_const1_64( 0x9A255629FF352CB1 );
-        sc->h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
-        sc->h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
-        sc->h6 = m512_const1_64( 0x991112C71A75B523 );
-        sc->h7 = m512_const1_64( 0xAE18A40B660FCC33 );
+        sc->h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
+        sc->h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
+        sc->h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
+        sc->h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
+        sc->h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
+        sc->h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
+        sc->h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
+        sc->h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );
        sc->bcount = 0;
        sc->ptr = 0;
 }
@@ -660,14 +640,14 @@ void skein512_8way_full( skein512_8way_context *sc, void *out, const void *data,

 // Init

-        h0 = m512_const1_64( 0x4903ADFF749C51CE );
-        h1 = m512_const1_64( 0x0D95DE399746DF03 );
-        h2 = m512_const1_64( 0x8FD1934127C79BCE );
-        h3 = m512_const1_64( 0x9A255629FF352CB1 );
-        h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
-        h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
-        h6 = m512_const1_64( 0x991112C71A75B523 );
-        h7 = m512_const1_64( 0xAE18A40B660FCC33 );
+        h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
+        h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
+        h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
+        h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
+        h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
+        h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
+        h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
+        h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );

 // Update

@@ -734,14 +714,14 @@ skein512_8way_prehash64( skein512_8way_context *sc, const void *data )
   buf[5] = vdata[5];
   buf[6] = vdata[6];
   buf[7] = vdata[7];
-   register __m512i h0 = m512_const1_64( 0x4903ADFF749C51CE );
-   register __m512i h1 = m512_const1_64( 0x0D95DE399746DF03 );
-   register __m512i h2 = m512_const1_64( 0x8FD1934127C79BCE );
-   register __m512i h3 = m512_const1_64( 0x9A255629FF352CB1 );
-   register __m512i h4 = m512_const1_64( 0x5DB62599DF6CA7B0 );
-   register __m512i h5 = m512_const1_64( 0xEABE394CA9D5C3F4 );
-   register __m512i h6 = m512_const1_64( 0x991112C71A75B523 );
-   register __m512i h7 = m512_const1_64( 0xAE18A40B660FCC33 );
+   register __m512i h0 = _mm512_set1_epi64( 0x4903ADFF749C51CE );
+   register __m512i h1 = _mm512_set1_epi64( 0x0D95DE399746DF03 );
+   register __m512i h2 = _mm512_set1_epi64( 0x8FD1934127C79BCE );
+   register __m512i h3 = _mm512_set1_epi64( 0x9A255629FF352CB1 );
+   register __m512i h4 = _mm512_set1_epi64( 0x5DB62599DF6CA7B0 );
+   register __m512i h5 = _mm512_set1_epi64( 0xEABE394CA9D5C3F4 );
+   register __m512i h6 = _mm512_set1_epi64( 0x991112C71A75B523 );
+   register __m512i h7 = _mm512_set1_epi64( 0xAE18A40B660FCC33 );
   uint64_t bcount = 1;

   UBI_BIG_8WAY( 224, 0 );
@@ -830,28 +810,28 @@ skein512_8way_close(void *cc, void *dst)

 void skein256_4way_init( skein256_4way_context *sc )
 {
-        sc->h0 = m256_const1_64( 0xCCD044A12FDB3E13 );
-        sc->h1 = m256_const1_64( 0xE83590301A79A9EB );
-        sc->h2 = m256_const1_64( 0x55AEA0614F816E6F );
-        sc->h3 = m256_const1_64( 0x2A2767A4AE9B94DB );
-        sc->h4 = m256_const1_64( 0xEC06025E74DD7683 );
-        sc->h5 = m256_const1_64( 0xE7A436CDC4746251 );
-        sc->h6 = m256_const1_64( 0xC36FBAF9393AD185 );
-        sc->h7 = m256_const1_64( 0x3EEDBA1833EDFC13 );
+        sc->h0 = _mm256_set1_epi64x( 0xCCD044A12FDB3E13 );
+        sc->h1 = _mm256_set1_epi64x( 0xE83590301A79A9EB );
+        sc->h2 = _mm256_set1_epi64x( 0x55AEA0614F816E6F );
+        sc->h3 = _mm256_set1_epi64x( 0x2A2767A4AE9B94DB );
+        sc->h4 = _mm256_set1_epi64x( 0xEC06025E74DD7683 );
+        sc->h5 = _mm256_set1_epi64x( 0xE7A436CDC4746251 );
+        sc->h6 = _mm256_set1_epi64x( 0xC36FBAF9393AD185 );
+        sc->h7 = _mm256_set1_epi64x( 0x3EEDBA1833EDFC13 );
        sc->bcount = 0;
        sc->ptr = 0;
 }

 void skein512_4way_init( skein512_4way_context *sc )
 {
-        sc->h0 = m256_const1_64( 0x4903ADFF749C51CE );
-        sc->h1 = m256_const1_64( 0x0D95DE399746DF03 );
-        sc->h2 = m256_const1_64( 0x8FD1934127C79BCE );
-        sc->h3 = m256_const1_64( 0x9A255629FF352CB1 );
-        sc->h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
-        sc->h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
-        sc->h6 = m256_const1_64( 0x991112C71A75B523 );
-        sc->h7 = m256_const1_64( 0xAE18A40B660FCC33 );
+        sc->h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
+        sc->h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
+        sc->h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
+        sc->h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
+        sc->h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
+        sc->h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
+        sc->h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
+        sc->h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );
        sc->bcount = 0;
        sc->ptr = 0;
 }
@@ -954,14 +934,14 @@ skein512_4way_full( skein512_4way_context *sc, void *out, const void *data,
   const int buf_size = 64;   // 64 * __m256i
   uint64_t bcount = 0;

-   h0 = m256_const1_64( 0x4903ADFF749C51CE );
-   h1 = m256_const1_64( 0x0D95DE399746DF03 );
-   h2 = m256_const1_64( 0x8FD1934127C79BCE );
-   h3 = m256_const1_64( 0x9A255629FF352CB1 );
-   h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
-   h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
-   h6 = m256_const1_64( 0x991112C71A75B523 );
-   h7 = m256_const1_64( 0xAE18A40B660FCC33 );
+   h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
+   h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
+   h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
+   h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
+   h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
+   h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
+   h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
+   h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );

 // Update     

@@ -1028,14 +1008,14 @@ skein512_4way_prehash64( skein512_4way_context *sc, const void *data )
   buf[5] = vdata[5];
   buf[6] = vdata[6];
   buf[7] = vdata[7];
-   register __m256i h0 = m256_const1_64( 0x4903ADFF749C51CE );
-   register __m256i h1 = m256_const1_64( 0x0D95DE399746DF03 );
-   register __m256i h2 = m256_const1_64( 0x8FD1934127C79BCE );
-   register __m256i h3 = m256_const1_64( 0x9A255629FF352CB1 );
-   register __m256i h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
-   register __m256i h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
-   register __m256i h6 = m256_const1_64( 0x991112C71A75B523 );
-   register __m256i h7 = m256_const1_64( 0xAE18A40B660FCC33 );
+   register __m256i h0 = _mm256_set1_epi64x( 0x4903ADFF749C51CE );
+   register __m256i h1 = _mm256_set1_epi64x( 0x0D95DE399746DF03 );
+   register __m256i h2 = _mm256_set1_epi64x( 0x8FD1934127C79BCE );
+   register __m256i h3 = _mm256_set1_epi64x( 0x9A255629FF352CB1 );
+   register __m256i h4 = _mm256_set1_epi64x( 0x5DB62599DF6CA7B0 );
+   register __m256i h5 = _mm256_set1_epi64x( 0xEABE394CA9D5C3F4 );
+   register __m256i h6 = _mm256_set1_epi64x( 0x991112C71A75B523 );
+   register __m256i h7 = _mm256_set1_epi64x( 0xAE18A40B660FCC33 );
   uint64_t bcount = 1;

   UBI_BIG_4WAY( 224, 0 );
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -57,7 +57,7 @@ int scanhash_skein2_8way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
       n += 8;
    } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

@@ -119,7 +119,7 @@ int scanhash_skein2_4way( struct work *work, uint32_t max_nonce,
          }
       }
       *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
       n += 4;
    } while ( (n < last_nonce) && !work_restart[thr_id].restart );

--- a/algo/swifftx/swifftx.c
+++ b/algo/swifftx/swifftx.c
@@ -630,36 +630,35 @@ void InitializeSWIFFTX()
 }

 // In the original code the F matrix is rotated so it was not aranged
-// the same as all the other data. Rearanging F to match all the other
-// data made vectorizing possible, the compiler probably could have been
-// able to auto-vectorize with proper data organisation.
-// Also in the original code the custom 16 bit data types are all now 32
-// bit int32_t regardless of the type name.
-//
-void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
+// the same as the other data. Rearranging F made vectorizing up to 256 bits
+// possible.
+// Also the custom 16 bit data types of the original code are all now aliased
+// to 32 bit int32_t.
+
+void FFT( const unsigned char input[EIGHTH_N], swift_int32_t *output )
 {
 #if defined(__AVX2__)

-   __m256i F[8] __attribute__ ((aligned (64)));
+   __m256i F0, F1, F2, F3, F4, F5, F6, F7;
+   __m256i tbl = *(__m256i*)&( fftTable[ input[0] << 3 ] );
   __m256i *mul = (__m256i*)multipliers;
   __m256i *out = (__m256i*)output;
-   __m256i *tbl = (__m256i*)&( fftTable[ input[0] << 3 ] );

-   F[0] = _mm256_mullo_epi32( mul[0], *tbl );
-   tbl = (__m256i*)&( fftTable[ input[1] << 3 ] );
-   F[1] = _mm256_mullo_epi32( mul[1], *tbl );
-   tbl = (__m256i*)&( fftTable[ input[2] << 3 ] );
-   F[2] = _mm256_mullo_epi32( mul[2], *tbl );
-   tbl = (__m256i*)&( fftTable[ input[3] << 3 ] );
-   F[3] = _mm256_mullo_epi32( mul[3], *tbl );
-   tbl = (__m256i*)&( fftTable[ input[4] << 3 ] );
-   F[4] = _mm256_mullo_epi32( mul[4], *tbl );
-   tbl = (__m256i*)&( fftTable[ input[5] << 3 ] );
-   F[5] = _mm256_mullo_epi32( mul[5], *tbl );
-   tbl = (__m256i*)&( fftTable[ input[6] << 3 ] );
-   F[6] = _mm256_mullo_epi32( mul[6], *tbl );
-   tbl = (__m256i*)&( fftTable[ input[7] << 3 ] );
-   F[7] = _mm256_mullo_epi32( mul[7], *tbl );
+   F0 = _mm256_mullo_epi32( mul[0], tbl );
+   tbl = *(__m256i*)&( fftTable[ input[1] << 3 ] );
+   F1 = _mm256_mullo_epi32( mul[1], tbl );
+   tbl = *(__m256i*)&( fftTable[ input[2] << 3 ] );
+   F2 = _mm256_mullo_epi32( mul[2], tbl );
+   tbl = *(__m256i*)&( fftTable[ input[3] << 3 ] );
+   F3 = _mm256_mullo_epi32( mul[3], tbl );
+   tbl = *(__m256i*)&( fftTable[ input[4] << 3 ] );
+   F4 = _mm256_mullo_epi32( mul[4], tbl );
+   tbl = *(__m256i*)&( fftTable[ input[5] << 3 ] );
+   F5 = _mm256_mullo_epi32( mul[5], tbl );
+   tbl = *(__m256i*)&( fftTable[ input[6] << 3 ] );
+   F6 = _mm256_mullo_epi32( mul[6], tbl );
+   tbl = *(__m256i*)&( fftTable[ input[7] << 3 ] );
+   F7 = _mm256_mullo_epi32( mul[7], tbl );

   #define ADD_SUB( a, b ) \
   { \
@@ -668,52 +667,50 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
      a = _mm256_add_epi32( a, tmp ); \
   }
   
-   ADD_SUB( F[0], F[1] );
-   ADD_SUB( F[2], F[3] );
-   ADD_SUB( F[4], F[5] );
-   ADD_SUB( F[6], F[7] );
-
-   F[3] = _mm256_slli_epi32( F[3], 4 );
-   F[7] = _mm256_slli_epi32( F[7], 4 );
-
-   ADD_SUB( F[0], F[2] );
-   ADD_SUB( F[1], F[3] );
-   ADD_SUB( F[4], F[6] );
-   ADD_SUB( F[5], F[7] );  
-
-   F[5] = _mm256_slli_epi32( F[5], 2 );
-   F[6] = _mm256_slli_epi32( F[6], 4 );
-   F[7] = _mm256_slli_epi32( F[7], 6 );
-
-   ADD_SUB( F[0], F[4] );
-   ADD_SUB( F[1], F[5] );
-   ADD_SUB( F[2], F[6] );
-   ADD_SUB( F[3], F[7] );
+   ADD_SUB( F0, F1 );
+   ADD_SUB( F2, F3 );
+   ADD_SUB( F4, F5 );
+   ADD_SUB( F6, F7 );
+   F3 = _mm256_slli_epi32( F3, 4 );
+   F7 = _mm256_slli_epi32( F7, 4 );
+   ADD_SUB( F0, F2 );
+   ADD_SUB( F1, F3 );
+   ADD_SUB( F4, F6 );
+   ADD_SUB( F5, F7 );  
+   F5 = _mm256_slli_epi32( F5, 2 );
+   F6 = _mm256_slli_epi32( F6, 4 );
+   F7 = _mm256_slli_epi32( F7, 6 );
+   ADD_SUB( F0, F4 );
+   ADD_SUB( F1, F5 );
+   ADD_SUB( F2, F6 );
+   ADD_SUB( F3, F7 );

   #undef ADD_SUB

 #if defined (__AVX512VL__) && defined(__AVX512BW__)   

-   const __m256i mask = _mm256_movm_epi8( 0x11111111 );
-
+   #define Q_REDUCE( a ) \
+       _mm256_sub_epi32( _mm256_maskz_mov_epi8( 0x11111111, a ), \
+                         _mm256_srai_epi32( a, 8 ) )
+         
 #else

-   const __m256i mask = m256_const1_32( 0x000000ff );
-
-#endif
+   const __m256i mask = _mm256_set1_epi32( 0x000000ff );

   #define Q_REDUCE( a ) \
       _mm256_sub_epi32( _mm256_and_si256( a, mask ), \
                         _mm256_srai_epi32( a, 8 ) )
+   
+#endif

-   out[0] = Q_REDUCE( F[0] );  
-   out[1] = Q_REDUCE( F[1] );                        
-   out[2] = Q_REDUCE( F[2] );                        
-   out[3] = Q_REDUCE( F[3] );                        
-   out[4] = Q_REDUCE( F[4] );                        
-   out[5] = Q_REDUCE( F[5] );                        
-   out[6] = Q_REDUCE( F[6] );                        
-   out[7] = Q_REDUCE( F[7] );
+   out[0] = Q_REDUCE( F0 );  
+   out[1] = Q_REDUCE( F1 );                        
+   out[2] = Q_REDUCE( F2 );                        
+   out[3] = Q_REDUCE( F3 );                        
+   out[4] = Q_REDUCE( F4 );                        
+   out[5] = Q_REDUCE( F5 );                        
+   out[6] = Q_REDUCE( F6 );                        
+   out[7] = Q_REDUCE( F7 );

   #undef Q_REDUCE

@@ -763,12 +760,10 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
   ADD_SUB( F[ 9], F[11] );
   ADD_SUB( F[12], F[14] );
   ADD_SUB( F[13], F[15] );
-
   F[ 6] = _mm_slli_epi32( F[ 6], 4 );
   F[ 7] = _mm_slli_epi32( F[ 7], 4 );
   F[14] = _mm_slli_epi32( F[14], 4 );
   F[15] = _mm_slli_epi32( F[15], 4 );
-
   ADD_SUB( F[ 0], F[ 4] );
   ADD_SUB( F[ 1], F[ 5] );
   ADD_SUB( F[ 2], F[ 6] );
@@ -777,14 +772,12 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
   ADD_SUB( F[ 9], F[13] );
   ADD_SUB( F[10], F[14] );
   ADD_SUB( F[11], F[15] );
-
   F[10] = _mm_slli_epi32( F[10], 2 );
   F[11] = _mm_slli_epi32( F[11], 2 );
   F[12] = _mm_slli_epi32( F[12], 4 );
   F[13] = _mm_slli_epi32( F[13], 4 );
   F[14] = _mm_slli_epi32( F[14], 6 );
   F[15] = _mm_slli_epi32( F[15], 6 );
-   
   ADD_SUB( F[ 0], F[ 8] );
   ADD_SUB( F[ 1], F[ 9] );
   ADD_SUB( F[ 2], F[10] );
@@ -796,7 +789,7 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)

   #undef ADD_SUB

-   const __m128i mask = m128_const1_32( 0x000000ff );
+   const __m128i mask = _mm_set1_epi32( 0x000000ff );

   #define Q_REDUCE( a ) \
      _mm_sub_epi32( _mm_and_si128( a, mask ), _mm_srai_epi32( a, 8 ) ) 
@@ -820,16 +813,13 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)

   #undef Q_REDUCE

-#else   // < SSE4.1
+#else   // neither AVX2 nor SSE4.1: scalar fallback
   
   swift_int16_t *mult = multipliers;
-
-   // First loop unrolling:
-	register swift_int16_t *table = &(fftTable[input[0] << 3]);
-
-/*
+	swift_int16_t *table = &( fftTable[ input[0] << 3 ] );
   swift_int32_t F[64];

+   /*
   for (int i = 0; i < 8; i++)
   {
      int j = i<<3;
@@ -845,99 +835,91 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
   }
 */

-   register swift_int32_t F0, F1, F2, F3, F4, F5, F6, F7, F8, F9,
-                F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
-                F20, F21, F22, F23, F24, F25, F26, F27, F28, F29,
-                F30, F31, F32, F33, F34, F35, F36, F37, F38, F39,
-                F40, F41, F42, F43, F44, F45, F46, F47, F48, F49,
-                F50, F51, F52, F53, F54, F55, F56, F57, F58, F59,
-                F60, F61, F62, F63;
-   
-	F0  = mult[0] * table[0];
-	F8  = mult[1] * table[1];
-	F16 = mult[2] * table[2];
-	F24 = mult[3] * table[3];
-	F32 = mult[4] * table[4];
-	F40 = mult[5] * table[5];
-	F48 = mult[6] * table[6];
-	F56 = mult[7] * table[7];
+	F[ 0] = mult[ 0] * table[0];
+	F[ 8] = mult[ 1] * table[1];
+	F[16] = mult[ 2] * table[2];
+	F[24] = mult[ 3] * table[3];
+	F[32] = mult[ 4] * table[4];
+	F[40] = mult[ 5] * table[5];
+	F[48] = mult[ 6] * table[6];
+	F[56] = mult[ 7] * table[7];

 	table = &(fftTable[input[1] << 3]);

-	F1  = mult[ 8] * table[0];
-	F9  = mult[ 9] * table[1];
-	F17 = mult[10] * table[2];
-	F25 = mult[11] * table[3];
-	F33 = mult[12] * table[4];
-	F41 = mult[13] * table[5];
-	F49 = mult[14] * table[6];
-	F57 = mult[15] * table[7];
+	F[ 1] = mult[ 8] * table[0];
+	F[ 9] = mult[ 9] * table[1];
+	F[17] = mult[10] * table[2];
+	F[25] = mult[11] * table[3];
+	F[33] = mult[12] * table[4];
+	F[41] = mult[13] * table[5];
+	F[49] = mult[14] * table[6];
+	F[57] = mult[15] * table[7];

 	table = &(fftTable[input[2] << 3]);

-	F2  = mult[16] * table[0];
-	F10 = mult[17] * table[1];
-	F18 = mult[18] * table[2];
-	F26 = mult[19] * table[3];
-	F34 = mult[20] * table[4];
-	F42 = mult[21] * table[5];
-	F50 = mult[22] * table[6];
-	F58 = mult[23] * table[7];
+	F[ 2] = mult[16] * table[0];
+	F[10] = mult[17] * table[1];
+	F[18] = mult[18] * table[2];
+	F[26] = mult[19] * table[3];
+	F[34] = mult[20] * table[4];
+	F[42] = mult[21] * table[5];
+	F[50] = mult[22] * table[6];
+	F[58] = mult[23] * table[7];

 	table = &(fftTable[input[3] << 3]);

-	F3  = mult[24] * table[0];
-	F11 = mult[25] * table[1];
-	F19 = mult[26] * table[2];
-	F27 = mult[27] * table[3];
-	F35 = mult[28] * table[4];
-	F43 = mult[29] * table[5];
-	F51 = mult[30] * table[6];
-	F59 = mult[31] * table[7];
+	F[ 3] = mult[24] * table[0];
+	F[11] = mult[25] * table[1];
+	F[19] = mult[26] * table[2];
+	F[27] = mult[27] * table[3];
+	F[35] = mult[28] * table[4];
+	F[43] = mult[29] * table[5];
+	F[51] = mult[30] * table[6];
+	F[59] = mult[31] * table[7];

 	table = &(fftTable[input[4] << 3]);

-	F4  = mult[32] * table[0];
-	F12 = mult[33] * table[1];
-	F20 = mult[34] * table[2];
-	F28 = mult[35] * table[3];
-	F36 = mult[36] * table[4];
-	F44 = mult[37] * table[5];
-	F52 = mult[38] * table[6];
-	F60 = mult[39] * table[7];
+	F[ 4] = mult[32] * table[0];
+	F[12] = mult[33] * table[1];
+	F[20] = mult[34] * table[2];
+	F[28] = mult[35] * table[3];
+	F[36] = mult[36] * table[4];
+	F[44] = mult[37] * table[5];
+	F[52] = mult[38] * table[6];
+	F[60] = mult[39] * table[7];

 	table = &(fftTable[input[5] << 3]);

-	F5  = mult[40] * table[0];
-	F13 = mult[41] * table[1];
-	F21 = mult[42] * table[2];
-	F29 = mult[43] * table[3];
-	F37 = mult[44] * table[4];
-	F45 = mult[45] * table[5];
-	F53 = mult[46] * table[6];
-	F61 = mult[47] * table[7];
+	F[ 5] = mult[40] * table[0];
+	F[13] = mult[41] * table[1];
+	F[21] = mult[42] * table[2];
+	F[29] = mult[43] * table[3];
+	F[37] = mult[44] * table[4];
+	F[45] = mult[45] * table[5];
+	F[53] = mult[46] * table[6];
+	F[61] = mult[47] * table[7];

 	table = &(fftTable[input[6] << 3]);

-	F6  = mult[48] * table[0];
-	F14 = mult[49] * table[1];
-	F22 = mult[50] * table[2];
-	F30 = mult[51] * table[3];
-	F38 = mult[52] * table[4];
-	F46 = mult[53] * table[5];
-	F54 = mult[54] * table[6];
-	F62 = mult[55] * table[7];
+	F[ 6] = mult[48] * table[0];
+	F[14] = mult[49] * table[1];
+	F[22] = mult[50] * table[2];
+	F[30] = mult[51] * table[3];
+	F[38] = mult[52] * table[4];
+	F[46] = mult[53] * table[5];
+	F[54] = mult[54] * table[6];
+	F[62] = mult[55] * table[7];

 	table = &(fftTable[input[7] << 3]);

-	F7  = mult[56] * table[0];
-	F15 = mult[57] * table[1];
-	F23 = mult[58] * table[2];
-	F31 = mult[59] * table[3];
-	F39 = mult[60] * table[4];
-	F47 = mult[61] * table[5];
-	F55 = mult[62] * table[6];
-	F63 = mult[63] * table[7];
+	F[ 7] = mult[56] * table[0];
+	F[15] = mult[57] * table[1];
+	F[23] = mult[58] * table[2];
+	F[31] = mult[59] * table[3];
+	F[39] = mult[60] * table[4];
+	F[47] = mult[61] * table[5];
+	F[55] = mult[62] * table[6];
+	F[63] = mult[63] * table[7];

   #define ADD_SUB( a, b ) \
   { \
@@ -987,262 +969,229 @@ void FFT(const unsigned char input[EIGHTH_N], swift_int32_t *output)
   }
 */

-	// Second loop unrolling:
 	// Iteration 0:
-	ADD_SUB(F0, F1);
-	ADD_SUB(F2, F3);
-	ADD_SUB(F4, F5);
-	ADD_SUB(F6, F7);
+	ADD_SUB( F[ 0], F[ 1] );
+	ADD_SUB( F[ 2], F[ 3] );
+	ADD_SUB( F[ 4], F[ 5] );
+	ADD_SUB( F[ 6], F[ 7] );
+	F[ 3] <<= 4;
+	F[ 7] <<= 4;
+	ADD_SUB( F[ 0], F[ 2] );
+	ADD_SUB( F[ 1], F[ 3] );
+	ADD_SUB( F[ 4], F[ 6] );
+	ADD_SUB( F[ 5], F[ 7] );
+	F[ 5] <<= 2;
+	F[ 6] <<= 4;
+	F[ 7] <<= 6;
+	ADD_SUB( F[ 0], F[ 4] );
+	ADD_SUB( F[ 1], F[ 5] );
+	ADD_SUB( F[ 2], F[ 6] );
+	ADD_SUB( F[ 3], F[ 7] );

-	F3 <<= 4;
-	F7 <<= 4;
-
-	ADD_SUB(F0, F2);
-	ADD_SUB(F1, F3);
-	ADD_SUB(F4, F6);
-	ADD_SUB(F5, F7);
-
-	F5 <<= 2;
-	F6 <<= 4;
-	F7 <<= 6;
-
-	ADD_SUB(F0, F4);
-	ADD_SUB(F1, F5);
-	ADD_SUB(F2, F6);
-	ADD_SUB(F3, F7);
-
-	output[0] = Q_REDUCE(F0);
-	output[8] = Q_REDUCE(F1);
-	output[16] = Q_REDUCE(F2);
-	output[24] = Q_REDUCE(F3);
-	output[32] = Q_REDUCE(F4);
-	output[40] = Q_REDUCE(F5);
-	output[48] = Q_REDUCE(F6);
-	output[56] = Q_REDUCE(F7);
+   output[ 0] = Q_REDUCE( F[ 0] );
+	output[ 8] = Q_REDUCE( F[ 1] );
+	output[16] = Q_REDUCE( F[ 2] );
+	output[24] = Q_REDUCE( F[ 3] );
+	output[32] = Q_REDUCE( F[ 4] );
+	output[40] = Q_REDUCE( F[ 5] );
+	output[48] = Q_REDUCE( F[ 6] );
+	output[56] = Q_REDUCE( F[ 7] );

 	// Iteration 1:
-	ADD_SUB(F8, F9);
-	ADD_SUB(F10, F11);
-	ADD_SUB(F12, F13);
-	ADD_SUB(F14, F15);
+	ADD_SUB( F[ 8], F[ 9] );
+	ADD_SUB( F[10], F[11] );
+	ADD_SUB( F[12], F[13] );
+	ADD_SUB( F[14], F[15] );
+	F[11] <<= 4;
+	F[15] <<= 4;
+	ADD_SUB( F[ 8], F[10] );
+	ADD_SUB( F[ 9], F[11] );
+	ADD_SUB( F[12], F[14] );
+	ADD_SUB( F[13], F[15] );
+	F[13] <<= 2;
+	F[14] <<= 4;
+	F[15] <<= 6;
+	ADD_SUB( F[ 8], F[12] );
+	ADD_SUB( F[ 9], F[13] );
+	ADD_SUB( F[10], F[14] );
+	ADD_SUB( F[11], F[15] );

-	F11 <<= 4;
-	F15 <<= 4;
-
-	ADD_SUB(F8, F10);
-	ADD_SUB(F9, F11);
-	ADD_SUB(F12, F14);
-	ADD_SUB(F13, F15);
-
-	F13 <<= 2;
-	F14 <<= 4;
-	F15 <<= 6;
-
-	ADD_SUB(F8, F12);
-	ADD_SUB(F9, F13);
-	ADD_SUB(F10, F14);
-	ADD_SUB(F11, F15);
-
-	output[1] = Q_REDUCE(F8);
-	output[9] = Q_REDUCE(F9);
-	output[17] = Q_REDUCE(F10);
-	output[25] = Q_REDUCE(F11);
-	output[33] = Q_REDUCE(F12);
-	output[41] = Q_REDUCE(F13);
-	output[49] = Q_REDUCE(F14);
-	output[57] = Q_REDUCE(F15);
+	output[ 1] = Q_REDUCE( F[ 8] );
+	output[ 9] = Q_REDUCE( F[ 9] );
+	output[17] = Q_REDUCE( F[10] );
+	output[25] = Q_REDUCE( F[11] );
+	output[33] = Q_REDUCE( F[12] );
+	output[41] = Q_REDUCE( F[13] );
+	output[49] = Q_REDUCE( F[14] );
+	output[57] = Q_REDUCE( F[15] );

 	// Iteration 2:
-	ADD_SUB(F16, F17);
-	ADD_SUB(F18, F19);
-	ADD_SUB(F20, F21);
-	ADD_SUB(F22, F23);
+	ADD_SUB( F[16], F[17] );
+	ADD_SUB( F[18], F[19] );
+	ADD_SUB( F[20], F[21] );
+	ADD_SUB( F[22], F[23] );
+	F[19] <<= 4;
+	F[23] <<= 4;
+	ADD_SUB( F[16], F[18]);
+	ADD_SUB( F[17], F[19]);
+	ADD_SUB( F[20], F[22]);
+	ADD_SUB( F[21], F[23]);
+	F[21] <<= 2;
+	F[22] <<= 4;
+	F[23] <<= 6;
+	ADD_SUB( F[16], F[20] );
+	ADD_SUB( F[17], F[21] );
+	ADD_SUB( F[18], F[22] );
+	ADD_SUB( F[19], F[23] );

-	F19 <<= 4;
-	F23 <<= 4;
-
-	ADD_SUB(F16, F18);
-	ADD_SUB(F17, F19);
-	ADD_SUB(F20, F22);
-	ADD_SUB(F21, F23);
-
-	F21 <<= 2;
-	F22 <<= 4;
-	F23 <<= 6;
-
-	ADD_SUB(F16, F20);
-	ADD_SUB(F17, F21);
-	ADD_SUB(F18, F22);
-	ADD_SUB(F19, F23);
-
-	output[2] = Q_REDUCE(F16);
-	output[10] = Q_REDUCE(F17);
-	output[18] = Q_REDUCE(F18);
-	output[26] = Q_REDUCE(F19);
-	output[34] = Q_REDUCE(F20);
-	output[42] = Q_REDUCE(F21);
-	output[50] = Q_REDUCE(F22);
-	output[58] = Q_REDUCE(F23);
+	output[ 2] = Q_REDUCE( F[16] );
+	output[10] = Q_REDUCE( F[17] );
+	output[18] = Q_REDUCE( F[18] );
+	output[26] = Q_REDUCE( F[19] );
+	output[34] = Q_REDUCE( F[20] );
+	output[42] = Q_REDUCE( F[21] );
+	output[50] = Q_REDUCE( F[22] );
+	output[58] = Q_REDUCE( F[23] );

 	// Iteration 3:
-	ADD_SUB(F24, F25);
-	ADD_SUB(F26, F27);
-	ADD_SUB(F28, F29);
-	ADD_SUB(F30, F31);
+	ADD_SUB( F[24], F[25] );
+	ADD_SUB( F[26], F[27] );
+	ADD_SUB( F[28], F[29] );
+	ADD_SUB( F[30], F[31] );
+ 	F[27] <<= 4;
+ 	F[31] <<= 4;
+	ADD_SUB( F[24], F[26] );
+	ADD_SUB( F[25], F[27] );
+	ADD_SUB( F[28], F[30] );
+	ADD_SUB( F[29], F[31] );
+	F[29] <<= 2;
+	F[30] <<= 4;
+	F[31] <<= 6;
+	ADD_SUB( F[24], F[28] );
+	ADD_SUB( F[25], F[29] );
+	ADD_SUB( F[26], F[30] );
+	ADD_SUB( F[27], F[31] );

-	F27 <<= 4;
-	F31 <<= 4;
-
-	ADD_SUB(F24, F26);
-	ADD_SUB(F25, F27);
-	ADD_SUB(F28, F30);
-	ADD_SUB(F29, F31);
-
-	F29 <<= 2;
-	F30 <<= 4;
-	F31 <<= 6;
-
-	ADD_SUB(F24, F28);
-	ADD_SUB(F25, F29);
-	ADD_SUB(F26, F30);
-	ADD_SUB(F27, F31);
-
-	output[3] = Q_REDUCE(F24);
-	output[11] = Q_REDUCE(F25);
-	output[19] = Q_REDUCE(F26);
-	output[27] = Q_REDUCE(F27);
-	output[35] = Q_REDUCE(F28);
-	output[43] = Q_REDUCE(F29);
-	output[51] = Q_REDUCE(F30);
-	output[59] = Q_REDUCE(F31);
+	output[ 3] = Q_REDUCE( F[24] );
+	output[11] = Q_REDUCE( F[25] );
+	output[19] = Q_REDUCE( F[26] );
+	output[27] = Q_REDUCE( F[27] );
+	output[35] = Q_REDUCE( F[28] );
+	output[43] = Q_REDUCE( F[29] );
+	output[51] = Q_REDUCE( F[30] );
+	output[59] = Q_REDUCE( F[31] );

 	// Iteration 4:
-	ADD_SUB(F32, F33);
-	ADD_SUB(F34, F35);
-	ADD_SUB(F36, F37);
-	ADD_SUB(F38, F39);
+	ADD_SUB( F[32], F[33] );
+	ADD_SUB( F[34], F[35] );
+	ADD_SUB( F[36], F[37] );
+	ADD_SUB( F[38], F[39] );
+	F[35] <<= 4;
+	F[39] <<= 4;
+	ADD_SUB( F[32], F[34] );
+	ADD_SUB( F[33], F[35] );
+	ADD_SUB( F[36], F[38] );
+	ADD_SUB( F[37], F[39] );
+	F[37] <<= 2;
+	F[38] <<= 4;
+	F[39] <<= 6;
+	ADD_SUB( F[32], F[36] );
+	ADD_SUB( F[33], F[37] );
+	ADD_SUB( F[34], F[38] );
+	ADD_SUB( F[35], F[39] );

-	F35 <<= 4;
-	F39 <<= 4;
-
-	ADD_SUB(F32, F34);
-	ADD_SUB(F33, F35);
-	ADD_SUB(F36, F38);
-	ADD_SUB(F37, F39);
-
-	F37 <<= 2;
-	F38 <<= 4;
-	F39 <<= 6;
-
-	ADD_SUB(F32, F36);
-	ADD_SUB(F33, F37);
-	ADD_SUB(F34, F38);
-	ADD_SUB(F35, F39);
-
-	output[4] = Q_REDUCE(F32);
-	output[12] = Q_REDUCE(F33);
-	output[20] = Q_REDUCE(F34);
-	output[28] = Q_REDUCE(F35);
-	output[36] = Q_REDUCE(F36);
-	output[44] = Q_REDUCE(F37);
-	output[52] = Q_REDUCE(F38);
-	output[60] = Q_REDUCE(F39);
+	output[ 4] = Q_REDUCE( F[32] );
+	output[12] = Q_REDUCE( F[33] );
+	output[20] = Q_REDUCE( F[34] );
+	output[28] = Q_REDUCE( F[35] );
+	output[36] = Q_REDUCE( F[36] );
+	output[44] = Q_REDUCE( F[37] );
+	output[52] = Q_REDUCE( F[38] );
+	output[60] = Q_REDUCE( F[39] );

 	// Iteration 5:
-	ADD_SUB(F40, F41);
-	ADD_SUB(F42, F43);
-	ADD_SUB(F44, F45);
-	ADD_SUB(F46, F47);
+	ADD_SUB( F[40], F[41] );
+	ADD_SUB( F[42], F[43] );
+	ADD_SUB( F[44], F[45] );
+	ADD_SUB( F[46], F[47] );
+	F[43] <<= 4;
+	F[47] <<= 4;
+	ADD_SUB( F[40], F[42] );
+	ADD_SUB( F[41], F[43] );
+	ADD_SUB( F[44], F[46] );
+	ADD_SUB( F[45], F[47] );
+	F[45] <<= 2;
+	F[46] <<= 4;
+	F[47] <<= 6;
+	ADD_SUB( F[40], F[44] );
+	ADD_SUB( F[41], F[45] );
+	ADD_SUB( F[42], F[46] );
+	ADD_SUB( F[43], F[47] );

-	F43 <<= 4;
-	F47 <<= 4;
-
-	ADD_SUB(F40, F42);
-	ADD_SUB(F41, F43);
-	ADD_SUB(F44, F46);
-	ADD_SUB(F45, F47);
-
-	F45 <<= 2;
-	F46 <<= 4;
-	F47 <<= 6;
-
-	ADD_SUB(F40, F44);
-	ADD_SUB(F41, F45);
-	ADD_SUB(F42, F46);
-	ADD_SUB(F43, F47);
-
-	output[5] = Q_REDUCE(F40);
-	output[13] = Q_REDUCE(F41);
-	output[21] = Q_REDUCE(F42);
-	output[29] = Q_REDUCE(F43);
-	output[37] = Q_REDUCE(F44);
-	output[45] = Q_REDUCE(F45);
-	output[53] = Q_REDUCE(F46);
-	output[61] = Q_REDUCE(F47);
+	output[ 5] = Q_REDUCE( F[40] );
+	output[13] = Q_REDUCE( F[41] );
+	output[21] = Q_REDUCE( F[42] );
+	output[29] = Q_REDUCE( F[43] );
+	output[37] = Q_REDUCE( F[44] );
+	output[45] = Q_REDUCE( F[45] );
+	output[53] = Q_REDUCE( F[46] );
+	output[61] = Q_REDUCE( F[47] );

 	// Iteration 6:
-	ADD_SUB(F48, F49);
-	ADD_SUB(F50, F51);
-	ADD_SUB(F52, F53);
-	ADD_SUB(F54, F55);
+	ADD_SUB( F[48], F[49] );
+	ADD_SUB( F[50], F[51] );
+	ADD_SUB( F[52], F[53] );
+	ADD_SUB( F[54], F[55] );
+	F[51] <<= 4;
+	F[55] <<= 4;
+	ADD_SUB( F[48], F[50] );
+	ADD_SUB( F[49], F[51] );
+	ADD_SUB( F[52], F[54] );
+	ADD_SUB( F[53], F[55] );
+	F[53] <<= 2;
+	F[54] <<= 4;
+	F[55] <<= 6;
+	ADD_SUB( F[48], F[52] );
+	ADD_SUB( F[49], F[53] );
+	ADD_SUB( F[50], F[54] );
+	ADD_SUB( F[51], F[55] );

-	F51 <<= 4;
-	F55 <<= 4;
-
-	ADD_SUB(F48, F50);
-	ADD_SUB(F49, F51);
-	ADD_SUB(F52, F54);
-	ADD_SUB(F53, F55);
-
-	F53 <<= 2;
-	F54 <<= 4;
-	F55 <<= 6;
-
-	ADD_SUB(F48, F52);
-	ADD_SUB(F49, F53);
-	ADD_SUB(F50, F54);
-	ADD_SUB(F51, F55);
-
-	output[6] = Q_REDUCE(F48);
-	output[14] = Q_REDUCE(F49);
-	output[22] = Q_REDUCE(F50);
-	output[30] = Q_REDUCE(F51);
-	output[38] = Q_REDUCE(F52);
-	output[46] = Q_REDUCE(F53);
-	output[54] = Q_REDUCE(F54);
-	output[62] = Q_REDUCE(F55);
+	output[ 6] = Q_REDUCE( F[48] );
+	output[14] = Q_REDUCE( F[49] );
+	output[22] = Q_REDUCE( F[50] );
+	output[30] = Q_REDUCE( F[51] );
+	output[38] = Q_REDUCE( F[52] );
+	output[46] = Q_REDUCE( F[53] );
+	output[54] = Q_REDUCE( F[54] );
+	output[62] = Q_REDUCE( F[55] );

 	// Iteration 7:
-	ADD_SUB(F56, F57);
-	ADD_SUB(F58, F59);
-	ADD_SUB(F60, F61);
-	ADD_SUB(F62, F63);
+	ADD_SUB( F[56], F[57] );
+	ADD_SUB( F[58], F[59] );
+	ADD_SUB( F[60], F[61] );
+	ADD_SUB( F[62], F[63] );
+	F[59] <<= 4;
+	F[63] <<= 4;
+	ADD_SUB( F[56], F[58] );
+	ADD_SUB( F[57], F[59] );
+	ADD_SUB( F[60], F[62] );
+	ADD_SUB( F[61], F[63] );
+	F[61] <<= 2;
+	F[62] <<= 4;
+	F[63] <<= 6;
+	ADD_SUB( F[56], F[60] );
+	ADD_SUB( F[57], F[61] );
+	ADD_SUB( F[58], F[62] );
+	ADD_SUB( F[59], F[63] );

-	F59 <<= 4;
-	F63 <<= 4;
-
-	ADD_SUB(F56, F58);
-	ADD_SUB(F57, F59);
-	ADD_SUB(F60, F62);
-	ADD_SUB(F61, F63);
-
-	F61 <<= 2;
-	F62 <<= 4;
-	F63 <<= 6;
-
-	ADD_SUB(F56, F60);
-	ADD_SUB(F57, F61);
-	ADD_SUB(F58, F62);
-	ADD_SUB(F59, F63);
-
-	output[7] = Q_REDUCE(F56);
-	output[15] = Q_REDUCE(F57);
-	output[23] = Q_REDUCE(F58);
-	output[31] = Q_REDUCE(F59);
-	output[39] = Q_REDUCE(F60);
-	output[47] = Q_REDUCE(F61);
-	output[55] = Q_REDUCE(F62);
-	output[63] = Q_REDUCE(F63);
+	output[ 7] = Q_REDUCE( F[56] );
+	output[15] = Q_REDUCE( F[57] );
+	output[23] = Q_REDUCE( F[58] );
+	output[31] = Q_REDUCE( F[59] );
+	output[39] = Q_REDUCE( F[60] );
+	output[47] = Q_REDUCE( F[61] );
+	output[55] = Q_REDUCE( F[62] );
+	output[63] = Q_REDUCE( F[63] );

   #undef ADD_SUB
   #undef Q_REDUCE
--- a/algo/verthash/tiny_sha3/sha3-4way.c
+++ b/algo/verthash/tiny_sha3/sha3-4way.c
@@ -134,10 +134,10 @@ int sha3_4way_update( sha3_4way_ctx_t *c, const void *data, size_t len )
 int sha3_4way_final( void *md, sha3_4way_ctx_t *c )
 {
    c->st[ c->pt ] = _mm256_xor_si256( c->st[ c->pt ],
-                                       m256_const1_64( 6 ) );
+                                       _mm256_set1_epi64x( 6 ) );
    c->st[ c->rsiz / 8 - 1 ] =
                       _mm256_xor_si256( c->st[ c->rsiz / 8 - 1 ],
-                                         m256_const1_64( 0x8000000000000000 ) );
+                                    _mm256_set1_epi64x( 0x8000000000000000 ) );
    sha3_4way_keccakf( c->st );
    memcpy( md, c->st, c->mdlen * 4 );
    return 1;
@@ -268,10 +268,10 @@ int sha3_8way_final( void *md, sha3_8way_ctx_t *c )
 {
    c->st[ c->pt ] =
                       _mm512_xor_si512( c->st[ c->pt ],
-                                         m512_const1_64( 6 ) );
+                                         _mm512_set1_epi64( 6 ) );
    c->st[ c->rsiz / 8 - 1 ] =
                       _mm512_xor_si512( c->st[ c->rsiz / 8 - 1 ],
-                                         m512_const1_64( 0x8000000000000000 ) );
+                                     _mm512_set1_epi64( 0x8000000000000000 ) );
    sha3_8way_keccakf( c->st );
    memcpy( md, c->st, c->mdlen * 8 );
    return 1;
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -201,7 +201,7 @@ int scanhash_c11_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const uint32_t targ32_d7 = ptarget[7];
-   const __m512i eight = m512_const1_64( 8 );
+   const __m512i eight = _mm512_set1_epi64( 8 );
   const bool bench = opt_benchmark;

   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
@@ -369,7 +369,7 @@ int scanhash_c11_4way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const uint32_t targ32_d7 = ptarget[7];
-   const __m256i four = m256_const1_64( 4 );
+   const __m256i four = _mm256_set1_epi64x( 4 );
   const bool bench = opt_benchmark;

   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) );
--- a/algo/x13/skunk-4way.c
+++ b/algo/x13/skunk-4way.c
@@ -114,7 +114,7 @@ int scanhash_skunk_8way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n +=8;
   } while ( likely( ( n < last_nonce ) && !( *restart ) ) );
   pdata[19] = n;
@@ -218,7 +218,7 @@ int scanhash_skunk_4way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n +=4;
   } while ( likely( ( n < last_nonce ) && !( *restart ) ) );
   pdata[19] = n;
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -536,7 +536,7 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
@@ -963,7 +963,7 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -49,7 +49,7 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
@@ -102,7 +102,7 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( (  n < last_nonce ) && !(*restart) );
   pdata[19] = n;
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -26,7 +26,7 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
      x16rt_getTimeHash( masked_ntime, &timeHash );
      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = masked_ntime;
-      if ( opt_debug && !thr_id )
+      if ( !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
                        x16r_hash_order, swab32( pdata[17] ), timeHash );
   }
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -658,7 +658,7 @@ int scanhash_x16rv2_8way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
@@ -1143,7 +1143,7 @@ int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -181,7 +181,7 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
         }
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
@@ -335,7 +335,7 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( (  n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -254,7 +254,7 @@ int scanhash_x17_8way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const uint32_t targ32_d7 = ptarget[7];
-   const __m512i eight = m512_const1_64( 8 );
+   const __m512i eight = _mm512_set1_epi64( 8 );
   const bool bench = opt_benchmark;

   // convert LE32 to LE64
@@ -468,7 +468,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const uint32_t targ32_d7 = ptarget[7];
-   const __m256i four = m256_const1_64( 4 );
+   const __m256i four = _mm256_set1_epi64x( 4 );
   const bool bench = opt_benchmark;

   // convert LE32 to LE64
--- a/algo/x22/x22i-4way.c
+++ b/algo/x22/x22i-4way.c
@@ -445,7 +445,7 @@ int scanhash_x22i_8way_sha( struct work *work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -494,7 +494,7 @@ int scanhash_x22i_8way( struct work *work, uint32_t max_nonce,
         }
      }
      *noncev = _mm512_add_epi32( *noncev,
-                                  m512_const1_64( 0x0000000800000000 ) );
+                                  _mm512_set1_epi64( 0x0000000800000000 ) );
      n += 8;
   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -787,7 +787,7 @@ int scanhash_x22i_4way_sha( struct work* work, uint32_t max_nonce,
         submit_solution( work, hash+(i<<3), mythr );
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( ( n < last_nonce ) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
@@ -835,7 +835,7 @@ int scanhash_x22i_4way( struct work* work, uint32_t max_nonce,
         }
      }
      *noncev = _mm256_add_epi32( *noncev,
-                                  m256_const1_64( 0x0000000400000000 ) );
+                                  _mm256_set1_epi64x( 0x0000000400000000 ) );
      n += 4;
   } while ( likely( ( n <= last_nonce ) && !work_restart[thr_id].restart ) );
   pdata[19] = n;
--- a/algo/x22/x25x-4way.c
+++ b/algo/x22/x25x-4way.c
@@ -571,7 +571,7 @@ int scanhash_x25x_8way( struct work *work, uint32_t max_nonce,
   const int thr_id = mythr->id;
   const uint32_t targ32 = ptarget[7];
   const bool bench = opt_benchmark;
-   const __m512i eight = m512_const1_64( 8 );
+   const __m512i eight = _mm512_set1_epi64( 8 );
   if ( bench )  ptarget[7] = 0x08ff;

   edata[0] = mm128_swap64_32( casti_m128i( pdata, 0 ) ); 
@@ -927,7 +927,7 @@ int scanhash_x25x_4way( struct work* work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const uint32_t targ32 = ptarget[7];
-   const __m256i four = m256_const1_64( 4 );
+   const __m256i four = _mm256_set1_epi64x( 4 );
   const bool bench = opt_benchmark;

   if ( bench ) ptarget[7] = 0x08ff;
--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.22.3.
+# Generated by GNU Autoconf 2.71 for cpuminer-opt 3.23.0.
 #
 #
 # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@@ -608,8 +608,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.22.3'
-PACKAGE_STRING='cpuminer-opt 3.22.3'
+PACKAGE_VERSION='3.23.0'
+PACKAGE_STRING='cpuminer-opt 3.23.0'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.22.3 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.23.0 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1432,7 +1432,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.22.3:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.23.0:";;
   esac
  cat <<\_ACEOF

@@ -1538,7 +1538,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.22.3
+cpuminer-opt configure 3.23.0
 generated by GNU Autoconf 2.71

 Copyright (C) 2021 Free Software Foundation, Inc.
@@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.22.3, which was
+It was created by cpuminer-opt $as_me 3.23.0, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  $ $0$ac_configure_args_raw
@@ -3593,7 +3593,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.22.3'
+ VERSION='3.23.0'


 printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h
@@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.22.3, which was
+This file was extended by cpuminer-opt $as_me 3.23.0, which was
 generated by GNU Autoconf 2.71.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config='$ac_cs_config_escaped'
 ac_cs_version="\\
-cpuminer-opt config.status 3.22.3
+cpuminer-opt config.status 3.23.0
 configured by $0, generated by GNU Autoconf 2.71,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.22.3])
+AC_INIT([cpuminer-opt], [3.23.0])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/7647
+++ b/7647
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1532,6 +1532,7 @@ const char *getwork_req =
 #define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"

 #define GBT_RULES "[\"segwit\"]"
+
 static const char *gbt_req =
   "{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
   GBT_CAPABILITIES ", \"rules\": " GBT_RULES "}], \"id\":0}\r\n";
@@ -1589,18 +1590,21 @@ start:
         json_decref( val );
         goto start;
      }
+      allow_getwork = false;  // GBT is working, disable fallback
   } 
   else
      rc = work_decode( json_object_get( val, "result" ), work );

   if ( rc ) 
   {
+      bool new_work = true;
+
      json_decref( val );

      get_mininginfo( curl, work );
      report_summary_log( false );
      
-      if ( opt_protocol | opt_debug )
+      if ( opt_protocol || opt_debug )
      {
         timeval_subtract( &diff, &tv_end, &tv_start );
         applog( LOG_INFO, "%s new work received in %.2f ms",
@@ -1621,8 +1625,10 @@ start:
         applog( LOG_BLUE, "New Work: Block %d, Tx %d, Net Diff %.5g, Ntime %08x",
                                work->height, work->tx_count, net_diff,
                                work->data[ algo_gate.ntime_index ] );
-       
-      if ( !opt_quiet )
+      else
+        new_work = false;
+
+      if ( new_work && !opt_quiet )
      {
         double miner_hr = 0.;
         double net_hr = net_hashrate;
@@ -2745,10 +2751,14 @@ static void *stratum_thread(void *userdata )
         }
         else
         {
-            stratum_down = false;
+// sometimes stratum connects but doesn't immediately send a job, wait for one.
+//            stratum_down = false;
            applog(LOG_BLUE,"Stratum connection established" );
            if ( stratum.new_job )   // prime first job
+            {
+               stratum_down = false;
               stratum_gen_work( &stratum, &g_work );
+            }
         }
      }

@@ -2757,6 +2767,7 @@ static void *stratum_thread(void *userdata )
      {
         if ( likely( s = stratum_recv_line( &stratum ) ) )
         {
+            stratum_down = false;
            if ( likely( !stratum_handle_method( &stratum, s ) ) )
               stratum_handle_response( s );
            free( s );
@@ -2848,6 +2859,7 @@ static bool cpu_capability( bool display_only )
     bool cpu_has_sha    = has_sha();
     bool cpu_has_avx512 = has_avx512();
     bool cpu_has_vaes   = has_vaes();
+     bool cpu_has_avx10  = has_avx10();
     bool sw_has_aes    = false;
     bool sw_has_sse2   = false;
     bool sw_has_sse42  = false;
@@ -2912,8 +2924,8 @@ static bool cpu_capability( bool display_only )
     #ifdef _MSC_VER
         " with VC++ 2013\n");
     #elif defined(__GNUC__)
-         " with GCC");
-        printf(" %d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
+         " with GCC-");
+        printf("%d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
     #else
        printf("\n");
     #endif
@@ -2927,6 +2939,8 @@ static bool cpu_capability( bool display_only )
     if      ( cpu_has_vaes   )    printf( " VAES"   );
     else if ( cpu_has_aes    )    printf( "  AES"   );
     if      ( cpu_has_sha    )    printf( " SHA"    );
+     if      ( cpu_has_avx10  )    printf( " AVX10.%d-%d",
+                                    avx10_version(), avx10_vector_length() );

     printf("\nSW features:  ");
     if      ( sw_has_avx512 )    printf( " AVX512" );
--- a/simd-utils.h
+++ b/simd-utils.h
@@ -15,10 +15,6 @@
 //    data but not for vectors. The main categories are bit rotation
 //    and endian byte swapping
 //
-//    An attempt was made to make the names as similar as possible to
-//    Intel's intrinsic function format. Most variations are to avoid
-//    confusion with actual Intel intrinsics, brevity, and clarity.
-//
 //    This suite supports some operations on regular 64 bit integers
 //    as well as 128 bit integers available on recent versions of Linux
 //    and GCC.
@@ -37,6 +33,9 @@
 //    SSE2:   128 bit vectors  (64 bit CPUs only, such as Intel Core2.
 //    AVX2:   256 bit vectors  (Starting with Intel Haswell and AMD Ryzen)
 //    AVX512: 512 bit vectors  (Starting with SkylakeX)
+//    AVX10:  when available will supersede AVX512 and will bring AVX512
+//        features, except 512 bit vectors, to Intel's Ecores. It needs to be
+//        enabled manually when the relevant GCC macros are known.
 //
 //    Most functions are avalaible at the stated levels but in rare cases
 //    a higher level feature may be required with no compatible alternative.
@@ -53,21 +52,17 @@
 //    for the applications but also adds responsibility to ensure adequate data
 //    alignment.
 //
-//    Windows has problems with function vector arguments larger than
-//    128 bits. Stack alignment is only guaranteed to 16 bytes. Always use
-//    pointers for larger vectors in function arguments. Macros can be used
-//    for larger value arguments.
-//
 //    An attempt was made to make the names as similar as possible to
 //    Intel's intrinsic function format. Most variations are to avoid
-//    confusion with actual Intel intrinsics, brevity, and clarity
+//    confusion with actual Intel intrinsics, brevity, and clarity.
 //
 //    The main differences are:
 //
-//   - the leading underscore(s) "_" and the "i" are dropped from the
-//     prefix of vector instructions.
-//   - "mm64" and "mm128" used for 64 and 128 bit prefix respectively
-//     to avoid the ambiguity of "mm".
+//   - the leading underscore "_" is dropped from the prefix of vector function
+//     macros.
+//   - "mm128" is used as the 128 bit prefix to be consistent with mm256 & mm512 and
+//     to avoid the ambiguity of "mm" which is also used for 64 bit MMX
+//     intrinsics.
 //   - the element size does not include additional type specifiers
 //      like "epi".
 //   - there is a subset of some functions for scalar data. They may have
@@ -76,14 +71,14 @@
 //   
 //    Function names follow this pattern:
 //
-//         prefix_op[vsize]_[esize]
+//         [prefix]_[op][vsize]_[esize]
 //
 //    Prefix: usually the size of the returned vector.
 //    Following are some examples:
 //
 //    u64:  unsigned 64 bit integer function
 //    i128: signed 128 bit integer function (rarely used)
-//    m128: 128 bit vector identifier
+//    m128: 128 bit vector identifier (deprecated)
 //    mm128: 128 bit vector function
 //
 //    op: describes the operation of the function or names the data
@@ -94,7 +89,7 @@
 //    vsize: optional, lane size used when a function operates on elements
 //           within lanes of a larger vector.
 //
-//    mm256_shuflr128_32 rotates each 128 bit lane of a 256 bit vector
+//    Ex: mm256_shuflr128_32 rotates each 128 bit lane of a 256 bit vector
 //        right by 32 bits.
 //
 // Vector constants
--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -731,6 +731,67 @@ static inline void extr_lane_8x32( void *d, const void *s,

 #if defined(__AVX2__)

+#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
+
+//TODO Enable for AVX10_256 AVX10_512
+
+// Combine byte swap & broadcast in one permute
+static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
+{  // Byte swap each 32 bit word of src & replicate it to all 8 lanes of d.
+   const __m256i c0 = _mm256_set1_epi32( 0x00010203 );  // word 0 byte reversed
+   const __m256i c1 = _mm256_set1_epi32( 0x04050607 );  // word 1 byte reversed
+   const __m256i c2 = _mm256_set1_epi32( 0x08090a0b );  // word 2 byte reversed
+   const __m256i c3 = _mm256_set1_epi32( 0x0c0d0e0f );  // word 3 byte reversed
+   const __m128i s0 = casti_m128i( src,0 );  // presumably src as 5 x 128 bits
+   const __m128i s1 = casti_m128i( src,1 );
+   const __m128i s2 = casti_m128i( src,2 );
+   const __m128i s3 = casti_m128i( src,3 );
+   const __m128i s4 = casti_m128i( src,4 );
+   // Every index is < 16 so only the low 128 bits of the cast source are read.
+   casti_m256i( d, 0 ) = _mm256_permutexvar_epi8( c0,
+                          _mm256_castsi128_si256( s0 ) );
+   casti_m256i( d, 1 ) = _mm256_permutexvar_epi8( c1,
+                          _mm256_castsi128_si256( s0 ) );
+   casti_m256i( d, 2 ) = _mm256_permutexvar_epi8( c2,
+                          _mm256_castsi128_si256( s0 ) );
+   casti_m256i( d, 3 ) = _mm256_permutexvar_epi8( c3,
+                          _mm256_castsi128_si256( s0 ) );
+   casti_m256i( d, 4 ) = _mm256_permutexvar_epi8( c0,
+                          _mm256_castsi128_si256( s1 ) );
+   casti_m256i( d, 5 ) = _mm256_permutexvar_epi8( c1,
+                          _mm256_castsi128_si256( s1 ) );
+   casti_m256i( d, 6 ) = _mm256_permutexvar_epi8( c2,
+                          _mm256_castsi128_si256( s1 ) );
+   casti_m256i( d, 7 ) = _mm256_permutexvar_epi8( c3,
+                          _mm256_castsi128_si256( s1 ) );
+   casti_m256i( d, 8 ) = _mm256_permutexvar_epi8( c0,
+                          _mm256_castsi128_si256( s2 ) );
+   casti_m256i( d, 9 ) = _mm256_permutexvar_epi8( c1,
+                          _mm256_castsi128_si256( s2 ) );
+   casti_m256i( d,10 ) = _mm256_permutexvar_epi8( c2,
+                          _mm256_castsi128_si256( s2 ) );
+   casti_m256i( d,11 ) = _mm256_permutexvar_epi8( c3,
+                          _mm256_castsi128_si256( s2 ) );
+   casti_m256i( d,12 ) = _mm256_permutexvar_epi8( c0,
+                          _mm256_castsi128_si256( s3 ) );
+   casti_m256i( d,13 ) = _mm256_permutexvar_epi8( c1,
+                          _mm256_castsi128_si256( s3 ) );
+   casti_m256i( d,14 ) = _mm256_permutexvar_epi8( c2,
+                          _mm256_castsi128_si256( s3 ) );
+   casti_m256i( d,15 ) = _mm256_permutexvar_epi8( c3,
+                          _mm256_castsi128_si256( s3 ) );
+   casti_m256i( d,16 ) = _mm256_permutexvar_epi8( c0,
+                          _mm256_castsi128_si256( s4 ) );
+   casti_m256i( d,17 ) = _mm256_permutexvar_epi8( c1,
+                          _mm256_castsi128_si256( s4 ) );
+   casti_m256i( d,18 ) = _mm256_permutexvar_epi8( c2,
+                          _mm256_castsi128_si256( s4 ) );
+   casti_m256i( d,19 ) = _mm256_permutexvar_epi8( c3,
+                          _mm256_castsi128_si256( s4 ) );
+}
+
+#else
+
 static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
 {
  const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
@@ -792,6 +853,7 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, const void *src )
                         _mm256_castsi128_si256( s4 ), c3 );
 }

+#endif   // AVX512VBMI else
 #endif   // AVX2

 // 16x32
@@ -1173,10 +1235,12 @@ static inline void extr_lane_16x32( void *d, const void *s,
   ((uint32_t*)d)[15] = ((const uint32_t*)s)[ lane+240 ];
 }

-#if defined(__AVX512F__) && defined(__AVX512VL__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 #if defined(__AVX512VBMI__)

+// TODO Enable for AVX10_512
+
 // Combine byte swap & broadcast in one permute
 static inline void mm512_bswap32_intrlv80_16x32( void *d, const void *src )
 {
@@ -1496,10 +1560,48 @@ static inline void mm256_intrlv80_4x64( void *d, const void *src )
                          _mm256_castsi128_si256( s4 ), 0x55 );
 }

+#if defined(__AVX512VL__) && defined(__AVX512VBMI__)
+
+//TODO Enable for AVX10_256 AVX10_512
+
+static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
+{  // Byte swap each 32 bit word of src & replicate each 64 bits to 4 lanes of d.
+   const __m256i c0 = _mm256_set1_epi64x( 0x0405060700010203 ); // words 0,1 reversed
+   const __m256i c1 = _mm256_set1_epi64x( 0x0c0d0e0f08090a0b ); // words 2,3 reversed
+   const __m128i s0 = casti_m128i( src,0 );  // presumably src as 5 x 128 bits
+   const __m128i s1 = casti_m128i( src,1 );
+   const __m128i s2 = casti_m128i( src,2 );
+   const __m128i s3 = casti_m128i( src,3 );
+   const __m128i s4 = casti_m128i( src,4 );
+   // Every index is < 16 so only the low 128 bits of the cast source are read.
+   casti_m256i( d,0 ) = _mm256_permutexvar_epi8( c0,
+                         _mm256_castsi128_si256( s0 ) );
+   casti_m256i( d,1 ) = _mm256_permutexvar_epi8( c1,
+                         _mm256_castsi128_si256( s0 ) );
+   casti_m256i( d,2 ) = _mm256_permutexvar_epi8( c0,
+                         _mm256_castsi128_si256( s1 ) );
+   casti_m256i( d,3 ) = _mm256_permutexvar_epi8( c1,
+                         _mm256_castsi128_si256( s1 ) );
+   casti_m256i( d,4 ) = _mm256_permutexvar_epi8( c0,
+                         _mm256_castsi128_si256( s2 ) );
+   casti_m256i( d,5 ) = _mm256_permutexvar_epi8( c1,
+                         _mm256_castsi128_si256( s2 ) );
+   casti_m256i( d,6 ) = _mm256_permutexvar_epi8( c0,
+                         _mm256_castsi128_si256( s3 ) );
+   casti_m256i( d,7 ) = _mm256_permutexvar_epi8( c1,
+                         _mm256_castsi128_si256( s3 ) );
+   casti_m256i( d,8 ) = _mm256_permutexvar_epi8( c0,
+                         _mm256_castsi128_si256( s4 ) );
+   casti_m256i( d,9 ) = _mm256_permutexvar_epi8( c1,
+                         _mm256_castsi128_si256( s4 ) );
+}
+
+#else
+
 static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
 {
  const __m256i bswap_shuf = mm256_bcast_m128(
-                     _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
+                    _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
  __m256i s0 = casti_m256i( src,0 );
  __m256i s1 = casti_m256i( src,1 );
  __m128i s4 = casti_m128i( src,4 );
@@ -1524,6 +1626,8 @@ static inline void mm256_bswap32_intrlv80_4x64( void *d, const void *src )
                          _mm256_castsi128_si256( s4 ), 0x55 );
 }

+#endif
+
 #endif  // AVX2

 // 8x64   (AVX512)
@@ -1846,6 +1950,8 @@ static inline void extr_lane_8x64( void *dst, const void *src, const int lane,

 #if defined(__AVX512F__) && defined(__AVX512VL__)

+//TODO Enable for AVX10_512
+
 // broadcast to all lanes
 static inline void mm512_intrlv80_8x64( void *dst, const void *src )
 {
@@ -2089,10 +2195,36 @@ static inline void dintrlv_4x128_512( void *dst0, void *dst1, void *dst2,
   d0[3] = s[12];   d1[3] = s[13];    d2[3] = s[14];   d3[3] = s[15];
 }

-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
+#if defined(__AVX512VBMI__)
+//TODO Enable for AVX10_512
+
+static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
+{  // Byte swap the 32 bit words of 5 x 128 bits at src, each replicated x4 in d.
+  const __m512i bswap_shuf = mm512_bcast_m128(
+                    _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
+  const __m128i s0 = casti_m128i( src,0 );
+  const __m128i s1 = casti_m128i( src,1 );
+  const __m128i s2 = casti_m128i( src,2 );
+  const __m128i s3 = casti_m128i( src,3 );
+  const __m128i s4 = casti_m128i( src,4 );
+  // permutexvar_epi8 takes ( index, data ); indexes 0-15 read the low 128 bits.
+  casti_m512i( d,0 ) = _mm512_permutexvar_epi8( bswap_shuf,
+                                           _mm512_castsi128_si512( s0 ) );
+  casti_m512i( d,1 ) = _mm512_permutexvar_epi8( bswap_shuf,
+                                           _mm512_castsi128_si512( s1 ) );
+  casti_m512i( d,2 ) = _mm512_permutexvar_epi8( bswap_shuf,
+                                           _mm512_castsi128_si512( s2 ) );
+  casti_m512i( d,3 ) = _mm512_permutexvar_epi8( bswap_shuf,
+                                           _mm512_castsi128_si512( s3 ) );
+  casti_m512i( d,4 ) = _mm512_permutexvar_epi8( bswap_shuf,
+                                           _mm512_castsi128_si512( s4 ) );
+}
+
+#else
+
+static inline void mm512_bswap32_intrlv80_4x128( void *d, const void *src )
 {
  const __m128i bswap_shuf = _mm_set_epi64x( 0x0c0d0e0f08090a0b,
                                             0x0405060700010203 );
@@ -2108,14 +2240,15 @@ static inline void mm512_bswap32_intrlv80_4x128( void *d, void *src )
  s3 = _mm_shuffle_epi8( s3, bswap_shuf );
  s4 = _mm_shuffle_epi8( s4, bswap_shuf );

-  casti_m512i( d, 0 ) = mm512_bcast_m128( s0 );
-  casti_m512i( d, 1 ) = mm512_bcast_m128( s1 );
-  casti_m512i( d, 2 ) = mm512_bcast_m128( s2 );
-  casti_m512i( d, 3 ) = mm512_bcast_m128( s3 );
-  casti_m512i( d, 4 ) = mm512_bcast_m128( s4 );
-}  
+  casti_m512i( d,0 ) = mm512_bcast_m128( s0 );
+  casti_m512i( d,1 ) = mm512_bcast_m128( s1 );
+  casti_m512i( d,2 ) = mm512_bcast_m128( s2 );
+  casti_m512i( d,3 ) = mm512_bcast_m128( s3 );
+  casti_m512i( d,4 ) = mm512_bcast_m128( s4 );
+}

-#endif
+#endif   // AVX512VBMI ELSE
+#endif   // AVX512

 // 2x256 (AVX512)

@@ -2955,6 +3088,8 @@ do { \

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

+//TODO Enable for AVX10_512
+
 /*
 #define mm512_intrlv_blend_128( hi, lo ) \
   _mm512_mask_blend_epi32( 0x0f0f, hi, lo )
--- a/simd-utils/simd-128.h
+++ b/simd-utils/simd-128.h
@@ -43,9 +43,11 @@ typedef union
 } __attribute__ ((aligned (16))) m128_ovly;


-// Deprecated. EVEX adds support for integer argument in broadcast instruction
-// eliminating the need for an explicit move in most cases. Use the set1
-// intrinsic with integers and let the compiler figure it out.
+// Deprecated. AVX512 adds EVEX encoding (3rd operand) and other improvements
+// that make these functions either unnecessary or inefficient.
+// In cases where an explicit move between GP & SIMD registers is still
+// necessary the cvt, set, or set1 intrinsics can be used allowing the
+// compiler to exploit new features to produce optimum code.
 static inline __m128i mm128_mov64_128( const uint64_t n )
 {
  __m128i a;
@@ -73,15 +75,7 @@ static inline __m128i mm128_mov32_128( const uint32_t n )
 //#define mm128_bcast_m64( v )   _mm_shuffle_epi32( v, 0x44 )
 //#define mm128_bcast_m32( v )   _mm_shuffle_epi32( v, 0x00 )

-// Deprecated, use set1 directly
-#define m128_const1_64          _mm_set1_epi64x
-#define m128_const1_32          _mm_set1_epi32
-
-// Deprecated, use set directly
-#define m128_const_64  _mm_set_epi64x
-
 // Pseudo constants
-
 #define m128_zero      _mm_setzero_si128()
 #define m128_one_128   mm128_mov64_128( 1 )
 //#define m128_one_64    _mm_set1_epi64x( 1 )
@@ -141,7 +135,7 @@ static inline __m128i mm128_neg1_fn()

 // Examples of simple operations using xim:

-// Insert 32 bit integer into v at element c and return updated v.
+// Copy i to element c of dest and copy remaining elements from v.
 static inline __m128i mm128_insert_32( const __m128i v, const uint32_t i,
                                       const int c )
 {   return mm128_xim_32( v, mm128_mov32_128( i ), c<<4 ); }
@@ -161,6 +155,7 @@ static inline __m128i mm128_mask_32( const __m128i v, const int m )

 // Bitwise not (~v)  
 #if defined(__AVX512VL__)
+//TODO Enable for AVX10_256

 static inline __m128i mm128_not( const __m128i v )
 {  return _mm_ternarylogic_epi64( v, v, v, 1 ); }
@@ -223,18 +218,54 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 {   for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; }

 #if defined(__AVX512VL__)
+//TODO Enable for AVX10_256

 // a ^ b ^ c
-#define mm128_xor3( a, b, c )    _mm_ternarylogic_epi64( a, b, c, 0x96 )
+#define mm128_xor3( a, b, c )      _mm_ternarylogic_epi64( a, b, c, 0x96 )
+
+// a & b & c
+#define mm128_and3( a, b, c )      _mm_ternarylogic_epi64( a, b, c, 0x80 )
+
+// a | b | c
+#define mm128_or3( a, b, c )       _mm_ternarylogic_epi64( a, b, c, 0xfe )

 // a ^ ( b & c )
-#define mm128_xorand( a, b, c )  _mm_ternarylogic_epi64( a, b, c, 0x78 )
+#define mm128_xorand( a, b, c )    _mm_ternarylogic_epi64( a, b, c, 0x78 )
+
+// a & ( b ^ c )
+#define mm128_andxor( a, b, c )    _mm_ternarylogic_epi64( a, b, c, 0x60 )
+
+// a ^ ( b | c )
+#define mm128_xoror( a, b, c )     _mm_ternarylogic_epi64( a, b, c, 0x1e )
+
+// a ^ ( ~b & c )
+#define mm128_xorandnot( a, b, c ) _mm_ternarylogic_epi64( a, b, c, 0xd2 )
+
+// a | ( b & c )
+#define mm128_orand( a, b, c )     _mm_ternarylogic_epi64( a, b, c, 0xf8 )
+
+// ~( a ^ b ), same as (~a) ^ b
+#define mm128_xnor( a, b )         _mm_ternarylogic_epi64( a, b, b, 0x81 )

 #else

-#define mm128_xor3( a, b, c )    _mm_xor_si128( a, _mm_xor_si128( b, c ) )
+#define mm128_xor3( a, b, c )      _mm_xor_si128( a, _mm_xor_si128( b, c ) )

-#define mm128_xorand( a, b, c )  _mm_xor_si128( a, _mm_and_si128( b, c ) )
+#define mm128_and3( a, b, c )      _mm_and_si128( a, _mm_and_si128( b, c ) )
+
+#define mm128_or3( a, b, c )       _mm_or_si128( a, _mm_or_si128( b, c ) )
+
+#define mm128_xorand( a, b, c )    _mm_xor_si128( a, _mm_and_si128( b, c ) )
+
+#define mm128_andxor( a, b, c )    _mm_and_si128( a, _mm_xor_si128( b, c ))
+
+#define mm128_xoror( a, b, c )     _mm_xor_si128( a, _mm_or_si128( b, c ) )
+
+#define mm128_xorandnot( a, b, c ) _mm_xor_si128( a, _mm_andnot_si128( b, c ) )
+
+#define mm128_orand( a, b, c )     _mm_or_si128( a, _mm_and_si128( b, c ) )
+
+#define mm128_xnor( a, b )         mm128_not( _mm_xor_si128( a, b ) )

 #endif

@@ -257,6 +288,7 @@ static inline void memcpy_128( __m128i *dst, const __m128i *src, const int n )
 // transparency.

 #if defined(__AVX512VL__)
+//TODO Enable for AVX10_256

 #define mm128_ror_64    _mm_ror_epi64
 #define mm128_rol_64    _mm_rol_epi64
@@ -372,7 +404,10 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 #define mm128_shuflr64_32     mm128_swap64_32
 #define mm128_shufll64_32     mm128_swap64_32

-#if defined(__SSSE3__) && !defined(__AVX512VL__)
+//TODO Enable for AVX10_256
+#if defined(__AVX512VL__)
+  #define mm128_shuflr64_24( v )  _mm_ror_epi64( v, 24 )  // 64 bit ror by 24
+#elif defined(__SSSE3__) 
  #define mm128_shuflr64_24( v ) \
    _mm_shuffle_epi8( v, _mm_set_epi64x( \
                                    0x0a09080f0e0d0c0b, 0x0201000706050403 ) )
@@ -380,7 +415,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
  #define mm128_shuflr64_24( v ) mm128_ror_64( v, 24 )
 #endif

-#if defined(__SSSE3__) && !defined(__AVX512VL__)
+#if defined(__AVX512VL__)
+  #define mm128_shuflr64_16( v )  _mm_ror_epi64( v, 16 )
+#elif defined(__SSSE3__) 
  #define mm128_shuflr64_16( v ) \
    _mm_shuffle_epi8( v, _mm_set_epi64x( \
                                    0x09080f0e0d0c0b0a, 0x0100070605040302 ) )
@@ -390,7 +427,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )

 // Rotate 32 bit lanes

-#if defined(__SSSE3__) && !defined(__AVX512VL__)
+#if defined(__AVX512VL__)
+  #define mm128_swap32_16( v )  _mm_ror_epi32( v, 16 )
+#elif defined(__SSSE3__)
  #define mm128_swap32_16( v ) \
    _mm_shuffle_epi8( v, _mm_set_epi64x( \
                                    0x0d0c0f0e09080b0a, 0x0504070601000302 ) )
@@ -400,7 +439,9 @@ static inline __m128i mm128_shuflr_x8( const __m128i v, const int c )
 #define mm128_shuflr32_16      mm128_swap32_16
 #define mm128_shufll32_16      mm128_swap32_16

-#if defined(__SSSE3__) && !defined(__AVX512VL__)
+#if defined(__AVX512VL__)
+  #define mm128_shuflr32_8( v )  _mm_ror_epi32( v, 8 )
+#elif defined(__SSSE3__)
  #define mm128_shuflr32_8( v ) \
    _mm_shuffle_epi8( v, _mm_set_epi64x( \
                                    0x0c0f0e0d080b0a09, 0x0407060500030201 ) )
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -13,17 +13,14 @@
 // automatically but their use is limited because 256 bit vectors are less
 // likely to be used when 512 is available.
 //
+// AVX10_256 will support AVX512VL instructions on CPUs limited to 256 bit
+// vectors. This will require enabling when the compiler's AVX10 feature
+// macros are known.
+//
 // "_mm256_shuffle_epi8" and "_mm256_alignr_epi8" are restricted to 128 bit
 // lanes and data can't cross the 128 bit lane boundary.  
-// Full width byte shuffle is available with AVX512VL using the mask version
-// with a full mask (-1). 
 // Instructions that can move data across 128 bit lane boundary incur a
 // performance penalty over those that can't.
-// Some usage of index vectors may be encoded as if full vector shuffles are
-// supported. This has no side effects and would have the same results using
-// either version.
-// If the need arises and AVX512VL is available, 256 bit full vector byte 
-// shuffles can be implemented using the AVX512 mask feature with a NULL mask.

 #if defined(__AVX__)

@@ -66,6 +63,7 @@ typedef union
 // Set either the low or high 64 bit elements in 128 bit lanes, other elements
 // are set to zero.
 #if defined(__AVX512VL__)
+//TODO Enable for AVX10_256

 #define mm256_bcast128lo_64( i64 )     _mm256_maskz_set1_epi64( 0x55, i64 )
 #define mm256_bcast128hi_64( i64 )     _mm256_maskz_set1_epi64( 0xaa, i64 )
@@ -81,11 +79,9 @@ typedef union

 #define mm256_set2_64( i1, i0 )   mm256_bcast_m128( _mm_set_epi64x( i1, i0 ) )

-// Deprecated
-#define m256_const1_64       _mm256_set1_epi64x
-#define m256_const1_32       _mm256_set1_epi32
+#define mm256_set4_32( i3, i2, i1, i0 ) \
+   mm256_bcast_m128( _mm_set_epi32( i3, i2, i1, i0 ) )

-//
 // All SIMD constant macros are actually functions containing executable
 // code and therefore can't be used as compile time initializers.

@@ -121,6 +117,7 @@ static inline void memcpy_256( __m256i *dst, const __m256i *src, const int n )
 // Basic operations without SIMD equivalent

 #if defined(__AVX512VL__)
+//TODO Enable for AVX10_256

 static inline __m256i mm256_not( const __m256i v )
 {  return _mm256_ternarylogic_epi64( v, v, v, 1 ); }
@@ -140,8 +137,7 @@ static inline __m256i mm256_not( const __m256i v )
   _mm256_add_epi32( _mm256_add_epi32( a, b ), _mm256_add_epi32( c, d ) )

 #if defined(__AVX512VL__)
-
-// AVX512 has ternary logic that supports any 3 input boolean expression.
+//TODO Enable for AVX10_256

 // a ^ b ^ c
 #define mm256_xor3( a, b, c )      _mm256_ternarylogic_epi64( a, b, c, 0x96 )
@@ -176,31 +172,31 @@ static inline __m256i mm256_not( const __m256i v )
 #else

 #define mm256_xor3( a, b, c ) \
-   _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )
+  _mm256_xor_si256( a, _mm256_xor_si256( b, c ) )

 #define mm256_xor4( a, b, c, d ) \
-   _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )
+  _mm256_xor_si256( _mm256_xor_si256( a, b ), _mm256_xor_si256( c, d ) )

 #define mm256_and3( a, b, c ) \
-   _mm256_and_si256( a, _mm256_and_si256( b, c ) )
+  _mm256_and_si256( a, _mm256_and_si256( b, c ) )

 #define mm256_or3( a, b, c ) \
   _mm256_or_si256( a, _mm256_or_si256( b, c ) )

 #define mm256_xorand( a, b, c ) \
- _mm256_xor_si256( a, _mm256_and_si256( b, c ) )
+  _mm256_xor_si256( a, _mm256_and_si256( b, c ) )

 #define mm256_andxor( a, b, c ) \
  _mm256_and_si256( a, _mm256_xor_si256( b, c ))

 #define mm256_xoror( a, b, c ) \
- _mm256_xor_si256( a, _mm256_or_si256( b, c ) )
+  _mm256_xor_si256( a, _mm256_or_si256( b, c ) )

 #define mm256_xorandnot( a, b, c ) \
- _mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )
+  _mm256_xor_si256( a, _mm256_andnot_si256( b, c ) )

 #define mm256_orand( a, b, c ) \
- _mm256_or_si256( a, _mm256_and_si256( b, c ) )
+  _mm256_or_si256( a, _mm256_and_si256( b, c ) )

 #define mm256_xnor( a, b ) \
  mm256_not( _mm256_xor_si256( a, b ) )
@@ -226,6 +222,7 @@ static inline __m256i mm256_not( const __m256i v )
 // transparency.

 #if defined(__AVX512VL__)
+//TODO Enable for AVX10_256

 #define mm256_ror_64    _mm256_ror_epi64
 #define mm256_rol_64    _mm256_rol_epi64
@@ -380,6 +377,7 @@ static inline __m256i mm256_shuflr128_x8( const __m256i v, const int c )
 #define mm256_shuflr64_32         mm256_swap64_32
 #define mm256_shufll64_32         mm256_swap64_32

+//TODO Enable for AVX10_256
 #if defined(__AVX512VL__)
  #define mm256_shuflr64_24( v )  _mm256_ror_epi64( v, 24 )
 #else
--- a/simd-utils/simd-512.h
+++ b/simd-utils/simd-512.h
@@ -113,10 +113,6 @@ static inline __m512i mm512_perm_128( const __m512i v, const int c )
 #define mm512_set2_64( i1, i0 ) \
   mm512_bcast_m128( _mm_set_epi64x( i1, i0 ) )

-// Deprecated, use set
-#define m512_const1_64    _mm512_set1_epi64
-#define m512_const1_32    _mm512_set1_epi32
-
 // Pseudo constants.
 #define m512_zero       _mm512_setzero_si512()
 // Deprecated
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -174,35 +174,147 @@ static inline int cpu_fanpercent()
 	return 0;
 }

+
+// CPUID
+
+// This list is incomplete; it only contains features of interest to cpuminer.
+// Refer to http://en.wikipedia.org/wiki/CPUID for details.
+
+// AVX10 compatibility notes
+//
+// Notation used: AVX10.[version]_[vectorwidth]
+// AVX10.1_512 is a rebranding of AVX512 and is effectively the AVX* superset
+// with full 512 bit vector support.
+// AVX10.2_256 is effectively AVX2 + AVX512_VL, all AVX512 instructions and
+// features applied only to 256 bit and 128 bit vectors.
+// Future AVX10 versions will add new instructions and features.
+
+// Register array indexes
+#define EAX_Reg  (0)
+#define EBX_Reg  (1)
+#define ECX_Reg  (2)
+#define EDX_Reg  (3)
+
+// CPUID function number, aka leaf (EAX)
+#define VENDOR_ID            (0)
+#define CPU_INFO             (1)
+#define CACHE_TLB_DESCRIPTOR (2)
+#define EXTENDED_FEATURES    (7)
+#define AVX10_FEATURES       (0x24)
+#define HIGHEST_EXT_FUNCTION (0x80000000)
+#define EXTENDED_CPU_INFO    (0x80000001)
+#define CPU_BRAND_1          (0x80000002)
+#define CPU_BRAND_2          (0x80000003)
+#define CPU_BRAND_3          (0x80000004)
+
+// CPU_INFO: EAX=1, ECX=0
+// ECX
+#define SSE3_Flag                 1    
+#define SSSE3_Flag               (1<< 9)
+#define XOP_Flag                 (1<<11)   // obsolete
+#define FMA3_Flag                (1<<12)
+#define SSE41_Flag               (1<<19)
+#define SSE42_Flag               (1<<20)
+#define AES_NI_Flag              (1<<25)
+#define XSAVE_Flag               (1<<26) 
+#define OSXSAVE_Flag             (1<<27)
+#define AVX_Flag                 (1<<28)
+// EDX
+#define MMX_Flag                 (1<<23)
+#define SSE_Flag                 (1<<25)
+#define SSE2_Flag                (1<<26) 
+
+// EXTENDED_FEATURES subleaf 0: EAX=7, ECX=0
+// EBX
+#define AVX2_Flag                (1<< 5)
+#define AVX512_F_Flag            (1<<16)
+#define AVX512_DQ_Flag           (1<<17)
+#define AVX512_IFMA_Flag         (1<<21)
+#define AVX512_PF_Flag           (1<<26)
+#define AVX512_ER_Flag           (1<<27)
+#define AVX512_CD_Flag           (1<<28)
+#define SHA_Flag                 (1<<29)
+#define AVX512_BW_Flag           (1<<30)
+#define AVX512_VL_Flag           (1<<31)
+// ECX
+#define AVX512_VBMI_Flag         (1<< 1) 
+#define AVX512_VBMI2_Flag        (1<< 6)
+#define VAES_Flag                (1<< 9)
+#define AVX512_VNNI_Flag         (1<<11)
+#define AVX512_BITALG_Flag       (1<<12)
+#define AVX512_VPOPCNTDQ_Flag    (1<<14)
+// EDX
+#define AVX512_4VNNIW_Flag       (1<< 2)
+#define AVX512_4FMAPS_Flag       (1<< 3)
+#define AVX512_VP2INTERSECT_Flag (1<< 8)
+#define AMX_BF16_Flag            (1<<22)
+#define AVX512_FP16_Flag         (1<<23)
+#define AMX_TILE_Flag            (1<<24)
+#define AMX_INT8_Flag            (1<<25)
+
+// EXTENDED_FEATURES subleaf 1: EAX=7, ECX=1
+// EAX
+#define SHA512_Flag               1
+#define SM3_Flag                 (1<< 1)
+#define SM4_Flag                 (1<< 2)
+#define AVX_VNNI_Flag            (1<< 4)
+#define AVX512_BF16_Flag         (1<< 5)
+#define AMX_FP16_Flag            (1<<21)
+#define AVX_IFMA_Flag            (1<<23)
+// EDX
+#define AVX_VNNI_INT8_Flag       (1<< 4)
+#define AVX_NE_CONVERT_Flag      (1<< 5)
+#define AMX_COMPLEX_Flag         (1<< 8)
+#define AVX_VNNI_INT16_Flag      (1<<10)
+#define AVX10_Flag               (1<<19)
+#define APX_F_Flag               (1<<21)
+
+// AVX10_FEATURES: EAX=0x24, ECX=0
+// EBX
+#define AVX10_VERSION_mask        0xff      // bits [7:0]
+#define AVX10_128_Flag           (1<<16)
+#define AVX10_256_Flag           (1<<17)   
+#define AVX10_512_Flag           (1<<18)   
+
+// Use this to detect presence of feature
+#define AVX_mask     (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
+#define FMA3_mask    (FMA3_Flag|AVX_mask)
+#define AVX512_mask  (AVX512_VL_Flag|AVX512_BW_Flag|AVX512_DQ_Flag|AVX512_F_Flag)
+
+
 #ifndef __arm__
-static inline void cpuid(int functionnumber, int output[4]) {
+static inline void cpuid( unsigned int leaf, unsigned int subleaf,
+                          unsigned int output[4] )
+{
 #if defined (_MSC_VER) || defined (__INTEL_COMPILER)
-	// Microsoft or Intel compiler, intrin.h included
-	__cpuidex(output, functionnumber, 0);
+   // Microsoft or Intel compiler, intrin.h included
+   __cpuidex(output, leaf, subleaf );
 #elif defined(__GNUC__) || defined(__clang__)
-	// use inline assembly, Gnu/AT&T syntax
-	int a, b, c, d;
-	asm volatile("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(0));
-	output[0] = a;
-	output[1] = b;
-	output[2] = c;
-	output[3] = d;
+   // use inline assembly, Gnu/AT&T syntax
+   unsigned int a, b, c, d;
+   asm volatile( "cpuid"
+               : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
+               : "a"(leaf), "c"(subleaf) );
+   output[ EAX_Reg ] = a;
+   output[ EBX_Reg ] = b;
+   output[ ECX_Reg ] = c;
+   output[ EDX_Reg ] = d;
 #else
-	// unknown platform. try inline assembly with masm/intel syntax
-	__asm {
-		mov eax, functionnumber
-		xor ecx, ecx
-		cpuid;
-		mov esi, output
-		mov[esi], eax
-		mov[esi + 4], ebx
-		mov[esi + 8], ecx
-		mov[esi + 12], edx
-	}
+   // unknown platform. try inline assembly with masm/intel syntax
+   __asm {
+      mov eax, leaf
+      mov ecx, subleaf
+      cpuid;
+      mov esi, output
+      mov[esi], eax
+      mov[esi + 4], ebx
+      mov[esi + 8], ecx
+      mov[esi + 12], edx
+   }
 #endif
 }
 #else /* !__arm__ */
-#define cpuid(fn, out) out[0] = 0;
+#define cpuid(leaf, subleaf, out) out[0] = 0;
 #endif

 static inline void cpu_getname(char *outbuf, size_t maxsz)
@@ -211,13 +323,13 @@ static inline void cpu_getname(char *outbuf, size_t maxsz)
 #ifdef WIN32
   char brand[256] = { 0 };
   int output[4] = { 0 }, ext;
-   cpuid(0x80000000, output);
+   cpuid( 0x80000000, 0, output );
   ext = output[0];
   if (ext >= 0x80000004)
   {
      for (int i = 2; i <= (ext & 0xF); i++)
      {
-         cpuid(0x80000000+i, output);
+         cpuid( 0x80000000+i, 0, output);
         memcpy(&brand[(i-2) * 4*sizeof(int)], output, 4*sizeof(int));
      }
      snprintf(outbuf, maxsz, "%s", brand);
@@ -309,70 +421,97 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 #endif
 }
 
-// http://en.wikipedia.org/wiki/CPUID
+// Typical display format: AVX10.[version]_[vectorlength]; if the vector length
+// is omitted, 256 is the default.
+//    Ex: AVX10.1_512
+// Flags:
+// AVX10  128  256  512
+//   0     0    0    0    = AVX10 not supported
+//   1     1    1    0    = AVX10 256 bit max  (version 2)
+//   1     1    1    1    = AVX10 512 bit max  (version 1 granite rapids)
+// Other combinations are not defined.

-// CPUID commands
-#define VENDOR_ID            (0)
-#define CPU_INFO             (1)
-#define CACHE_TLB_DESCRIPTOR (2)
-#define EXTENDED_FEATURES    (7)
-#define HIGHEST_EXT_FUNCTION (0x80000000)
-#define EXTENDED_CPU_INFO    (0x80000001)
-#define CPU_BRAND_1          (0x80000002)
-#define CPU_BRAND_2          (0x80000003)
-#define CPU_BRAND_3          (0x80000004)
+// Test AVX10_Flag before AVX10_FEATURES flags.
+static inline bool has_avx10()
+{
+#ifdef __arm__
+    return false;
+#else
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 1, cpu_info );
+    return cpu_info[ EDX_Reg ] & AVX10_Flag;
+#endif
+}

-// Registers
-#define EAX_Reg  (0)
-#define EBX_Reg  (1)
-#define ECX_Reg  (2)
-#define EDX_Reg  (3)
+static inline unsigned int avx10_version()
+{
+#ifdef __arm__
+    return 0;
+#else
+    if ( has_avx10() )
+    {
+       unsigned int cpu_info[4] = { 0 };
+       cpuid( AVX10_FEATURES, 0, cpu_info );
+       return cpu_info[ EBX_Reg ] & AVX10_VERSION_mask;
+    }
+    return 0;
+#endif
+}

-// Feature flags
+static inline bool has_avx10_512()
+{
+#ifdef __arm__
+    return false;
+#else
+    if ( has_avx10() )
+    {
+       unsigned int cpu_info[4] = { 0 };
+       cpuid( AVX10_FEATURES, 0, cpu_info );
+       return cpu_info[ EBX_Reg ] & AVX10_512_Flag;
+    }
+    return false;
+#endif
+}

-// CPU_INFO ECX
-#define SSE3_Flag      1    
-#define SSSE3_Flag    (1<< 9)
-#define XOP_Flag      (1<<11)   // obsolete, only available on pre-Ryzen AMD
-#define FMA3_Flag     (1<<12)
-#define AES_Flag      (1<<25)
-#define SSE41_Flag    (1<<19)
-#define SSE42_Flag    (1<<20)
-#define AES_Flag      (1<<25)
-#define XSAVE_Flag    (1<<26) 
-#define OSXSAVE_Flag  (1<<27)
-#define AVX_Flag      (1<<28)
+static inline bool has_avx10_256()
+{
+#ifdef __arm__
+    return false;
+#else
+    if ( has_avx10() )
+    {
+       unsigned int cpu_info[4] = { 0 };
+       cpuid( AVX10_FEATURES, 0, cpu_info );
+       return cpu_info[ EBX_Reg ] & AVX10_256_Flag;
+    }
+    return false;
+#endif
+}

-// CPU_INFO EDX
-#define SSE_Flag      (1<<25)
-#define SSE2_Flag     (1<<26) 
-
-// EXTENDED_FEATURES EBX
-#define AVX2_Flag     (1<< 5)
-#define AVX512F_Flag  (1<<16)
-#define AVX512DQ_Flag (1<<17)
-#define SHA_Flag      (1<<29)
-#define AVX512BW_Flag (1<<30)
-#define AVX512VL_Flag (1<<31)
-
-// EXTENDED_FEATURES ECX
-#define AVX512VBMI_Flag  (1<<1) 
-#define AVX512VBMI2_Flag (1<<6)
-#define VAES_Flag        (1<<9)
-
-
-// Use this to detect presence of feature
-#define AVX_mask     (AVX_Flag|XSAVE_Flag|OSXSAVE_Flag)
-#define FMA3_mask    (FMA3_Flag|AVX_mask)
-#define AVX512_mask  (AVX512VL_Flag|AVX512BW_Flag|AVX512DQ_Flag|AVX512F_Flag)
+// Maximum vector length
+static inline unsigned int avx10_vector_length()
+{
+#ifdef __arm__
+    return 0;
+#else
+    if ( has_avx10() )
+    {
+       unsigned int cpu_info[4] = { 0 };
+       cpuid( AVX10_FEATURES, 0, cpu_info );
+       return cpu_info[ EBX_Reg ] & AVX10_512_Flag ? 512
+          : ( cpu_info[ EBX_Reg ] & AVX10_256_Flag ? 256 : 0 );
+    }
+    return 0;
+#endif
+}    

 static inline bool has_sha()
 {
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
    return cpu_info[ EBX_Reg ] & SHA_Flag;
 #endif
 }
@@ -382,8 +521,8 @@ static inline bool has_sse2()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( CPU_INFO, cpu_info );
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( CPU_INFO, 0, cpu_info );
    return cpu_info[ EDX_Reg ] & SSE2_Flag;
 #endif
 }
@@ -394,9 +533,9 @@ static inline bool has_aes_ni()
 #ifdef __arm__
 	return false;
 #else
-	int cpu_info[4] = { 0 };
-        cpuid( CPU_INFO, cpu_info );
-	return cpu_info[ ECX_Reg ] & AES_Flag;
+	unsigned int cpu_info[4] = { 0 };
+        cpuid( CPU_INFO, 0, cpu_info );
+	return cpu_info[ ECX_Reg ] & AES_NI_Flag;
 #endif
 }

@@ -406,8 +545,8 @@ static inline bool has_avx()
 #ifdef __arm__
        return false;
 #else
-        int cpu_info[4] = { 0 };
-        cpuid( CPU_INFO, cpu_info );
+        unsigned int cpu_info[4] = { 0 };
+        cpuid( CPU_INFO, 0, cpu_info );
        return ( ( cpu_info[ ECX_Reg ] & AVX_mask ) == AVX_mask );
 #endif
 }
@@ -418,8 +557,8 @@ static inline bool has_avx2()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
    return cpu_info[ EBX_Reg ] & AVX2_Flag;
 #endif
 }
@@ -429,9 +568,9 @@ static inline bool has_avx512f()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
-    return cpu_info[ EBX_Reg ] & AVX512F_Flag;
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
+    return cpu_info[ EBX_Reg ] & AVX512_F_Flag;
 #endif
 }

@@ -440,9 +579,9 @@ static inline bool has_avx512dq()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
-    return cpu_info[ EBX_Reg ] & AVX512DQ_Flag;
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
+    return cpu_info[ EBX_Reg ] & AVX512_DQ_Flag;
 #endif
 }

@@ -451,9 +590,9 @@ static inline bool has_avx512bw()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
-    return cpu_info[ EBX_Reg ] & AVX512BW_Flag;
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
+    return cpu_info[ EBX_Reg ] & AVX512_BW_Flag;
 #endif
 }

@@ -462,9 +601,9 @@ static inline bool has_avx512vl()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
-    return cpu_info[ EBX_Reg ] & AVX512VL_Flag;
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
+    return cpu_info[ EBX_Reg ] & AVX512_VL_Flag;
 #endif
 }

@@ -474,30 +613,19 @@ static inline bool has_avx512()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
    return ( ( cpu_info[ EBX_Reg ] & AVX512_mask ) == AVX512_mask );
 #endif
 }

-// AMD Zen3 added support for 256 bit VAES without requiring AVX512.
-// The original Intel spec requires AVX512F to support 512 bit VAES and 
-// requires AVX512VL to support 256 bit VAES.
-// The CPUID VAES bit alone can't distiguish 256 vs 512 bit.
-// If necessary:
-// VAES 256 & 512 = VAES && AVX512VL
-// VAES 512 = VAES && AVX512F  
-// VAES 256 = ( VAES && AVX512VL ) || ( VAES && !AVX512F )
-// VAES 512 only = VAES && AVX512F && !AVX512VL
-// VAES 256 only = VAES && !AVX512F
-
 static inline bool has_vaes()
 {
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
    return cpu_info[ ECX_Reg ] & VAES_Flag;
 #endif
 }
@@ -507,9 +635,9 @@ static inline bool has_vbmi()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
-    return cpu_info[ ECX_Reg ] & AVX512VBMI_Flag;
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
+    return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
 #endif
 }

@@ -518,9 +646,9 @@ static inline bool has_vbmi2()
 #ifdef __arm__
    return false;
 #else
-    int cpu_info[4] = { 0 };
-    cpuid( EXTENDED_FEATURES, cpu_info );
-    return cpu_info[ ECX_Reg ] & AVX512VBMI2_Flag;
+    unsigned int cpu_info[4] = { 0 };
+    cpuid( EXTENDED_FEATURES, 0, cpu_info );
+    return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
 #endif
 }

@@ -530,8 +658,8 @@ static inline bool has_xop()
 #ifdef __arm__
        return false;
 #else
-        int cpu_info[4] = { 0 };
-        cpuid( EXTENDED_CPU_INFO, cpu_info );
+        unsigned int cpu_info[4] = { 0 };
+        cpuid( EXTENDED_CPU_INFO, 0, cpu_info );
        return cpu_info[ ECX_Reg ] & XOP_Flag;
 #endif
 }
@@ -541,8 +669,8 @@ static inline bool has_fma3()
 #ifdef __arm__
        return false;
 #else
-        int cpu_info[4] = { 0 };
-        cpuid( CPU_INFO, cpu_info );
+        unsigned int cpu_info[4] = { 0 };
+        cpuid( CPU_INFO, 0, cpu_info );
        return ( ( cpu_info[ ECX_Reg ] & FMA3_mask ) == FMA3_mask );
 #endif
 }
@@ -552,8 +680,8 @@ static inline bool has_sse42()
 #ifdef __arm__
        return false;
 #else
-        int cpu_info[4] = { 0 };
-        cpuid( CPU_INFO, cpu_info );
+        unsigned int cpu_info[4] = { 0 };
+        cpuid( CPU_INFO, 0, cpu_info );
        return cpu_info[ ECX_Reg ] & SSE42_Flag;
 #endif
 }
@@ -563,16 +691,16 @@ static inline bool has_sse()
 #ifdef __arm__
        return false;
 #else
-        int cpu_info[4] = { 0 };
-        cpuid( CPU_INFO, cpu_info );
+        unsigned int cpu_info[4] = { 0 };
+        cpuid( CPU_INFO, 0, cpu_info );
        return cpu_info[ EDX_Reg ] & SSE_Flag;
 #endif
 }

 static inline uint32_t cpuid_get_highest_function_number()
 {
-  uint32_t cpu_info[4] = {0};
-  cpuid( VENDOR_ID, cpu_info);
+  unsigned int cpu_info[4] = {0};
+  cpuid( VENDOR_ID, 0, cpu_info);
  return cpu_info[ EAX_Reg ];
 }

@@ -605,8 +733,8 @@ static inline void cpu_bestfeature(char *outbuf, size_t maxsz)
 #else
 	int cpu_info[4] = { 0 };
 	int cpu_info_adv[4] = { 0 };
-	cpuid( CPU_INFO, cpu_info );
-	cpuid( EXTENDED_FEATURES, cpu_info_adv );
+	cpuid( CPU_INFO, 0, cpu_info );
+	cpuid( EXTENDED_FEATURES, 0, cpu_info_adv );

        if ( has_avx() && has_avx2() )
              sprintf(outbuf, "AVX2");
@@ -634,14 +762,14 @@ static inline void cpu_brand_string( char* s )
        sprintf( s, "ARM" );
 #else
    int cpu_info[4] = { 0 };
-    cpuid( VENDOR_ID, cpu_info );
+    cpuid( VENDOR_ID, 0, cpu_info );
    if ( cpu_info[ EAX_Reg ] >= 4 )
    {
-        cpuid( CPU_BRAND_1, cpu_info );
+        cpuid( CPU_BRAND_1, 0, cpu_info );
        memcpy( s, cpu_info, sizeof(cpu_info) );
-        cpuid( CPU_BRAND_2, cpu_info );
+        cpuid( CPU_BRAND_2, 0, cpu_info );
        memcpy( s + 16, cpu_info, sizeof(cpu_info) );
-        cpuid( CPU_BRAND_3, cpu_info );
+        cpuid( CPU_BRAND_3, 0, cpu_info );
        memcpy( s + 32, cpu_info, sizeof(cpu_info) );
    }
 #endif