v3.22.3

2025-09-17 23:44:27 +00:00 · 2023-06-14 11:07:40 -04:00
parent de564ccbde
commit 57a6b7b58b
31 changed files with 3724 additions and 3345 deletions
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -598,10 +598,10 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = m128_const1_64( 0x243F6A88243F6A88 ); \
-   V9 = m128_const1_64( 0x85A308D385A308D3 ); \
-   VA = m128_const1_64( 0x13198A2E13198A2E ); \
-   VB = m128_const1_64( 0x0370734403707344 ); \
+   V8 = _mm_set1_epi64x( 0x243F6A88243F6A88 ); \
+   V9 = _mm_set1_epi64x( 0x85A308D385A308D3 ); \
+   VA = _mm_set1_epi64x( 0x13198A2E13198A2E ); \
+   VB = _mm_set1_epi64x( 0x0370734403707344 ); \
   VC = _mm_set1_epi32( T0 ^ 0xA4093822 ); \
   VD = _mm_set1_epi32( T0 ^ 0x299F31D0 ); \
   VE = _mm_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -958,7 +958,6 @@ do { \
   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
   __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
-   __m256i shuf_bswap32; \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
@@ -967,16 +966,16 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
-   V9 = m256_const1_64( 0x85A308D385A308D3 ); \
-   VA = m256_const1_64( 0x13198A2E13198A2E ); \
-   VB = m256_const1_64( 0x0370734403707344 ); \
+   V8 = _mm256_set1_epi64x( 0x243F6A88243F6A88 ); \
+   V9 = _mm256_set1_epi64x( 0x85A308D385A308D3 ); \
+   VA = _mm256_set1_epi64x( 0x13198A2E13198A2E ); \
+   VB = _mm256_set1_epi64x( 0x0370734403707344 ); \
   VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
   VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
   VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
   VF = _mm256_set1_epi32( T1 ^ 0xEC4E6C89 ); \
-   shuf_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   const __m256i shuf_bswap32 = mm256_set2_64( \
+                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -1034,10 +1033,10 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = m256_const1_64( 0x243F6A88243F6A88 ); \
-   V9 = m256_const1_64( 0x85A308D385A308D3 ); \
-   VA = m256_const1_64( 0x13198A2E13198A2E ); \
-   VB = m256_const1_64( 0x0370734403707344 ); \
+   V8 = _mm256_set1_epi64x( 0x243F6A88243F6A88 ); \
+   V9 = _mm256_set1_epi64x( 0x85A308D385A308D3 ); \
+   VA = _mm256_set1_epi64x( 0x13198A2E13198A2E ); \
+   VB = _mm256_set1_epi64x( 0x0370734403707344 ); \
   VC = _mm256_set1_epi32( T0 ^ 0xA4093822 ); \
   VD = _mm256_set1_epi32( T0 ^ 0x299F31D0 ); \
   VE = _mm256_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -1100,23 +1099,23 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
   V[ 5] = H[5];
   V[ 6] = H[6];
   V[ 7] = H[7];
-   V[ 8] = m256_const1_32( CS0 );
-   V[ 9] = m256_const1_32( CS1 );
-   V[10] = m256_const1_32( CS2 );
-   V[11] = m256_const1_32( CS3 );
-   V[12] = m256_const1_32( CS4 ^ 0x280 );
-   V[13] = m256_const1_32( CS5 ^ 0x280 );
-   V[14] = m256_const1_32( CS6 );
-   V[15] = m256_const1_32( CS7 );
+   V[ 8] = _mm256_set1_epi32( CS0 );
+   V[ 9] = _mm256_set1_epi32( CS1 );
+   V[10] = _mm256_set1_epi32( CS2 );
+   V[11] = _mm256_set1_epi32( CS3 );
+   V[12] = _mm256_set1_epi32( CS4 ^ 0x280 );
+   V[13] = _mm256_set1_epi32( CS5 ^ 0x280 );
+   V[14] = _mm256_set1_epi32( CS6 );
+   V[15] = _mm256_set1_epi32( CS7 );

 // M[ 0:3 ] contain new message data including unique nonces in M[ 3].
 // M[ 5:12, 14 ] are always zero and not needed or used.
 // M[ 4], M[ 13], M[15] are constant and are initialized here.
 // M[ 5] is a special case, used as a cache for (M[13] ^ CSC).

-   M[ 4] = m256_const1_32( 0x80000000 );
+   M[ 4] = _mm256_set1_epi32( 0x80000000 );
   M[13] = m256_one_32;
-   M[15] = m256_const1_32( 80*8 );
+   M[15] = _mm256_set1_epi32( 80*8 );

   M[ 5] =_mm256_xor_si256( M[13], _mm256_set1_epi32( CSC ) );

@@ -1278,8 +1277,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
   ROUND256_8WAY_3;

   const __m256i shuf_bswap32 =
-                  m256_const_64( 0x1c1d1e1f18191a1b, 0x1415161710111213,
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+                  mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );

   H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
   H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
@@ -1615,7 +1613,8 @@ do { \
   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
   __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
-   __m512i shuf_bswap32; \
+   const __m512i shuf_bswap32 = mm512_bcast_m128( _mm_set_epi64x( \
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
@@ -1624,18 +1623,14 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
-   V9 = m512_const1_64( 0x85A308D385A308D3 ); \
-   VA = m512_const1_64( 0x13198A2E13198A2E ); \
-   VB = m512_const1_64( 0x0370734403707344 ); \
+   V8 = _mm512_set1_epi64( 0x243F6A88243F6A88 ); \
+   V9 = _mm512_set1_epi64( 0x85A308D385A308D3 ); \
+   VA = _mm512_set1_epi64( 0x13198A2E13198A2E ); \
+   VB = _mm512_set1_epi64( 0x0370734403707344 ); \
   VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
   VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
   VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
   VF = _mm512_set1_epi32( T1 ^ 0xEC4E6C89 ); \
-   shuf_bswap32 = m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233, \
-                                 0x2c2d2e2f28292a2b, 0x2425262720212223, \
-                                 0x1c1d1e1f18191a1b, 0x1415161710111213, \
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm512_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
   M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -1693,10 +1688,10 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = m512_const1_64( 0x243F6A88243F6A88 ); \
-   V9 = m512_const1_64( 0x85A308D385A308D3 ); \
-   VA = m512_const1_64( 0x13198A2E13198A2E ); \
-   VB = m512_const1_64( 0x0370734403707344 ); \
+   V8 = _mm512_set1_epi64( 0x243F6A88243F6A88 ); \
+   V9 = _mm512_set1_epi64( 0x85A308D385A308D3 ); \
+   VA = _mm512_set1_epi64( 0x13198A2E13198A2E ); \
+   VB = _mm512_set1_epi64( 0x0370734403707344 ); \
   VC = _mm512_set1_epi32( T0 ^ 0xA4093822 ); \
   VD = _mm512_set1_epi32( T0 ^ 0x299F31D0 ); \
   VE = _mm512_set1_epi32( T1 ^ 0x082EFA98 ); \
@@ -1763,23 +1758,23 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
   V[ 5] = H[5];
   V[ 6] = H[6];
   V[ 7] = H[7];
-   V[ 8] = m512_const1_32( CS0 );
-   V[ 9] = m512_const1_32( CS1 );
-   V[10] = m512_const1_32( CS2 );
-   V[11] = m512_const1_32( CS3 );
-   V[12] = m512_const1_32( CS4 ^ 0x280 );
-   V[13] = m512_const1_32( CS5 ^ 0x280 );
-   V[14] = m512_const1_32( CS6 );
-   V[15] = m512_const1_32( CS7 );
+   V[ 8] = _mm512_set1_epi32( CS0 );
+   V[ 9] = _mm512_set1_epi32( CS1 );
+   V[10] = _mm512_set1_epi32( CS2 );
+   V[11] = _mm512_set1_epi32( CS3 );
+   V[12] = _mm512_set1_epi32( CS4 ^ 0x280 );
+   V[13] = _mm512_set1_epi32( CS5 ^ 0x280 );
+   V[14] = _mm512_set1_epi32( CS6 );
+   V[15] = _mm512_set1_epi32( CS7 );

 // M[ 0:3 ] contain new message data including unique nonces in M[ 3].   
 // M[ 5:12, 14 ] are always zero and not needed or used, except M[5] as noted.
 // M[ 4], M[ 13], M[15] are constant and are initialized here.
 // M[ 5] is a special case, used as a cache for (M[13] ^ CSC).
   
-   M[ 4] = m512_const1_32( 0x80000000 );
+   M[ 4] = _mm512_set1_epi32( 0x80000000 );
   M[13] = m512_one_32;
-   M[15] = m512_const1_32( 80*8 );
+   M[15] = _mm512_set1_epi32( 80*8 );

   M[ 5] =_mm512_xor_si512( M[13], _mm512_set1_epi32( CSC ) );

@@ -1956,10 +1951,8 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,

   // Byte swap final hash
   const __m512i shuf_bswap32 =
-                  m512_const_64( 0x3c3d3e3f38393a3b, 0x3435363730313233,
-                                 0x2c2d2e2f28292a2b, 0x2425262720212223,
-                                 0x1c1d1e1f18191a1b, 0x1415161710111213,
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 );
+                  mm512_bcast_m128( _mm_set_epi64x( 
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
   H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
@@ -1981,14 +1974,14 @@ static void
 blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
                   const uint32_t *salt, int rounds )
 {
-   casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
-   casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
-   casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
-   casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A );
-   casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F );
-   casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
-   casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
-   casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );
+   casti_m128i( ctx->H, 0 ) = _mm_set1_epi64x( 0x6A09E6676A09E667 );
+   casti_m128i( ctx->H, 1 ) = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
+   casti_m128i( ctx->H, 2 ) = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
+   casti_m128i( ctx->H, 3 ) = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
+   casti_m128i( ctx->H, 4 ) = _mm_set1_epi64x( 0x510E527F510E527F );
+   casti_m128i( ctx->H, 5 ) = _mm_set1_epi64x( 0x9B05688C9B05688C );
+   casti_m128i( ctx->H, 6 ) = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   casti_m128i( ctx->H, 7 ) = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
   ctx->T0 = ctx->T1 = 0;
   ctx->ptr = 0;
   ctx->rounds = rounds;
@@ -2059,13 +2052,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
   else
      ctx->T0 -= 512 - bit_len;

-   buf[vptr] = m128_const1_64( 0x0000008000000080 );
+   buf[vptr] = _mm_set1_epi64x( 0x0000008000000080 );

   if ( vptr < 12 )
   {
      memset_zero_128( buf + vptr + 1, 13 - vptr  );
      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
-                                m128_const1_64( 0x0100000001000000ULL ) );
+                                _mm_set1_epi64x( 0x0100000001000000ULL ) );
      buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
      buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
      blake32_4way( ctx, buf + vptr, 64 - ptr );
@@ -2078,7 +2071,7 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
      ctx->T1 = 0xFFFFFFFFUL;
      memset_zero_128( buf, 56>>2 );
      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
-                                m128_const1_64( 0x0100000001000000ULL ) );
+                                _mm_set1_epi64x( 0x0100000001000000ULL ) );
      buf[ 14 ] = _mm_set1_epi32( bswap_32( th ) );
      buf[ 15 ] = _mm_set1_epi32( bswap_32( tl ) );
      blake32_4way( ctx, buf, 64 );
@@ -2097,14 +2090,14 @@ static void
 blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
                   const sph_u32 *salt, int rounds )
 {
-   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
-   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
-   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
-   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A );
-   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F );
-   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
-   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
-   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
+   casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
+   casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
+   casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
+   casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
+   casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527F510E527F );
+   casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C9B05688C );
+   casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
   sc->rounds = rounds;
@@ -2163,7 +2156,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL );
+   buf[ptr>>2] = _mm256_set1_epi64x( 0x0000008000000080ULL );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -2185,7 +2178,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
       memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
       if ( out_size_w32 == 8 )
           buf[52>>2] = _mm256_or_si256( buf[52>>2],
-                                m256_const1_64( 0x0100000001000000ULL ) );
+                                _mm256_set1_epi64x( 0x0100000001000000ULL ) );
       *(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
       *(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
@@ -2198,7 +2191,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
       sc->T1 = SPH_C32(0xFFFFFFFFUL);
       memset_zero_256( buf, 56>>2 );
       if ( out_size_w32 == 8 )
-           buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
+           buf[52>>2] = _mm256_set1_epi64x( 0x0100000001000000ULL );
       *(buf+(56>>2)) = _mm256_set1_epi32( bswap_32( th ) );
       *(buf+(60>>2)) = _mm256_set1_epi32( bswap_32( tl ) );
       blake32_8way( sc, buf, 64 );
@@ -2259,7 +2252,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>2] = m256_const1_32( 0x80000000 );
+   buf[ptr>>2] = _mm256_set1_epi32( 0x80000000 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -2312,14 +2305,14 @@ static void
 blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
                   const sph_u32 *salt, int rounds )
 {
-   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E6676A09E667 );
-   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE85BB67AE85 );
-   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF3723C6EF372 );
-   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53AA54FF53A );
-   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527F510E527F );
-   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C9B05688C );
-   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9AB1F83D9AB );
-   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD195BE0CD19 );
+   casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E6676A09E667 );
+   casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
+   casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
+   casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
+   casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527F510E527F );
+   casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C9B05688C );
+   casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
+   casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
   sc->rounds = rounds;
@@ -2376,7 +2369,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>2] = m512_const1_64( 0x0000008000000080ULL );
+   buf[ptr>>2] = _mm512_set1_epi64( 0x0000008000000080ULL );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -2398,7 +2391,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
       memset_zero_512( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
       if ( out_size_w32 == 8 )
           buf[52>>2] = _mm512_or_si512( buf[52>>2],
-                                m512_const1_64( 0x0100000001000000ULL ) );
+                                _mm512_set1_epi64( 0x0100000001000000ULL ) );
       buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
       buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
       blake32_16way( sc, buf + (ptr>>2), 64 - ptr );
@@ -2411,7 +2404,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
       sc->T1 = 0xFFFFFFFFUL;
       memset_zero_512( buf, 56>>2 );
       if ( out_size_w32 == 8 )
-          buf[52>>2] = m512_const1_64( 0x0100000001000000ULL );
+          buf[52>>2] = _mm512_set1_epi64( 0x0100000001000000ULL );
       buf[56>>2] = _mm512_set1_epi32( bswap_32( th ) );
       buf[60>>2] = _mm512_set1_epi32( bswap_32( tl ) );
       blake32_16way( sc, buf, 64 );
@@ -2473,7 +2466,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>2] = m512_const1_32( 0x80000000 );
+   buf[ptr>>2] = _mm512_set1_epi32( 0x80000000 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -350,7 +350,6 @@ static const sph_u64 CB[16] = {
  __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
  __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
  __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
-  __m512i shuf_bswap64; \
  V0 = H0; \
  V1 = H1; \
  V2 = H2; \
@@ -359,18 +358,16 @@ static const sph_u64 CB[16] = {
  V5 = H5; \
  V6 = H6; \
  V7 = H7; \
-  V8 = m512_const1_64( CB0 );  \
-  V9 = m512_const1_64( CB1 );  \
-  VA = m512_const1_64( CB2 );  \
-  VB = m512_const1_64( CB3 );  \
+  V8 = _mm512_set1_epi64( CB0 );  \
+  V9 = _mm512_set1_epi64( CB1 );  \
+  VA = _mm512_set1_epi64( CB2 );  \
+  VB = _mm512_set1_epi64( CB3 );  \
  VC = _mm512_set1_epi64( T0 ^ CB4 ); \
  VD = _mm512_set1_epi64( T0 ^ CB5 ); \
  VE = _mm512_set1_epi64( T1 ^ CB6 ); \
  VF = _mm512_set1_epi64( T1 ^ CB7 ); \
-  shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637, \
-                                0x28292a2b2c2d2e2f, 0x2021222324252627, \
-                                0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( \
+                                   0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
  M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
  M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
  M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -419,7 +416,6 @@ void blake512_8way_compress( blake_8way_big_context *sc )
  __m512i M8, M9, MA, MB, MC, MD, ME, MF;
  __m512i V0, V1, V2, V3, V4, V5, V6, V7;
  __m512i V8, V9, VA, VB, VC, VD, VE, VF;
-  __m512i shuf_bswap64;

  V0 = sc->H[0];
  V1 = sc->H[1];
@@ -429,19 +425,17 @@ void blake512_8way_compress( blake_8way_big_context *sc )
  V5 = sc->H[5];
  V6 = sc->H[6];
  V7 = sc->H[7];
-  V8 = m512_const1_64( CB0 );
-  V9 = m512_const1_64( CB1 );
-  VA = m512_const1_64( CB2 );
-  VB = m512_const1_64( CB3 );
+  V8 = _mm512_set1_epi64( CB0 );
+  V9 = _mm512_set1_epi64( CB1 );
+  VA = _mm512_set1_epi64( CB2 );
+  VB = _mm512_set1_epi64( CB3 );
  VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
  VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
  VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
  VF = _mm512_set1_epi64( sc->T1 ^ CB7 );

-  shuf_bswap64 = m512_const_64( 0x38393a3b3c3d3e3f, 0x3031323334353637,
-                                0x28292a2b2c2d2e2f, 0x2021222324252627,
-                                0x18191a1b1c1d1e1f, 0x1011121314151617,
-                                0x08090a0b0c0d0e0f, 0x0001020304050607 );
+  const __m512i shuf_bswap64 = mm512_bcast_m128( _mm_set_epi64x( 
+                                   0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

  M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
  M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
@@ -503,10 +497,10 @@ void blake512_8way_compress_le( blake_8way_big_context *sc )
  V5 = sc->H[5];
  V6 = sc->H[6];
  V7 = sc->H[7];
-  V8 = m512_const1_64( CB0 );
-  V9 = m512_const1_64( CB1 );
-  VA = m512_const1_64( CB2 );
-  VB = m512_const1_64( CB3 );
+  V8 = _mm512_set1_epi64( CB0 );
+  V9 = _mm512_set1_epi64( CB1 );
+  VA = _mm512_set1_epi64( CB2 );
+  VB = _mm512_set1_epi64( CB3 );
  VC = _mm512_set1_epi64( sc->T0 ^ CB4 );
  VD = _mm512_set1_epi64( sc->T0 ^ CB5 );
  VE = _mm512_set1_epi64( sc->T1 ^ CB6 );
@@ -565,23 +559,23 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
   __m512i V8, V9, VA, VB, VC, VD, VE, VF;

   // initial hash
-   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
-   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
-   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
-   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
-   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
-   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
-   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
-   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+   casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

   // fill buffer
   memcpy_512( sc->buf, (__m512i*)data, 80>>3 );
-   sc->buf[10] = m512_const1_64( 0x8000000000000000ULL );
+   sc->buf[10] = _mm512_set1_epi64( 0x8000000000000000ULL );
   sc->buf[11] = 
   sc->buf[12] = m512_zero;
   sc->buf[13] = m512_one_64;
   sc->buf[14] = m512_zero;
-   sc->buf[15] = m512_const1_64( 80*8 );
+   sc->buf[15] = _mm512_set1_epi64( 80*8 );

   // build working variables
   V0 = sc->H[0];
@@ -592,10 +586,10 @@ void blake512_8way_prehash_le( blake_8way_big_context *sc, __m512i *midstate,
   V5 = sc->H[5];
   V6 = sc->H[6];
   V7 = sc->H[7];
-   V8 = m512_const1_64( CB0 );
-   V9 = m512_const1_64( CB1 );
-   VA = m512_const1_64( CB2 );
-   VB = m512_const1_64( CB3 );
+   V8 = _mm512_set1_epi64( CB0 );
+   V9 = _mm512_set1_epi64( CB1 );
+   VA = _mm512_set1_epi64( CB2 );
+   VB = _mm512_set1_epi64( CB3 );
   VC = _mm512_set1_epi64( CB4 ^ 0x280ULL );
   VD = _mm512_set1_epi64( CB5 ^ 0x280ULL );
   VE = _mm512_set1_epi64( CB6 );
@@ -790,14 +784,14 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,

 void blake512_8way_init( blake_8way_big_context *sc )
 {
-   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
-   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
-   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
-   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
-   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
-   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
-   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
-   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+   casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
@@ -861,7 +855,7 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>3] = m512_const1_64( 0x80 );
+   buf[ptr>>3] = _mm512_set1_epi64( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
@@ -882,9 +876,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
   {
       memset_zero_512( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       buf[104>>3] = _mm512_or_si512( buf[104>>3],
-                                 m512_const1_64( 0x0100000000000000ULL ) );
-       buf[112>>3] = m512_const1_64( bswap_64( th ) );
-       buf[120>>3] = m512_const1_64( bswap_64( tl ) );
+                                 _mm512_set1_epi64( 0x0100000000000000ULL ) );
+       buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
+       buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );

       blake64_8way( sc, buf + (ptr>>3), 128 - ptr );
   }
@@ -896,9 +890,9 @@ blake64_8way_close( blake_8way_big_context *sc, void *dst )
       sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
       sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
       memset_zero_512( buf, 112>>3 );
-       buf[104>>3] = m512_const1_64( 0x0100000000000000ULL );
-       buf[112>>3] = m512_const1_64( bswap_64( th ) );
-       buf[120>>3] = m512_const1_64( bswap_64( tl ) );
+       buf[104>>3] = _mm512_set1_epi64( 0x0100000000000000ULL );
+       buf[112>>3] = _mm512_set1_epi64( bswap_64( th ) );
+       buf[120>>3] = _mm512_set1_epi64( bswap_64( tl ) );

       blake64_8way( sc, buf, 128 );
   }
@@ -912,14 +906,14 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
   
 // init

-   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
-   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
-   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
-   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
-   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
-   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
-   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
-   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+   casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
@@ -943,7 +937,7 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
   uint64_t th, tl;

   bit_len = sc->ptr << 3;
-   sc->buf[ptr64] = m512_const1_64( 0x80 );
+   sc->buf[ptr64] = _mm512_set1_epi64( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -961,9 +955,9 @@ void blake512_8way_full( blake_8way_big_context *sc, void * dst,
      sc->T0 -= 1024 - bit_len;

   memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
-   sc->buf[13] = m512_const1_64( 0x0100000000000000ULL );
-   sc->buf[14] = m512_const1_64( bswap_64( th ) );
-   sc->buf[15] = m512_const1_64( bswap_64( tl ) );
+   sc->buf[13] = _mm512_set1_epi64( 0x0100000000000000ULL );
+   sc->buf[14] = _mm512_set1_epi64( bswap_64( th ) );
+   sc->buf[15] = _mm512_set1_epi64( bswap_64( tl ) );

   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;
@@ -979,14 +973,14 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,

 // init

-   casti_m512i( sc->H, 0 ) = m512_const1_64( 0x6A09E667F3BCC908 );
-   casti_m512i( sc->H, 1 ) = m512_const1_64( 0xBB67AE8584CAA73B );
-   casti_m512i( sc->H, 2 ) = m512_const1_64( 0x3C6EF372FE94F82B );
-   casti_m512i( sc->H, 3 ) = m512_const1_64( 0xA54FF53A5F1D36F1 );
-   casti_m512i( sc->H, 4 ) = m512_const1_64( 0x510E527FADE682D1 );
-   casti_m512i( sc->H, 5 ) = m512_const1_64( 0x9B05688C2B3E6C1F );
-   casti_m512i( sc->H, 6 ) = m512_const1_64( 0x1F83D9ABFB41BD6B );
-   casti_m512i( sc->H, 7 ) = m512_const1_64( 0x5BE0CD19137E2179 );
+   casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
+   casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
+   casti_m512i( sc->H, 2 ) = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
+   casti_m512i( sc->H, 3 ) = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
+   casti_m512i( sc->H, 4 ) = _mm512_set1_epi64( 0x510E527FADE682D1 );
+   casti_m512i( sc->H, 5 ) = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
+   casti_m512i( sc->H, 6 ) = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
+   casti_m512i( sc->H, 7 ) = _mm512_set1_epi64( 0x5BE0CD19137E2179 );

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
@@ -1010,7 +1004,7 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,
   uint64_t th, tl;

   bit_len = sc->ptr << 3;
-   sc->buf[ptr64] = m512_const1_64( 0x8000000000000000ULL );
+   sc->buf[ptr64] = _mm512_set1_epi64( 0x8000000000000000ULL );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -1029,8 +1023,8 @@ void blake512_8way_full_le( blake_8way_big_context *sc, void * dst,

   memset_zero_512( sc->buf + ptr64 + 1, 13 - ptr64 );
   sc->buf[13] = m512_one_64;
-   sc->buf[14] = m512_const1_64( th );
-   sc->buf[15] = m512_const1_64( tl );
+   sc->buf[14] = _mm512_set1_epi64( th );
+   sc->buf[15] = _mm512_set1_epi64( tl );

   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;
@@ -1092,7 +1086,6 @@ blake512_8way_close(void *cc, void *dst)
  __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
  __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
  __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
-  __m256i shuf_bswap64; \
  V0 = H0; \
  V1 = H1; \
  V2 = H2; \
@@ -1101,16 +1094,16 @@ blake512_8way_close(void *cc, void *dst)
  V5 = H5; \
  V6 = H6; \
  V7 = H7; \
-  V8 = m256_const1_64( CB0 );  \
-  V9 = m256_const1_64( CB1 );  \
-  VA = m256_const1_64( CB2 );  \
-  VB = m256_const1_64( CB3 );  \
+  V8 = _mm256_set1_epi64x( CB0 );  \
+  V9 = _mm256_set1_epi64x( CB1 );  \
+  VA = _mm256_set1_epi64x( CB2 );  \
+  VB = _mm256_set1_epi64x( CB3 );  \
  VC = _mm256_set1_epi64x( T0 ^ CB4 ); \
  VD = _mm256_set1_epi64x( T0 ^ CB5 ); \
  VE = _mm256_set1_epi64x( T1 ^ CB6 ); \
  VF = _mm256_set1_epi64x( T1 ^ CB7 ); \
-  shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617, \
-                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+  const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x( \
+                             0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -1160,7 +1153,6 @@ void blake512_4way_compress( blake_4way_big_context *sc )
  __m256i M8, M9, MA, MB, MC, MD, ME, MF;
  __m256i V0, V1, V2, V3, V4, V5, V6, V7;
  __m256i V8, V9, VA, VB, VC, VD, VE, VF;
-  __m256i shuf_bswap64;

  V0 = sc->H[0];
  V1 = sc->H[1];
@@ -1170,20 +1162,20 @@ void blake512_4way_compress( blake_4way_big_context *sc )
  V5 = sc->H[5];
  V6 = sc->H[6];
  V7 = sc->H[7];
-  V8 = m256_const1_64( CB0 );
-  V9 = m256_const1_64( CB1 );
-  VA = m256_const1_64( CB2 );
-  VB = m256_const1_64( CB3 );
+  V8 = _mm256_set1_epi64x( CB0 );
+  V9 = _mm256_set1_epi64x( CB1 );
+  VA = _mm256_set1_epi64x( CB2 );
+  VB = _mm256_set1_epi64x( CB3 );
  VC = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
-                             m256_const1_64( CB4 ) );
+                             _mm256_set1_epi64x( CB4 ) );
  VD = _mm256_xor_si256( _mm256_set1_epi64x( sc->T0 ),
-                             m256_const1_64( CB5 ) );
+                             _mm256_set1_epi64x( CB5 ) );
  VE = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
-                             m256_const1_64( CB6 ) );
+                             _mm256_set1_epi64x( CB6 ) );
  VF = _mm256_xor_si256( _mm256_set1_epi64x( sc->T1 ),
-                             m256_const1_64( CB7 ) );
-  shuf_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f, 0x1011121314151617,
-                                0x08090a0b0c0d0e0f, 0x0001020304050607 );
+                             _mm256_set1_epi64x( CB7 ) );
+  const __m256i shuf_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
+                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

  M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
  M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
@@ -1236,23 +1228,23 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
   __m256i V8, V9, VA, VB, VC, VD, VE, VF;

   // initial hash
-   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
-   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
-   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
-   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
-   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
-   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
-   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
-   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+   casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
   
   // fill buffer
   memcpy_256( sc->buf, (__m256i*)data, 80>>3 );
-   sc->buf[10] = m256_const1_64( 0x8000000000000000ULL );
+   sc->buf[10] = _mm256_set1_epi64x( 0x8000000000000000ULL );
   sc->buf[11] = m256_zero;
   sc->buf[12] = m256_zero;
   sc->buf[13] = m256_one_64;
   sc->buf[14] = m256_zero;
-   sc->buf[15] = m256_const1_64( 80*8 );
+   sc->buf[15] = _mm256_set1_epi64x( 80*8 );

   // build working variables
   V0 = sc->H[0];
@@ -1263,10 +1255,10 @@ void blake512_4way_prehash_le( blake_4way_big_context *sc, __m256i *midstate,
   V5 = sc->H[5];
   V6 = sc->H[6];
   V7 = sc->H[7];
-   V8 = m256_const1_64( CB0 );
-   V9 = m256_const1_64( CB1 );
-   VA = m256_const1_64( CB2 );
-   VB = m256_const1_64( CB3 );
+   V8 = _mm256_set1_epi64x( CB0 );
+   V9 = _mm256_set1_epi64x( CB1 );
+   VA = _mm256_set1_epi64x( CB2 );
+   VB = _mm256_set1_epi64x( CB3 );
   VC = _mm256_set1_epi64x( CB4 ^ 0x280ULL );
   VD = _mm256_set1_epi64x( CB5 ^ 0x280ULL );
   VE = _mm256_set1_epi64x( CB6 );
@@ -1446,14 +1438,14 @@ void blake512_4way_final_le( blake_4way_big_context *sc, void *hash,

 void blake512_4way_init( blake_4way_big_context *sc )
 {
-   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
-   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
-   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
-   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
-   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
-   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
-   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
-   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+   casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
@@ -1513,7 +1505,7 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>3] = m256_const1_64( 0x80 );
+   buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
@@ -1535,9 +1527,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
   {
       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       buf[104>>3] = _mm256_or_si256( buf[104>>3],
-                                 m256_const1_64( 0x0100000000000000ULL ) );
-       buf[112>>3] = m256_const1_64( bswap_64( th ) );
-       buf[120>>3] = m256_const1_64( bswap_64( tl ) );
+                                 _mm256_set1_epi64x( 0x0100000000000000ULL ) );
+       buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
+       buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
@@ -1549,9 +1541,9 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( buf, 112>>3 ); 
-       buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
-       buf[112>>3] = m256_const1_64( bswap_64( th ) );
-       buf[120>>3] = m256_const1_64( bswap_64( tl ) );
+       buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
+       buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
+       buf[120>>3] = _mm256_set1_epi64x( bswap_64( tl ) );

       blake64_4way( sc, buf, 128 );
   }
@@ -1565,14 +1557,14 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,

 // init

-   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
-   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
-   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
-   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
-   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
-   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
-   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
-   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );
+   casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );

   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
@@ -1596,7 +1588,7 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
   uint64_t th, tl;

   bit_len = sc->ptr << 3;
-   sc->buf[ptr64] = m256_const1_64( 0x80 );
+   sc->buf[ptr64] = _mm256_set1_epi64x( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if ( sc->ptr == 0 )
@@ -1613,9 +1605,9 @@ void blake512_4way_full( blake_4way_big_context *sc, void * dst,
        sc->T0 -= 1024 - bit_len;

   memset_zero_256( sc->buf + ptr64 + 1, 13 - ptr64 );
-   sc->buf[13] = m256_const1_64( 0x0100000000000000ULL );
-   sc->buf[14] = m256_const1_64( bswap_64( th ) );
-   sc->buf[15] = m256_const1_64( bswap_64( tl ) );
+   sc->buf[13] = _mm256_set1_epi64x( 0x0100000000000000ULL );
+   sc->buf[14] = _mm256_set1_epi64x( bswap_64( th ) );
+   sc->buf[15] = _mm256_set1_epi64x( bswap_64( tl ) );

   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -221,14 +221,14 @@ int cube_4way_init( cube_4way_context *sp, int hashbitlen, int rounds,
    sp->rounds    = rounds;
    sp->pos       = 0;

-    h[ 0] = m512_const1_128( iv[0] );
-    h[ 1] = m512_const1_128( iv[1] );
-    h[ 2] = m512_const1_128( iv[2] );
-    h[ 3] = m512_const1_128( iv[3] );
-    h[ 4] = m512_const1_128( iv[4] );
-    h[ 5] = m512_const1_128( iv[5] );
-    h[ 6] = m512_const1_128( iv[6] );
-    h[ 7] = m512_const1_128( iv[7] );
+    h[ 0] = mm512_bcast_m128( iv[0] );
+    h[ 1] = mm512_bcast_m128( iv[1] );
+    h[ 2] = mm512_bcast_m128( iv[2] );
+    h[ 3] = mm512_bcast_m128( iv[3] );
+    h[ 4] = mm512_bcast_m128( iv[4] );
+    h[ 5] = mm512_bcast_m128( iv[5] );
+    h[ 6] = mm512_bcast_m128( iv[6] );
+    h[ 7] = mm512_bcast_m128( iv[7] );

    return 0;
 }
@@ -259,11 +259,11 @@ int cube_4way_close( cube_4way_context *sp, void *output )

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
-                                 m512_const2_64( 0, 0x0000000000000080 ) );
+                         mm512_bcast128lo_64( 0x0000000000000080 ) );
    transform_4way( sp );

    sp->h[7] = _mm512_xor_si512( sp->h[7],
-                                 m512_const2_64( 0x0000000100000000, 0 ) );
+                         mm512_bcast128hi_64( 0x0000000100000000 ) );

    for ( i = 0; i < 10; ++i ) 
       transform_4way( sp );
@@ -283,14 +283,14 @@ int cube_4way_full( cube_4way_context *sp, void *output,  int hashbitlen,
    sp->rounds    = 16;
    sp->pos       = 0;

-    h[ 0] = m512_const1_128( iv[0] );
-    h[ 1] = m512_const1_128( iv[1] );
-    h[ 2] = m512_const1_128( iv[2] );
-    h[ 3] = m512_const1_128( iv[3] );
-    h[ 4] = m512_const1_128( iv[4] );
-    h[ 5] = m512_const1_128( iv[5] );
-    h[ 6] = m512_const1_128( iv[6] );
-    h[ 7] = m512_const1_128( iv[7] );
+    h[ 0] = mm512_bcast_m128( iv[0] );
+    h[ 1] = mm512_bcast_m128( iv[1] );
+    h[ 2] = mm512_bcast_m128( iv[2] );
+    h[ 3] = mm512_bcast_m128( iv[3] );
+    h[ 4] = mm512_bcast_m128( iv[4] );
+    h[ 5] = mm512_bcast_m128( iv[5] );
+    h[ 6] = mm512_bcast_m128( iv[6] );
+    h[ 7] = mm512_bcast_m128( iv[7] );

    const int len = size >> 4;
    const __m512i *in = (__m512i*)data;
@@ -310,11 +310,11 @@ int cube_4way_full( cube_4way_context *sp, void *output,  int hashbitlen,

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
-                                    m512_const2_64( 0, 0x0000000000000080 ) );
+                         mm512_bcast128lo_64( 0x0000000000000080 ) );
    transform_4way( sp );

    sp->h[7] = _mm512_xor_si512( sp->h[7],
-                                    m512_const2_64( 0x0000000100000000, 0 ) );
+                         mm512_bcast128hi_64( 0x0000000100000000 ) );

    for ( i = 0; i < 10; ++i )
       transform_4way( sp );
@@ -336,14 +336,14 @@ int cube_4way_2buf_full( cube_4way_2buf_context *sp,
    sp->rounds    = 16;
    sp->pos       = 0;

-    h1[0] = h0[0] = m512_const1_128( iv[0] );
-    h1[1] = h0[1] = m512_const1_128( iv[1] );
-    h1[2] = h0[2] = m512_const1_128( iv[2] );
-    h1[3] = h0[3] = m512_const1_128( iv[3] );
-    h1[4] = h0[4] = m512_const1_128( iv[4] );
-    h1[5] = h0[5] = m512_const1_128( iv[5] );
-    h1[6] = h0[6] = m512_const1_128( iv[6] );
-    h1[7] = h0[7] = m512_const1_128( iv[7] );
+    h1[0] = h0[0] = mm512_bcast_m128( iv[0] );
+    h1[1] = h0[1] = mm512_bcast_m128( iv[1] );
+    h1[2] = h0[2] = mm512_bcast_m128( iv[2] );
+    h1[3] = h0[3] = mm512_bcast_m128( iv[3] );
+    h1[4] = h0[4] = mm512_bcast_m128( iv[4] );
+    h1[5] = h0[5] = mm512_bcast_m128( iv[5] );
+    h1[6] = h0[6] = mm512_bcast_m128( iv[6] );
+    h1[7] = h0[7] = mm512_bcast_m128( iv[7] );

    const int len = size >> 4;
    const __m512i *in0 = (__m512i*)data0;
@@ -365,13 +365,13 @@ int cube_4way_2buf_full( cube_4way_2buf_context *sp,
    }

    // pos is zero for 64 byte data, 1 for 80 byte data.
-    __m512i tmp = m512_const2_64( 0, 0x0000000000000080 );
+    __m512i tmp = mm512_bcast128lo_64( 0x0000000000000080 );
    sp->h0[ sp->pos ] = _mm512_xor_si512( sp->h0[ sp->pos ], tmp );
    sp->h1[ sp->pos ] = _mm512_xor_si512( sp->h1[ sp->pos ], tmp );

    transform_4way_2buf( sp );

-    tmp = m512_const2_64( 0x0000000100000000, 0 );
+    tmp = mm512_bcast128hi_64( 0x0000000100000000 );
    sp->h0[7] = _mm512_xor_si512( sp->h0[7], tmp );
    sp->h1[7] = _mm512_xor_si512( sp->h1[7], tmp );

@@ -384,7 +384,6 @@ int cube_4way_2buf_full( cube_4way_2buf_context *sp,
    return 0;
 }

-
 int cube_4way_update_close( cube_4way_context *sp, void *output,
                               const void *data, size_t size )
 {
@@ -406,11 +405,11 @@ int cube_4way_update_close( cube_4way_context *sp, void *output,

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->h[ sp->pos ] = _mm512_xor_si512( sp->h[ sp->pos ],
-                                    m512_const2_64( 0, 0x0000000000000080 ) );
+                          mm512_bcast128lo_64( 0x0000000000000080 ) );
    transform_4way( sp );

    sp->h[7] = _mm512_xor_si512( sp->h[7],
-                                    m512_const2_64( 0x0000000100000000, 0 ) );
+                          mm512_bcast128hi_64( 0x0000000100000000 ) );

    for ( i = 0; i < 10; ++i )
       transform_4way( sp );
@@ -508,14 +507,14 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
    sp->rounds    = rounds;
    sp->pos       = 0;

-    h[ 0] = m256_const1_128( iv[0] );
-    h[ 1] = m256_const1_128( iv[1] );
-    h[ 2] = m256_const1_128( iv[2] );
-    h[ 3] = m256_const1_128( iv[3] );
-    h[ 4] = m256_const1_128( iv[4] );
-    h[ 5] = m256_const1_128( iv[5] );
-    h[ 6] = m256_const1_128( iv[6] );
-    h[ 7] = m256_const1_128( iv[7] );
+    h[ 0] = mm256_bcast_m128( iv[0] );
+    h[ 1] = mm256_bcast_m128( iv[1] );
+    h[ 2] = mm256_bcast_m128( iv[2] );
+    h[ 3] = mm256_bcast_m128( iv[3] );
+    h[ 4] = mm256_bcast_m128( iv[4] );
+    h[ 5] = mm256_bcast_m128( iv[5] );
+    h[ 6] = mm256_bcast_m128( iv[6] );
+    h[ 7] = mm256_bcast_m128( iv[7] );
    
    return 0;
 }
@@ -546,13 +545,14 @@ int cube_2way_close( cube_2way_context *sp, void *output )

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
-                                   m256_const2_64( 0, 0x0000000000000080 ) );
+                                   mm256_bcast128lo_64( 0x0000000000000080 ) );
    transform_2way( sp );

    sp->h[7] = _mm256_xor_si256( sp->h[7],
-                                   m256_const2_64( 0x0000000100000000, 0 ) );
+                                   mm256_bcast128hi_64( 0x0000000100000000 ) );

-    for ( i = 0; i < 10; ++i )           transform_2way( sp );
+    for ( i = 0; i < 10; ++i )  
+       transform_2way( sp );

    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
@@ -579,13 +579,14 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
-                                    m256_const2_64( 0, 0x0000000000000080 ) );
+                                    mm256_bcast128lo_64( 0x0000000000000080 ) );
    transform_2way( sp );

    sp->h[7] = _mm256_xor_si256( sp->h[7],
-                                    m256_const2_64( 0x0000000100000000, 0 ) );
+                                    mm256_bcast128hi_64( 0x0000000100000000 ) );

-    for ( i = 0; i < 10; ++i )    transform_2way( sp );
+    for ( i = 0; i < 10; ++i )
+       transform_2way( sp );

    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
@@ -602,14 +603,14 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,
    sp->rounds    = 16;
    sp->pos       = 0;

-    h[ 0] = m256_const1_128( iv[0] );
-    h[ 1] = m256_const1_128( iv[1] );
-    h[ 2] = m256_const1_128( iv[2] );
-    h[ 3] = m256_const1_128( iv[3] );
-    h[ 4] = m256_const1_128( iv[4] );
-    h[ 5] = m256_const1_128( iv[5] );
-    h[ 6] = m256_const1_128( iv[6] );
-    h[ 7] = m256_const1_128( iv[7] );
+    h[ 0] = mm256_bcast_m128( iv[0] );
+    h[ 1] = mm256_bcast_m128( iv[1] );
+    h[ 2] = mm256_bcast_m128( iv[2] );
+    h[ 3] = mm256_bcast_m128( iv[3] );
+    h[ 4] = mm256_bcast_m128( iv[4] );
+    h[ 5] = mm256_bcast_m128( iv[5] );
+    h[ 6] = mm256_bcast_m128( iv[6] );
+    h[ 7] = mm256_bcast_m128( iv[7] );

    const int len = size >> 4;
    const __m256i *in = (__m256i*)data;
@@ -629,13 +630,14 @@ int cube_2way_full( cube_2way_context *sp, void *output, int hashbitlen,

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
-                                    m256_const2_64( 0, 0x0000000000000080 ) );
+                                    mm256_bcast128lo_64( 0x0000000000000080 ) );
    transform_2way( sp );

    sp->h[7] = _mm256_xor_si256( sp->h[7],
-                                    m256_const2_64( 0x0000000100000000, 0 ) );
+                                    mm256_bcast128hi_64( 0x0000000100000000 ) );

-    for ( i = 0; i < 10; ++i )    transform_2way( sp );
+    for ( i = 0; i < 10; ++i )
+       transform_2way( sp );

    memcpy( hash, sp->h, sp->hashlen<<5 );
    return 0;
--- a/algo/echo/echo-hash-4way.c
+++ b/algo/echo/echo-hash-4way.c
@@ -162,9 +162,9 @@ void echo_4way_compress( echo_4way_context *ctx, const __m512i *pmsg,
  unsigned int r, b, i, j;
  __m512i t1, t2, s2, k1;
  __m512i _state[4][4], _state2[4][4], _statebackup[4][4]; 
-  __m512i one = m512_one_128;
-  __m512i mul2mask = m512_const2_64( 0, 0x00001b00 );
-  __m512i lsbmask  = m512_const1_32( 0x01010101 ); 
+  const __m512i one = mm512_bcast128lo_64( 1 ); 
+  const __m512i mul2mask = mm512_bcast128lo_64( 0x00001b00 );
+  const __m512i lsbmask  = _mm512_set1_epi32( 0x01010101 ); 

  _state[ 0 ][ 0 ] = ctx->state[ 0 ][ 0 ];
  _state[ 0 ][ 1 ] = ctx->state[ 0 ][ 1 ];
@@ -264,16 +264,16 @@ int echo_4way_init( echo_4way_context *ctx, int nHashSize )
 		ctx->uHashSize = 256;
 		ctx->uBlockLength = 192;
 		ctx->uRounds = 8;
-		ctx->hashsize = m512_const2_64( 0, 0x100 );
-		ctx->const1536 = m512_const2_64( 0, 0x600 );
+      ctx->hashsize = mm512_bcast128lo_64( 0x100 );
+      ctx->const1536 = mm512_bcast128lo_64( 0x600 );
 		break;

 	case 512:
 		ctx->uHashSize = 512;
 		ctx->uBlockLength = 128;
 		ctx->uRounds = 10;
-		ctx->hashsize = m512_const2_64( 0, 0x200 );
-		ctx->const1536 = m512_const2_64( 0, 0x400);
+      ctx->hashsize = mm512_bcast128lo_64( 0x200 );
+      ctx->const1536 = mm512_bcast128lo_64( 0x400);
 		break;

 	default:
@@ -305,7 +305,7 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
   {
      echo_4way_compress( state, data, 1 );
      state->processed_bits = 1024;
-      remainingbits = m512_const2_64( 0, -1024 );
+      remainingbits = mm512_bcast128lo_64( -1024 );
      vlen = 0;
   }
   else
@@ -313,13 +313,15 @@ int echo_4way_update_close( echo_4way_context *state, void *hashval,
      vlen = databitlen / 128;  // * 4 lanes / 128 bits per lane
      memcpy_512( state->buffer, data, vlen );
      state->processed_bits += (unsigned int)( databitlen );
-      remainingbits = m512_const2_64( 0, (uint64_t)databitlen );
+      remainingbits = mm512_bcast128lo_64( (uint64_t)databitlen );
   }

-   state->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
+   state->buffer[ vlen ] = mm512_bcast128lo_64( 0x80 );
   memset_zero_512( state->buffer + vlen + 1, vblen - vlen - 2 );
-   state->buffer[ vblen-2 ] = m512_const2_64( (uint64_t)state->uHashSize << 48, 0 );
-   state->buffer[ vblen-1 ] = m512_const2_64( 0, state->processed_bits);
+   state->buffer[ vblen-2 ] =
+           mm512_bcast128hi_64( (uint64_t)state->uHashSize << 48 );
+   state->buffer[ vblen-1 ] =
+           mm512_bcast128lo_64( state->processed_bits );

   state->k = _mm512_add_epi64( state->k, remainingbits );
   state->k = _mm512_sub_epi64( state->k, state->const1536 );
@@ -352,16 +354,16 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
         ctx->uHashSize = 256;
         ctx->uBlockLength = 192;
         ctx->uRounds = 8;
-         ctx->hashsize = m512_const2_64( 0, 0x100 );
-         ctx->const1536 = m512_const2_64( 0, 0x600 );
+         ctx->hashsize = mm512_bcast128lo_64( 0x100 );
+         ctx->const1536 = mm512_bcast128lo_64( 0x600 );
         break;

      case 512:
         ctx->uHashSize = 512;
         ctx->uBlockLength = 128;
         ctx->uRounds = 10;
-         ctx->hashsize = m512_const2_64( 0, 0x200 );
-         ctx->const1536 = m512_const2_64( 0, 0x400 );
+         ctx->hashsize = mm512_bcast128lo_64( 0x200 );
+         ctx->const1536 = mm512_bcast128lo_64( 0x400 );
         break;

      default:
@@ -388,7 +390,7 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
   {
      echo_4way_compress( ctx, data, 1 );
      ctx->processed_bits = 1024;
-      remainingbits = m512_const2_64( 0, -1024 );
+      remainingbits = mm512_bcast128lo_64( -1024 );
      vlen = 0;
   }
   else
@@ -396,14 +398,14 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,
      vlen = databitlen / 128;  // * 4 lanes / 128 bits per lane
      memcpy_512( ctx->buffer, data, vlen );
      ctx->processed_bits += (unsigned int)( databitlen );
-      remainingbits = m512_const2_64( 0, databitlen );
+      remainingbits = mm512_bcast128lo_64( databitlen );
   }

-   ctx->buffer[ vlen ] = m512_const2_64( 0, 0x80 );
+   ctx->buffer[ vlen ] = mm512_bcast128lo_64( 0x80 );
   memset_zero_512( ctx->buffer + vlen + 1, vblen - vlen - 2 );
   ctx->buffer[ vblen-2 ] =
-                     m512_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
-   ctx->buffer[ vblen-1 ] = m512_const2_64( 0, ctx->processed_bits);
+               mm512_bcast128hi_64( (uint64_t)ctx->uHashSize << 48 );
+   ctx->buffer[ vblen-1 ] = mm512_bcast128lo_64( ctx->processed_bits);

   ctx->k = _mm512_add_epi64( ctx->k, remainingbits );
   ctx->k = _mm512_sub_epi64( ctx->k, ctx->const1536 );
@@ -425,9 +427,9 @@ int echo_4way_full( echo_4way_context *ctx, void *hashval, int nHashSize,

 // AVX2 + VAES

-#define mul2mask_2way   m256_const2_64( 0, 0x0000000000001b00 ) 
+#define mul2mask_2way   mm256_bcast128lo_64( 0x0000000000001b00 ) 

-#define lsbmask_2way    m256_const1_32( 0x01010101 ) 
+#define lsbmask_2way    _mm256_set1_epi32( 0x01010101 ) 

 #define ECHO_SUBBYTES4_2WAY( state, j ) \
   state[0][j] = _mm256_aesenc_epi128( state[0][j], k1 ); \
@@ -679,16 +681,16 @@ int echo_2way_init( echo_2way_context *ctx, int nHashSize )
                        ctx->uHashSize = 256;
                        ctx->uBlockLength = 192;
                        ctx->uRounds = 8;
-                        ctx->hashsize = m256_const2_64( 0, 0x100 );
-                        ctx->const1536 = m256_const2_64( 0, 0x600 );
+                        ctx->hashsize = mm256_bcast128lo_64( 0x100 );
+                        ctx->const1536 = mm256_bcast128lo_64( 0x600 );
                        break;

                case 512:
                        ctx->uHashSize = 512;
                        ctx->uBlockLength = 128;
                        ctx->uRounds = 10;
-                        ctx->hashsize = m256_const2_64( 0, 0x200 );
-                        ctx->const1536 = m256_const2_64( 0, 0x400 );
+                        ctx->hashsize = mm256_bcast128lo_64( 0x200 );
+                        ctx->const1536 = mm256_bcast128lo_64( 0x400 );
                        break;

                default:
@@ -720,20 +722,20 @@ int echo_2way_update_close( echo_2way_context *state, void *hashval,
   {
      echo_2way_compress( state, data, 1 );
      state->processed_bits = 1024;
-      remainingbits = m256_const2_64( 0, -1024 );
+      remainingbits = mm256_bcast128lo_64( -1024 );
      vlen = 0;
   }
   else
   {
      memcpy_256( state->buffer, data, vlen );
      state->processed_bits += (unsigned int)( databitlen );
-      remainingbits = m256_const2_64( 0, databitlen );
+      remainingbits = mm256_bcast128lo_64( databitlen );
   }

-   state->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
+   state->buffer[ vlen ] = mm256_bcast128lo_64( 0x80 );
   memset_zero_256( state->buffer + vlen + 1, vblen - vlen - 2 );
-   state->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)state->uHashSize << 48, 0 );
-   state->buffer[ vblen-1 ] = m256_const2_64( 0, state->processed_bits );
+   state->buffer[ vblen-2 ] = mm256_bcast128hi_64( (uint64_t)state->uHashSize << 48 );
+   state->buffer[ vblen-1 ] = mm256_bcast128lo_64( state->processed_bits );

   state->k = _mm256_add_epi64( state->k, remainingbits );
   state->k = _mm256_sub_epi64( state->k, state->const1536 );
@@ -766,16 +768,16 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
         ctx->uHashSize = 256;
         ctx->uBlockLength = 192;
         ctx->uRounds = 8;
-         ctx->hashsize = m256_const2_64( 0, 0x100 );
-         ctx->const1536 = m256_const2_64( 0, 0x600 );
+         ctx->hashsize = mm256_bcast128lo_64( 0x100 );
+         ctx->const1536 = mm256_bcast128lo_64( 0x600 );
         break;

      case 512:
         ctx->uHashSize = 512;
         ctx->uBlockLength = 128;
         ctx->uRounds = 10;
-         ctx->hashsize = m256_const2_64( 0, 0x200 );
-         ctx->const1536 = m256_const2_64( 0, 0x400 );
+         ctx->hashsize = mm256_bcast128lo_64( 0x200 );
+         ctx->const1536 = mm256_bcast128lo_64( 0x400 );
         break;

      default:
@@ -798,7 +800,7 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
   {
      echo_2way_compress( ctx, data, 1 );
      ctx->processed_bits = 1024;
-      remainingbits = m256_const2_64( 0, -1024 );
+      remainingbits = mm256_bcast128lo_64( -1024 );
      vlen = 0;
   }
   else
@@ -806,13 +808,13 @@ int echo_2way_full( echo_2way_context *ctx, void *hashval, int nHashSize,
      vlen = databitlen / 128;  // * 4 lanes / 128 bits per lane
      memcpy_256( ctx->buffer, data, vlen );
      ctx->processed_bits += (unsigned int)( databitlen );
-      remainingbits = m256_const2_64( 0, databitlen );
+      remainingbits = mm256_bcast128lo_64( databitlen );
   }

-   ctx->buffer[ vlen ] = m256_const2_64( 0, 0x80 );
+   ctx->buffer[ vlen ] = mm256_bcast128lo_64( 0x80 );
   memset_zero_256( ctx->buffer + vlen + 1, vblen - vlen - 2 );
-   ctx->buffer[ vblen-2 ] = m256_const2_64( (uint64_t)ctx->uHashSize << 48, 0 );
-   ctx->buffer[ vblen-1 ] = m256_const2_64( 0, ctx->processed_bits );
+   ctx->buffer[ vblen-2 ] = mm256_bcast128hi_64( (uint64_t)ctx->uHashSize << 48 );
+   ctx->buffer[ vblen-1 ] = mm256_bcast128lo_64( ctx->processed_bits );

   ctx->k = _mm256_add_epi64( ctx->k, remainingbits );
   ctx->k = _mm256_sub_epi64( ctx->k, ctx->const1536 );
--- a/algo/groestl/groestl256-hash-4way.c
+++ b/algo/groestl/groestl256-hash-4way.c
@@ -33,8 +33,7 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
  }

  // The only non-zero in the IV is len. It can be hard coded.
-  ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
-
+  ctx->chaining[ 3 ] = mm512_bcast128lo_64( 0x0100000000000000 );
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

@@ -51,9 +50,6 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
   __m512i* in = (__m512i*)input;
   int i;

-//  if (ctx->chaining == NULL || ctx->buffer == NULL)
-//    return 1;
-
  for ( i = 0; i < SIZE256; i++ )
  {
     ctx->chaining[i] = m512_zero;
@@ -61,7 +57,7 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
  }

  // The only non-zero in the IV is len. It can be hard coded.
-  ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
+  ctx->chaining[ 3 ] = mm512_bcast128lo_64( 0x0100000000000000 );
  ctx->buf_ptr = 0;
   
   // --- update ---
@@ -83,18 +79,18 @@ int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
   if ( i == SIZE256 - 1 )
   {        
       // only 1 vector left in buffer, all padding at once
-       ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 ); 
+       ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 ); 
   }   
   else
   {
       // add first padding
-       ctx->buffer[i] = m512_const2_64( 0, 0x80 );
+       ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE256 - 1; i++ )
           ctx->buffer[i] = m512_zero;

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
+       ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
   }

   // digest final padding block and do output transform
@@ -140,18 +136,18 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
   if ( i == SIZE256 - 1 )
   {
       // only 1 vector left in buffer, all padding at once
-       ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
+       ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
   }
   else
   {
       // add first padding
-       ctx->buffer[i] = m512_const2_64( 0, 0x80 );
+       ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE256 - 1; i++ )
           ctx->buffer[i] = m512_zero;

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
+       ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
   }

 // digest final padding block and do output transform
@@ -186,7 +182,7 @@ int groestl256_2way_init( groestl256_2way_context* ctx, uint64_t hashlen )
  }

  // The only non-zero in the IV is len. It can be hard coded.
-  ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
+  ctx->chaining[ 3 ] = mm256_bcast128lo_64( 0x0100000000000000 );

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
@@ -211,7 +207,7 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
   }

   // The only non-zero in the IV is len. It can be hard coded.
-   ctx->chaining[ 3 ] = m256_const2_64( 0, 0x0100000000000000 );
+   ctx->chaining[ 3 ] = mm256_bcast128lo_64( 0x0100000000000000 );
   ctx->buf_ptr = 0;

   // --- update ---
@@ -233,18 +229,18 @@ int groestl256_2way_full( groestl256_2way_context* ctx, void* output,
   if ( i == SIZE256 - 1 )
   {
       // only 1 vector left in buffer, all padding at once
-      ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
+      ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
   }
   else
   {
       // add first padding
-       ctx->buffer[i] = m256_const2_64( 0, 0x80 );
+       ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE256 - 1; i++ )
           ctx->buffer[i] = m256_zero;

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
+       ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
   }

   // digest final padding block and do output transform
@@ -289,23 +285,22 @@ int groestl256_2way_update_close( groestl256_2way_context* ctx, void* output,
   if ( i == SIZE256 - 1 )
   {
       // only 1 vector left in buffer, all padding at once
-       ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
+       ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
   }
   else
   {
       // add first padding
-       ctx->buffer[i] = m256_const2_64( 0, 0x80 );
+       ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
       // add zero padding
       for ( i += 1; i < SIZE256 - 1; i++ )
           ctx->buffer[i] = m256_zero;

       // add length padding, second last byte is zero unless blocks > 255
-       ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
+       ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
   }

 // digest final padding block and do output transform
   TF512_2way( ctx->chaining, ctx->buffer );
-
   OF512_2way( ctx->chaining );

   // store hash result in output 
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -165,7 +165,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
+  b1 = _mm512_set1_epi64( 0x1b1b1b1b1b1b1b1b ); \
  MUL2( a0, b0, b1 ); \
  a0 = _mm512_xor_si512( a0, TEMP0 ); \
  MUL2( a1, b0, b1 ); \
@@ -205,116 +205,18 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
  b1 = _mm512_xor_si512( b1, a4 ); \
 }/*MixBytes*/

-
-#if 0
-#define MixBytes(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
-  /* t_i = a_i + a_{i+1} */\
-  b6 = a0;\
-  b7 = a1;\
-  a0 = _mm512_xor_si512(a0, a1);\
-  b0 = a2;\
-  a1 = _mm512_xor_si512(a1, a2);\
-  b1 = a3;\
-  a2 = _mm512_xor_si512(a2, a3);\
-  b2 = a4;\
-  a3 = _mm512_xor_si512(a3, a4);\
-  b3 = a5;\
-  a4 = _mm512_xor_si512(a4, a5);\
-  b4 = a6;\
-  a5 = _mm512_xor_si512(a5, a6);\
-  b5 = a7;\
-  a6 = _mm512_xor_si512(a6, a7);\
-  a7 = _mm512_xor_si512(a7, b6);\
-  \
-  /* build y4 y5 y6 ... in regs xmm8, xmm9, xmm10 by adding t_i*/\
-  b0 = _mm512_xor_si512(b0, a4);\
-  b6 = _mm512_xor_si512(b6, a4);\
-  b1 = _mm512_xor_si512(b1, a5);\
-  b7 = _mm512_xor_si512(b7, a5);\
-  b2 = _mm512_xor_si512(b2, a6);\
-  b0 = _mm512_xor_si512(b0, a6);\
-  /* spill values y_4, y_5 to memory */\
-  TEMP0 = b0;\
-  b3 = _mm512_xor_si512(b3, a7);\
-  b1 = _mm512_xor_si512(b1, a7);\
-  TEMP1 = b1;\
-  b4 = _mm512_xor_si512(b4, a0);\
-  b2 = _mm512_xor_si512(b2, a0);\
-  /* save values t0, t1, t2 to xmm8, xmm9 and memory */\
-  b0 = a0;\
-  b5 = _mm512_xor_si512(b5, a1);\
-  b3 = _mm512_xor_si512(b3, a1);\
-  b1 = a1;\
-  b6 = _mm512_xor_si512(b6, a2);\
-  b4 = _mm512_xor_si512(b4, a2);\
-  TEMP2 = a2;\
-  b7 = _mm512_xor_si512(b7, a3);\
-  b5 = _mm512_xor_si512(b5, a3);\
-  \
-  /* compute x_i = t_i + t_{i+3} */\
-  a0 = _mm512_xor_si512(a0, a3);\
-  a1 = _mm512_xor_si512(a1, a4);\
-  a2 = _mm512_xor_si512(a2, a5);\
-  a3 = _mm512_xor_si512(a3, a6);\
-  a4 = _mm512_xor_si512(a4, a7);\
-  a5 = _mm512_xor_si512(a5, b0);\
-  a6 = _mm512_xor_si512(a6, b1);\
-  a7 = _mm512_xor_si512(a7, TEMP2);\
-  \
-  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
-  /* compute w_i : add y_{i+4} */\
-  b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b );\
-  MUL2(a0, b0, b1);\
-  a0 = _mm512_xor_si512(a0, TEMP0);\
-  MUL2(a1, b0, b1);\
-  a1 = _mm512_xor_si512(a1, TEMP1);\
-  MUL2(a2, b0, b1);\
-  a2 = _mm512_xor_si512(a2, b2);\
-  MUL2(a3, b0, b1);\
-  a3 = _mm512_xor_si512(a3, b3);\
-  MUL2(a4, b0, b1);\
-  a4 = _mm512_xor_si512(a4, b4);\
-  MUL2(a5, b0, b1);\
-  a5 = _mm512_xor_si512(a5, b5);\
-  MUL2(a6, b0, b1);\
-  a6 = _mm512_xor_si512(a6, b6);\
-  MUL2(a7, b0, b1);\
-  a7 = _mm512_xor_si512(a7, b7);\
-  \
-  /* compute v_i : double w_i      */\
-  /* add to y_4 y_5 .. v3, v4, ... */\
-  MUL2(a0, b0, b1);\
-  b5 = _mm512_xor_si512(b5, a0);\
-  MUL2(a1, b0, b1);\
-  b6 = _mm512_xor_si512(b6, a1);\
-  MUL2(a2, b0, b1);\
-  b7 = _mm512_xor_si512(b7, a2);\
-  MUL2(a5, b0, b1);\
-  b2 = _mm512_xor_si512(b2, a5);\
-  MUL2(a6, b0, b1);\
-  b3 = _mm512_xor_si512(b3, a6);\
-  MUL2(a7, b0, b1);\
-  b4 = _mm512_xor_si512(b4, a7);\
-  MUL2(a3, b0, b1);\
-  MUL2(a4, b0, b1);\
-  b0 = TEMP0;\
-  b1 = TEMP1;\
-  b0 = _mm512_xor_si512(b0, a3);\
-  b1 = _mm512_xor_si512(b1, a4);\
-}/*MixBytes*/
-#endif
+#define MASK_NOT( a )  _mm512_mask_ternarylogic_epi64( a, 0xaa, a, a, 1 )

 #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = m512_const2_64( 0xffffffffffffffff, 0 ); \
-  a0 = _mm512_xor_si512( a0, m512_const1_128( round_const_l0[i] ) );\
-  a1 = _mm512_xor_si512( a1, b1 );\
-  a2 = _mm512_xor_si512( a2, b1 );\
-  a3 = _mm512_xor_si512( a3, b1 );\
-  a4 = _mm512_xor_si512( a4, b1 );\
-  a5 = _mm512_xor_si512( a5, b1 );\
-  a6 = _mm512_xor_si512( a6, b1 );\
-  a7 = _mm512_xor_si512( a7, m512_const1_128( round_const_l7[i] ) );\
+  a0 = _mm512_xor_si512( a0, mm512_bcast_m128( round_const_l0[i] ) );\
+  a1 = MASK_NOT( a1 ); \
+  a2 = MASK_NOT( a2 ); \
+  a3 = MASK_NOT( a3 ); \
+  a4 = MASK_NOT( a4 ); \
+  a5 = MASK_NOT( a5 ); \
+  a6 = MASK_NOT( a6 ); \
+  a7 = _mm512_xor_si512( a7, mm512_bcast_m128( round_const_l7[i] ) );\
  \
  /* ShiftBytes + SubBytes (interleaved) */\
  b0 = _mm512_xor_si512( b0, b0 );\
@@ -450,7 +352,7 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
 * outputs: (i0-7) = (0|S)
 */
 #define Matrix_Transpose_O_B(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  t0 = _mm512_xor_si512( t0, t0 );\
+  t0 = m512_zero;\
  i1 = i0;\
  i3 = i2;\
  i5 = i4;\
@@ -481,11 +383,11 @@ static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,

 void TF512_4way( __m512i* chaining, __m512i* message )
 {
-  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m512i TEMP0;
-  static __m512i TEMP1;
-  static __m512i TEMP2;
+  __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m512i TEMP0;
+  __m512i TEMP1;
+  __m512i TEMP2;

  /* load message into registers xmm12 - xmm15 */
  xmm12 = message[0];
@@ -547,11 +449,11 @@ void TF512_4way( __m512i* chaining, __m512i* message )

 void OF512_4way( __m512i* chaining )
 {
-  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m512i TEMP0;
-  static __m512i TEMP1;
-  static __m512i TEMP2;
+  __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m512i TEMP0;
+  __m512i TEMP1;
+  __m512i TEMP2;

  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
  xmm8 = chaining[0];
@@ -696,7 +598,7 @@ static const __m256i SUBSH_MASK7_2WAY =
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
+  b1 = _mm256_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
  MUL2_2WAY(a0, b0, b1);\
  a0 = _mm256_xor_si256(a0, TEMP0);\
  MUL2_2WAY(a1, b0, b1);\
@@ -738,15 +640,15 @@ static const __m256i SUBSH_MASK7_2WAY =

 #define ROUND_2WAY(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = m256_const2_64( 0xffffffffffffffff, 0 ); \
-  a0 = _mm256_xor_si256( a0, m256_const1_128( round_const_l0[i] ) );\
+  b1 = mm256_bcast_m128( mm128_mask_32( m128_neg1, 0x3 ) ); \
+  a0 = _mm256_xor_si256( a0, mm256_bcast_m128( round_const_l0[i] ) );\
  a1 = _mm256_xor_si256( a1, b1 );\
  a2 = _mm256_xor_si256( a2, b1 );\
  a3 = _mm256_xor_si256( a3, b1 );\
  a4 = _mm256_xor_si256( a4, b1 );\
  a5 = _mm256_xor_si256( a5, b1 );\
  a6 = _mm256_xor_si256( a6, b1 );\
-  a7 = _mm256_xor_si256( a7, m256_const1_128( round_const_l7[i] ) );\
+  a7 = _mm256_xor_si256( a7, mm256_bcast_m128( round_const_l7[i] ) );\
  \
  /* ShiftBytes + SubBytes (interleaved) */\
  b0 = _mm256_xor_si256( b0, b0 );\
@@ -850,7 +752,7 @@ static const __m256i SUBSH_MASK7_2WAY =
 }/**/

 #define Matrix_Transpose_O_B_2way(i0, i1, i2, i3, i4, i5, i6, i7, t0){\
-  t0 = _mm256_xor_si256( t0, t0 );\
+  t0 = m256_zero;\
  i1 = i0;\
  i3 = i2;\
  i5 = i4;\
@@ -874,11 +776,11 @@ static const __m256i SUBSH_MASK7_2WAY =

 void TF512_2way( __m256i* chaining, __m256i* message )
 {
-  static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m256i TEMP0;
-  static __m256i TEMP1;
-  static __m256i TEMP2;
+  __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m256i TEMP0;
+  __m256i TEMP1;
+  __m256i TEMP2;

  /* load message into registers xmm12 - xmm15 */
  xmm12 = message[0];
@@ -940,11 +842,11 @@ void TF512_2way( __m256i* chaining, __m256i* message )
  
 void OF512_2way( __m256i* chaining )
 {
-  static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m256i TEMP0;
-  static __m256i TEMP1;
-  static __m256i TEMP2;
+  __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m256i TEMP0;
+  __m256i TEMP1;
+  __m256i TEMP2;

  /* load CV into registers xmm8, xmm10, xmm12, xmm14 */
  xmm8 = chaining[0];
--- a/algo/groestl/groestl512-hash-4way.c
+++ b/algo/groestl/groestl512-hash-4way.c
@@ -25,8 +25,7 @@ int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
  memset_zero_512( ctx->buffer, SIZE512 );

  // The only non-zero in the IV is len. It can be hard coded.
-  ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
-
+  ctx->chaining[ 6 ] = mm512_bcast128hi_64( 0x0200000000000000 );
  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;

@@ -61,14 +60,14 @@ int groestl512_4way_update_close( groestl512_4way_context* ctx, void* output,
   if ( i == SIZE512 - 1 )
   {        
       // only 1 vector left in buffer, all padding at once
-       ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
+       ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
   }   
   else
   {
-       ctx->buffer[i] = m512_const2_64( 0, 0x80 );
+       ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = m512_zero;
-       ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
+       ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
   }

   TF1024_4way( ctx->chaining, ctx->buffer );
@@ -94,7 +93,7 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,

   memset_zero_512( ctx->chaining, SIZE512 );
   memset_zero_512( ctx->buffer, SIZE512 );
-   ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
+   ctx->chaining[ 6 ] = mm512_bcast128hi_64( 0x0200000000000000 );
   ctx->buf_ptr = 0;

   // --- update ---
@@ -113,14 +112,14 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,
   if ( i == SIZE512 - 1 )
   {
       // only 1 vector left in buffer, all padding at once
-       ctx->buffer[i] = m512_const2_64( blocks << 56, 0x80 );
+       ctx->buffer[i] = mm512_set2_64( blocks << 56, 0x80 );
   }
   else
   {
-       ctx->buffer[i] = m512_const2_64( 0, 0x80 );
+       ctx->buffer[i] = mm512_bcast128lo_64( 0x80 );
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = m512_zero;
-       ctx->buffer[i] = m512_const2_64( blocks << 56, 0 );
+       ctx->buffer[i] = mm512_bcast128hi_64( blocks << 56 );
   }

   TF1024_4way( ctx->chaining, ctx->buffer );
@@ -143,7 +142,7 @@ int groestl512_2way_init( groestl512_2way_context* ctx, uint64_t hashlen )
  memset_zero_256( ctx->buffer, SIZE512 );

  // The only non-zero in the IV is len. It can be hard coded.
-  ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
+  ctx->chaining[ 6 ] = mm256_bcast128hi_64( 0x0200000000000000 );

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
@@ -179,14 +178,14 @@ int groestl512_2way_update_close( groestl512_2way_context* ctx, void* output,
   if ( i == SIZE512 - 1 )
   {
       // only 1 vector left in buffer, all padding at once
-       ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
+       ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
   }
   else
   {
-       ctx->buffer[i] = m256_const2_64( 0, 0x80 );
+       ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = m256_zero;
-       ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
+       ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
   }

   TF1024_2way( ctx->chaining, ctx->buffer );
@@ -212,7 +211,7 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,

   memset_zero_256( ctx->chaining, SIZE512 );
   memset_zero_256( ctx->buffer, SIZE512 );
-   ctx->chaining[ 6 ] = m256_const2_64( 0x0200000000000000, 0 );
+   ctx->chaining[ 6 ] = mm256_bcast128hi_64( 0x0200000000000000 );
   ctx->buf_ptr = 0;

   // --- update ---
@@ -231,14 +230,14 @@ int groestl512_2way_full( groestl512_2way_context* ctx, void* output,
   if ( i == SIZE512 - 1 )
   {
       // only 1 vector left in buffer, all padding at once
-       ctx->buffer[i] = m256_const2_64( blocks << 56, 0x80 );
+       ctx->buffer[i] = mm256_set2_64( blocks << 56, 0x80 );
   }
   else
   {
-       ctx->buffer[i] = m256_const2_64( 0, 0x80 );
+       ctx->buffer[i] = mm256_bcast128lo_64( 0x80 );
       for ( i += 1; i < SIZE512 - 1; i++ )
           ctx->buffer[i] = m256_zero;
-       ctx->buffer[i] = m256_const2_64( blocks << 56, 0 );
+       ctx->buffer[i] = mm256_bcast128hi_64( blocks << 56 );
   }

   TF1024_2way( ctx->chaining, ctx->buffer );
--- a/algo/groestl/groestl512-intr-4way.h
+++ b/algo/groestl/groestl512-intr-4way.h
@@ -174,7 +174,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m512_const1_64( 0x1b1b1b1b1b1b1b1b ); \
+  b1 = _mm512_set1_epi64( 0x1b1b1b1b1b1b1b1b ); \
  MUL2( a0, b0, b1 ); \
  a0 = _mm512_xor_si512( a0, TEMP0 ); \
  MUL2( a1, b0, b1 ); \
@@ -238,7 +238,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
  for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
  { \
    /* AddRoundConstant P1024 */\
-    xmm8 = _mm512_xor_si512( xmm8, m512_const1_128( \
+    xmm8 = _mm512_xor_si512( xmm8, mm512_bcast_m128( \
             casti_m128i( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK0 ); \
@@ -253,7 +253,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
    \
     /* AddRoundConstant P1024 */\
-    xmm0 = _mm512_xor_si512( xmm0, m512_const1_128( \
+    xmm0 = _mm512_xor_si512( xmm0, mm512_bcast_m128( \
             casti_m128i( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
@@ -282,7 +282,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
    xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
    xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
-    xmm15 = _mm512_xor_si512( xmm15, m512_const1_128( \
+    xmm15 = _mm512_xor_si512( xmm15, mm512_bcast_m128( \
                 casti_m128i( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK1 );\
@@ -305,7 +305,7 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
    xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
    xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
    xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
-    xmm7 = _mm512_xor_si512( xmm7, m512_const1_128( \
+    xmm7 = _mm512_xor_si512( xmm7, mm512_bcast_m128( \
             casti_m128i( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
@@ -471,8 +471,8 @@ static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,

 void INIT_4way( __m512i* chaining )
 {
-  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

  /* load IV into registers xmm8 - xmm15 */
  xmm8 = chaining[0];
@@ -500,12 +500,12 @@ void INIT_4way( __m512i* chaining )

 void TF1024_4way( __m512i* chaining, const __m512i* message )
 {
-  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m512i QTEMP[8];
-  static __m512i TEMP0;
-  static __m512i TEMP1;
-  static __m512i TEMP2;
+  __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m512i QTEMP[8];
+  __m512i TEMP0;
+  __m512i TEMP1;
+  __m512i TEMP2;

  /* load message into registers xmm8 - xmm15 (Q = message) */
  xmm8 = message[0];
@@ -606,11 +606,11 @@ void TF1024_4way( __m512i* chaining, const __m512i* message )

 void OF1024_4way( __m512i* chaining )
 {
-  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m512i TEMP0;
-  static __m512i TEMP1;
-  static __m512i TEMP2;
+  __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m512i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m512i TEMP0;
+  __m512i TEMP1;
+  __m512i TEMP2;

  /* load CV into registers xmm8 - xmm15 */
  xmm8 = chaining[0];
@@ -758,7 +758,7 @@ static const __m256i SUBSH_MASK7_2WAY =
  \
  /* compute z_i : double x_i using temp xmm8 and 1B xmm9 */\
  /* compute w_i : add y_{i+4} */\
-  b1 = m256_const1_64( 0x1b1b1b1b1b1b1b1b );\
+  b1 = _mm256_set1_epi64x( 0x1b1b1b1b1b1b1b1b );\
  MUL2_2WAY(a0, b0, b1);\
  a0 = _mm256_xor_si256(a0, TEMP0);\
  MUL2_2WAY(a1, b0, b1);\
@@ -822,7 +822,7 @@ static const __m256i SUBSH_MASK7_2WAY =
  for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
  { \
    /* AddRoundConstant P1024 */\
-    xmm8 = _mm256_xor_si256( xmm8, m256_const1_128( \
+    xmm8 = _mm256_xor_si256( xmm8, mm256_bcast_m128( \
             casti_m128i( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm8  = _mm256_shuffle_epi8( xmm8,  SUBSH_MASK0_2WAY ); \
@@ -837,7 +837,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    SUBMIX_2WAY(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
    \
     /* AddRoundConstant P1024 */\
-    xmm0 = _mm256_xor_si256( xmm0, m256_const1_128( \
+    xmm0 = _mm256_xor_si256( xmm0, mm256_bcast_m128( \
             casti_m128i( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK0_2WAY );\
@@ -866,7 +866,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm12 = _mm256_xor_si256( xmm12, xmm1 );\
    xmm13 = _mm256_xor_si256( xmm13, xmm1 );\
    xmm14 = _mm256_xor_si256( xmm14, xmm1 );\
-    xmm15 = _mm256_xor_si256( xmm15, m256_const1_128( \
+    xmm15 = _mm256_xor_si256( xmm15, mm256_bcast_m128( \
                 casti_m128i( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm8  = _mm256_shuffle_epi8( xmm8,  SUBSH_MASK1_2WAY );\
@@ -889,7 +889,7 @@ static const __m256i SUBSH_MASK7_2WAY =
    xmm4 = _mm256_xor_si256( xmm4, xmm9 );\
    xmm5 = _mm256_xor_si256( xmm5, xmm9 );\
    xmm6 = _mm256_xor_si256( xmm6, xmm9 );\
-    xmm7 = _mm256_xor_si256( xmm7, m256_const1_128( \
+    xmm7 = _mm256_xor_si256( xmm7, mm256_bcast_m128( \
             casti_m128i( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
    xmm0 = _mm256_shuffle_epi8( xmm0, SUBSH_MASK1_2WAY );\
@@ -1040,8 +1040,8 @@ static const __m256i SUBSH_MASK7_2WAY =

 void INIT_2way( __m256i *chaining )
 {
-  static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;

  /* load IV into registers xmm8 - xmm15 */
  xmm8 = chaining[0];
@@ -1069,12 +1069,12 @@ void INIT_2way( __m256i *chaining )

 void TF1024_2way( __m256i *chaining, const __m256i *message )
 {
-  static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m256i QTEMP[8];
-  static __m256i TEMP0;
-  static __m256i TEMP1;
-  static __m256i TEMP2;
+  __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m256i QTEMP[8];
+  __m256i TEMP0;
+  __m256i TEMP1;
+  __m256i TEMP2;

  /* load message into registers xmm8 - xmm15 (Q = message) */
  xmm8 = message[0];
@@ -1175,11 +1175,11 @@ void TF1024_2way( __m256i *chaining, const __m256i *message )

 void OF1024_2way( __m256i* chaining )
 {
-  static __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
-  static __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
-  static __m256i TEMP0;
-  static __m256i TEMP1;
-  static __m256i TEMP2;
+  __m256i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
+  __m256i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
+  __m256i TEMP0;
+  __m256i TEMP1;
+  __m256i TEMP2;

  /* load CV into registers xmm8 - xmm15 */
  xmm8 = chaining[0];
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -60,7 +60,7 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

-#define cns4w(i)  m512_const1_128( ( (__m128i*)CNS_INIT)[i] )
+#define cns4w(i)  mm512_bcast_m128( ( (__m128i*)CNS_INIT)[i] )

 #define ADD_CONSTANT4W( a, b, c0, c1 ) \
    a = _mm512_xor_si512( a, c0 ); \
@@ -154,11 +154,10 @@ static const uint32 CNS_INIT[128] __attribute((aligned(64))) = {
 #define MIXTON10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
    NMLTOM10244W(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);

-void rnd512_4way( luffa_4way_context *state, __m512i *msg )
+void rnd512_4way( luffa_4way_context *state, const __m512i *msg )
 {
    __m512i t0, t1;
    __m512i *chainv = state->chainv;
-    __m512i msg0, msg1;
    __m512i x0, x1, x2, x3, x4, x5, x6, x7;

    t0 = mm512_xor3( chainv[0], chainv[2], chainv[4] );
@@ -168,9 +167,6 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )

    MULT24W( t0, t1 );

-    msg0 = _mm512_shuffle_epi32( msg[0], 27 );
-    msg1 = _mm512_shuffle_epi32( msg[1], 27 );
-
    chainv[0] = _mm512_xor_si512( chainv[0], t0 );
    chainv[1] = _mm512_xor_si512( chainv[1], t1 );
    chainv[2] = _mm512_xor_si512( chainv[2], t0 );
@@ -225,27 +221,36 @@ void rnd512_4way( luffa_4way_context *state, __m512i *msg )
    chainv[3] = _mm512_xor_si512( chainv[3], chainv[1] );

    MULT24W( chainv[0], chainv[1] );
-    chainv[0] = mm512_xor3( chainv[0], t0, msg0 );
-    chainv[1] = mm512_xor3( chainv[1], t1, msg1 );
+    chainv[0] = _mm512_xor_si512( chainv[0], t0 );
+    chainv[1] = _mm512_xor_si512( chainv[1], t1 );

-    MULT24W( msg0, msg1 );
-    chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
-    chainv[3] = _mm512_xor_si512( chainv[3], msg1 );
+    if ( msg )
+    {
+       __m512i msg0, msg1;

-    MULT24W( msg0, msg1 );
-    chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
-    chainv[5] = _mm512_xor_si512( chainv[5], msg1 );
+       msg0 = _mm512_shuffle_epi32( msg[0], 27 );
+       msg1 = _mm512_shuffle_epi32( msg[1], 27 );

-    MULT24W( msg0, msg1 );
-    chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
-    chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
+       chainv[0] = _mm512_xor_si512( chainv[0], msg0 );
+       chainv[1] = _mm512_xor_si512( chainv[1], msg1 );

-    MULT24W( msg0, msg1);
-    chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
-    chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
+       MULT24W( msg0, msg1 );
+       chainv[2] = _mm512_xor_si512( chainv[2], msg0 );
+       chainv[3] = _mm512_xor_si512( chainv[3], msg1 );

-    MULT24W( msg0, msg1 );
+       MULT24W( msg0, msg1 );
+       chainv[4] = _mm512_xor_si512( chainv[4], msg0 );
+       chainv[5] = _mm512_xor_si512( chainv[5], msg1 );

+       MULT24W( msg0, msg1 );
+       chainv[6] = _mm512_xor_si512( chainv[6], msg0 );
+       chainv[7] = _mm512_xor_si512( chainv[7], msg1 );
+
+       MULT24W( msg0, msg1);
+       chainv[8] = _mm512_xor_si512( chainv[8], msg0 );
+       chainv[9] = _mm512_xor_si512( chainv[9], msg1 );
+    }
+    
    chainv[3] = _mm512_rol_epi32( chainv[3], 1 );
    chainv[5] = _mm512_rol_epi32( chainv[5], 2 );
    chainv[7] = _mm512_rol_epi32( chainv[7], 3 );
@@ -282,16 +287,11 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    uint32_t hash[8*4] __attribute((aligned(128)));
    __m512i* chainv = state->chainv;
    __m512i t[2];
-    __m512i zero[2];
-    zero[0] = zero[1] = m512_zero;
-    const __m512i shuff_bswap32 = m512_const_64(
-                                  0x3c3d3e3f38393a3b, 0x3435363730313233,
-                                  0x2c2d2e2f28292a2b, 0x2425262720212223,
-                                  0x1c1d1e1f18191a1b, 0x1415161710111213,
-                                  0x0c0d0e0f08090a0b, 0x0405060700010203 );
+    const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( 
+                                  0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

    /*---- blank round with m=0 ----*/
-    rnd512_4way( state, zero );
+    rnd512_4way( state, NULL );
    
    t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
    t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
@@ -300,37 +300,30 @@ void finalization512_4way( luffa_4way_context *state, uint32 *b )
    t[0] = _mm512_shuffle_epi32( t[0], 27 );
    t[1] = _mm512_shuffle_epi32( t[1], 27 );

-    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
+    _mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
    _mm512_store_si512( (__m512i*)&hash[16], t[1] );

-    casti_m512i( b, 0 ) = _mm512_shuffle_epi8(
-                                  casti_m512i( hash, 0 ), shuff_bswap32 );
-    casti_m512i( b, 1 ) = _mm512_shuffle_epi8(
-                                  casti_m512i( hash, 1 ), shuff_bswap32 );
+    casti_m512i( b,0 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash,0 ), shuff_bswap32 );
+    casti_m512i( b,1 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash,1 ), shuff_bswap32 );

-    rnd512_4way( state, zero );
-
-    t[0] = chainv[0];
-    t[1] = chainv[1];
-    t[0] = _mm512_xor_si512( t[0], chainv[2] );
-    t[1] = _mm512_xor_si512( t[1], chainv[3] );
-    t[0] = _mm512_xor_si512( t[0], chainv[4] );
-    t[1] = _mm512_xor_si512( t[1], chainv[5] );
-    t[0] = _mm512_xor_si512( t[0], chainv[6] );
-    t[1] = _mm512_xor_si512( t[1], chainv[7] );
-    t[0] = _mm512_xor_si512( t[0], chainv[8] );
-    t[1] = _mm512_xor_si512( t[1], chainv[9] );
+    rnd512_4way( state, NULL );

+    t[0] = mm512_xor3( chainv[0], chainv[2], chainv[4] );
+    t[1] = mm512_xor3( chainv[1], chainv[3], chainv[5] );
+    t[0] = mm512_xor3( t[0], chainv[6], chainv[8] );
+    t[1] = mm512_xor3( t[1], chainv[7], chainv[9] );
    t[0] = _mm512_shuffle_epi32( t[0], 27 );
    t[1] = _mm512_shuffle_epi32( t[1], 27 );

-    _mm512_store_si512( (__m512i*)&hash[0], t[0] );
+    _mm512_store_si512( (__m512i*)&hash[ 0], t[0] );
    _mm512_store_si512( (__m512i*)&hash[16], t[1] );

-    casti_m512i( b, 2 ) = _mm512_shuffle_epi8(
-                                  casti_m512i( hash, 0 ), shuff_bswap32 );
-    casti_m512i( b, 3 ) = _mm512_shuffle_epi8(
-                                  casti_m512i( hash, 1 ), shuff_bswap32 );
+    casti_m512i( b,2 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash,0 ), shuff_bswap32 );
+    casti_m512i( b,3 ) = _mm512_shuffle_epi8(
+                                  casti_m512i( hash,1 ), shuff_bswap32 );
 }

 int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
@@ -338,16 +331,16 @@ int luffa_4way_init( luffa_4way_context *state, int hashbitlen )
    state->hashbitlen = hashbitlen;
    __m128i *iv = (__m128i*)IV;

-    state->chainv[0] = m512_const1_128( iv[0] );
-    state->chainv[1] = m512_const1_128( iv[1] );
-    state->chainv[2] = m512_const1_128( iv[2] );
-    state->chainv[3] = m512_const1_128( iv[3] );
-    state->chainv[4] = m512_const1_128( iv[4] );
-    state->chainv[5] = m512_const1_128( iv[5] );
-    state->chainv[6] = m512_const1_128( iv[6] );
-    state->chainv[7] = m512_const1_128( iv[7] );
-    state->chainv[8] = m512_const1_128( iv[8] );
-    state->chainv[9] = m512_const1_128( iv[9] );
+    state->chainv[0] = mm512_bcast_m128( iv[0] );
+    state->chainv[1] = mm512_bcast_m128( iv[1] );
+    state->chainv[2] = mm512_bcast_m128( iv[2] );
+    state->chainv[3] = mm512_bcast_m128( iv[3] );
+    state->chainv[4] = mm512_bcast_m128( iv[4] );
+    state->chainv[5] = mm512_bcast_m128( iv[5] );
+    state->chainv[6] = mm512_bcast_m128( iv[6] );
+    state->chainv[7] = mm512_bcast_m128( iv[7] );
+    state->chainv[8] = mm512_bcast_m128( iv[8] );
+    state->chainv[9] = mm512_bcast_m128( iv[9] );

    ((__m512i*)state->buffer)[0] = m512_zero;
    ((__m512i*)state->buffer)[1] = m512_zero;
@@ -370,11 +363,8 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
    __m512i msg[2];
    int i;
    int blocks = (int)len >> 5;
-    const __m512i shuff_bswap32 = m512_const_64( 
-                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
-                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
-                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
-                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+    const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x(  
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

    state->rembytes = (int)len & 0x1F;

@@ -392,7 +382,7 @@ int luffa_4way_update( luffa_4way_context *state, const void *data,
    {
      // remaining data bytes
      buffer[0] = _mm512_shuffle_epi8( vdata[0], shuff_bswap32 );
-      buffer[1] = m512_const1_i128(  0x0000000080000000 );
+      buffer[1] = mm512_bcast128lo_64( 0x0000000080000000 );
    }
    return 0;
 }
@@ -416,7 +406,7 @@ int luffa_4way_close( luffa_4way_context *state, void *hashval )
      rnd512_4way( state, buffer );
    else
    {     // empty pad block, constant data
-      msg[0] = m512_const1_i128(  0x0000000080000000 );
+      msg[0] = mm512_bcast128lo_64( 0x0000000080000000 );
      msg[1] = m512_zero;
      rnd512_4way( state, msg );
    }
@@ -440,16 +430,16 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
    state->hashbitlen = 512;
    __m128i *iv = (__m128i*)IV;

-    state->chainv[0] = m512_const1_128( iv[0] );
-    state->chainv[1] = m512_const1_128( iv[1] );
-    state->chainv[2] = m512_const1_128( iv[2] );
-    state->chainv[3] = m512_const1_128( iv[3] );
-    state->chainv[4] = m512_const1_128( iv[4] );
-    state->chainv[5] = m512_const1_128( iv[5] );
-    state->chainv[6] = m512_const1_128( iv[6] );
-    state->chainv[7] = m512_const1_128( iv[7] );
-    state->chainv[8] = m512_const1_128( iv[8] );
-    state->chainv[9] = m512_const1_128( iv[9] );
+    state->chainv[0] = mm512_bcast_m128( iv[0] );
+    state->chainv[1] = mm512_bcast_m128( iv[1] );
+    state->chainv[2] = mm512_bcast_m128( iv[2] );
+    state->chainv[3] = mm512_bcast_m128( iv[3] );
+    state->chainv[4] = mm512_bcast_m128( iv[4] );
+    state->chainv[5] = mm512_bcast_m128( iv[5] );
+    state->chainv[6] = mm512_bcast_m128( iv[6] );
+    state->chainv[7] = mm512_bcast_m128( iv[7] );
+    state->chainv[8] = mm512_bcast_m128( iv[8] );
+    state->chainv[9] = mm512_bcast_m128( iv[9] );

    ((__m512i*)state->buffer)[0] = m512_zero;
    ((__m512i*)state->buffer)[1] = m512_zero;
@@ -458,11 +448,8 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
    __m512i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m512i shuff_bswap32 = m512_const_64(
-                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
-                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
-                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
-                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+    const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( 
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

    state->rembytes = inlen & 0x1F;

@@ -479,13 +466,13 @@ int luffa512_4way_full( luffa_4way_context *state, void *output,
    {
       // padding of partial block
       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = m512_const1_i128(  0x0000000080000000 );
+       msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
       rnd512_4way( state, msg );
    }
    else
    {
       // empty pad block
-       msg[0] = m512_const1_i128( 0x0000000080000000 );
+       msg[0] = mm512_bcast128lo_64( 0x0000000080000000 );
       msg[1] = m512_zero;
       rnd512_4way( state, msg );
    }
@@ -506,11 +493,8 @@ int luffa_4way_update_close( luffa_4way_context *state,
    __m512i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m512i shuff_bswap32 = m512_const_64(
-                                   0x3c3d3e3f38393a3b, 0x3435363730313233,
-                                   0x2c2d2e2f28292a2b, 0x2425262720212223,
-                                   0x1c1d1e1f18191a1b, 0x1415161710111213,
-                                   0x0c0d0e0f08090a0b, 0x0405060700010203 );
+    const __m512i shuff_bswap32 = mm512_bcast_m128( _mm_set_epi64x( 
+                                   0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

    state->rembytes = inlen & 0x1F;

@@ -527,13 +511,13 @@ int luffa_4way_update_close( luffa_4way_context *state,
    {
       // padding of partial block
       msg[0] = _mm512_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = m512_const1_i128( 0x0000000080000000 );
+       msg[1] = mm512_bcast128lo_64( 0x0000000080000000 );
       rnd512_4way( state, msg );
    }
    else
    {
       // empty pad block
-       msg[0] = m512_const1_i128( 0x0000000080000000 );
+       msg[0] = mm512_bcast128lo_64( 0x0000000080000000 );
       msg[1] = m512_zero;
       rnd512_4way( state, msg );
    }
@@ -548,7 +532,7 @@ int luffa_4way_update_close( luffa_4way_context *state,

 #endif // AVX512

-#define cns(i)  m256_const1_128( ( (__m128i*)CNS_INIT)[i] )
+#define cns(i)  mm256_bcast_m128( ( (__m128i*)CNS_INIT)[i] )

 #define ADD_CONSTANT( a, b, c0, c1 ) \
    a = _mm256_xor_si256( a, c0 ); \
@@ -666,11 +650,10 @@ int luffa_4way_update_close( luffa_4way_context *state,
 /* Round function         */
 /* state: hash context    */

-void rnd512_2way( luffa_2way_context *state, __m256i *msg )
+void rnd512_2way( luffa_2way_context *state, const __m256i *msg )
 {
    __m256i t0, t1;
    __m256i *chainv = state->chainv;
-    __m256i msg0, msg1;
    __m256i x0, x1, x2, x3, x4, x5, x6, x7;

    t0 = chainv[0];
@@ -687,9 +670,6 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )

    MULT2( t0, t1 );

-    msg0 = _mm256_shuffle_epi32( msg[0], 27 );
-    msg1 = _mm256_shuffle_epi32( msg[1], 27 );
-
    chainv[0] = _mm256_xor_si256( chainv[0], t0 );
    chainv[1] = _mm256_xor_si256( chainv[1], t1 );
    chainv[2] = _mm256_xor_si256( chainv[2], t0 );
@@ -744,26 +724,35 @@ void rnd512_2way( luffa_2way_context *state, __m256i *msg )
    chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );

    MULT2( chainv[0], chainv[1] );
-    chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
-    chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
+    chainv[0] = _mm256_xor_si256( chainv[0], t0 );
+    chainv[1] = _mm256_xor_si256( chainv[1], t1 );

-    MULT2( msg0, msg1 );
-    chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
-    chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
+    if ( msg )
+    {
+       __m256i msg0, msg1;
+    
+       msg0 = _mm256_shuffle_epi32( msg[0], 27 );
+       msg1 = _mm256_shuffle_epi32( msg[1], 27 );

-    MULT2( msg0, msg1 );
-    chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
-    chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
+       chainv[0] = _mm256_xor_si256( chainv[0], msg0 );
+       chainv[1] = _mm256_xor_si256( chainv[1], msg1 );
+    
+       MULT2( msg0, msg1 );
+       chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
+       chainv[3] = _mm256_xor_si256( chainv[3], msg1 );

-    MULT2( msg0, msg1 );
-    chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
-    chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
+       MULT2( msg0, msg1 );
+       chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
+       chainv[5] = _mm256_xor_si256( chainv[5], msg1 );

-    MULT2( msg0, msg1 );
-    chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
-    chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
+       MULT2( msg0, msg1 );
+       chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
+       chainv[7] = _mm256_xor_si256( chainv[7], msg1 );

-    MULT2( msg0, msg1 );
+       MULT2( msg0, msg1 );
+       chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
+       chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
+    }

    chainv[3] = mm256_rol_32( chainv[3], 1 );
    chainv[5] = mm256_rol_32( chainv[5], 2 );
@@ -806,14 +795,10 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
    uint32 hash[8*2] __attribute((aligned(64)));
    __m256i* chainv = state->chainv;
    __m256i t[2];
-    __m256i zero[2];
-    zero[0] = zero[1] = m256_zero;
-    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
-                                                 0x1415161710111213,
-                                                 0x0c0d0e0f08090a0b,
+    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );
    /*---- blank round with m=0 ----*/
-    rnd512_2way( state, zero );
+    rnd512_2way( state, NULL );

    t[0] = chainv[0];
    t[1] = chainv[1];
@@ -838,7 +823,7 @@ void finalization512_2way( luffa_2way_context *state, uint32 *b )
    casti_m256i( b, 1 ) = _mm256_shuffle_epi8( 
                                  casti_m256i( hash, 1 ), shuff_bswap32 );

-    rnd512_2way( state, zero );
+    rnd512_2way( state, NULL );

    t[0] = chainv[0];
    t[1] = chainv[1];
@@ -868,16 +853,16 @@ int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
    state->hashbitlen = hashbitlen;
    __m128i *iv = (__m128i*)IV;
    
-    state->chainv[0] = m256_const1_128( iv[0] );
-    state->chainv[1] = m256_const1_128( iv[1] );
-    state->chainv[2] = m256_const1_128( iv[2] );
-    state->chainv[3] = m256_const1_128( iv[3] );
-    state->chainv[4] = m256_const1_128( iv[4] );
-    state->chainv[5] = m256_const1_128( iv[5] );
-    state->chainv[6] = m256_const1_128( iv[6] );
-    state->chainv[7] = m256_const1_128( iv[7] );
-    state->chainv[8] = m256_const1_128( iv[8] );
-    state->chainv[9] = m256_const1_128( iv[9] );
+    state->chainv[0] = mm256_bcast_m128( iv[0] );
+    state->chainv[1] = mm256_bcast_m128( iv[1] );
+    state->chainv[2] = mm256_bcast_m128( iv[2] );
+    state->chainv[3] = mm256_bcast_m128( iv[3] );
+    state->chainv[4] = mm256_bcast_m128( iv[4] );
+    state->chainv[5] = mm256_bcast_m128( iv[5] );
+    state->chainv[6] = mm256_bcast_m128( iv[6] );
+    state->chainv[7] = mm256_bcast_m128( iv[7] );
+    state->chainv[8] = mm256_bcast_m128( iv[8] );
+    state->chainv[9] = mm256_bcast_m128( iv[9] );

    ((__m256i*)state->buffer)[0] = m256_zero;
    ((__m256i*)state->buffer)[1] = m256_zero;
@@ -895,9 +880,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
    __m256i msg[2];
    int i;
    int blocks = (int)len >> 5;
-    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
-                                                 0x1415161710111213,
-                                                 0x0c0d0e0f08090a0b,
+    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );
    state-> rembytes = (int)len & 0x1F;

@@ -915,7 +898,7 @@ int luffa_2way_update( luffa_2way_context *state, const void *data,
    {
      // remaining data bytes
      buffer[0] = _mm256_shuffle_epi8( vdata[0], shuff_bswap32 );
-      buffer[1] = m256_const1_i128( 0x0000000080000000 );
+      buffer[1] = mm256_bcast128lo_64( 0x0000000080000000 );
    }
    return 0;
 }
@@ -931,7 +914,7 @@ int luffa_2way_close( luffa_2way_context *state, void *hashval )
      rnd512_2way( state, buffer );
    else
    {     // empty pad block, constant data
-      msg[0] = m256_const1_i128( 0x0000000080000000 );
+      msg[0] = mm256_bcast128lo_64( 0x0000000080000000 );
      msg[1] = m256_zero;
      rnd512_2way( state, msg );
    }
@@ -948,16 +931,16 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
    state->hashbitlen = 512;
    __m128i *iv = (__m128i*)IV;

-    state->chainv[0] = m256_const1_128( iv[0] );
-    state->chainv[1] = m256_const1_128( iv[1] );
-    state->chainv[2] = m256_const1_128( iv[2] );
-    state->chainv[3] = m256_const1_128( iv[3] );
-    state->chainv[4] = m256_const1_128( iv[4] );
-    state->chainv[5] = m256_const1_128( iv[5] );
-    state->chainv[6] = m256_const1_128( iv[6] );
-    state->chainv[7] = m256_const1_128( iv[7] );
-    state->chainv[8] = m256_const1_128( iv[8] );
-    state->chainv[9] = m256_const1_128( iv[9] );
+    state->chainv[0] = mm256_bcast_m128( iv[0] );
+    state->chainv[1] = mm256_bcast_m128( iv[1] );
+    state->chainv[2] = mm256_bcast_m128( iv[2] );
+    state->chainv[3] = mm256_bcast_m128( iv[3] );
+    state->chainv[4] = mm256_bcast_m128( iv[4] );
+    state->chainv[5] = mm256_bcast_m128( iv[5] );
+    state->chainv[6] = mm256_bcast_m128( iv[6] );
+    state->chainv[7] = mm256_bcast_m128( iv[7] );
+    state->chainv[8] = mm256_bcast_m128( iv[8] );
+    state->chainv[9] = mm256_bcast_m128( iv[9] );

    ((__m256i*)state->buffer)[0] = m256_zero;
    ((__m256i*)state->buffer)[1] = m256_zero;
@@ -966,9 +949,7 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
-                                                 0x1415161710111213,
-                                                 0x0c0d0e0f08090a0b,
+    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;
@@ -986,13 +967,13 @@ int luffa512_2way_full( luffa_2way_context *state, void *output,
    {
       // padding of partial block
       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = m256_const1_i128( 0x0000000080000000 );
+       msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
       rnd512_2way( state, msg );
    }
    else
    {
       // empty pad block
-       msg[0] = m256_const1_i128( 0x0000000080000000 );
+       msg[0] = mm256_bcast128lo_64( 0x0000000080000000 );
       msg[1] = m256_zero;
       rnd512_2way( state, msg );
    }
@@ -1013,9 +994,7 @@ int luffa_2way_update_close( luffa_2way_context *state,
    __m256i msg[2];
    int i;
    const int blocks = (int)( inlen >> 5 );
-    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
-                                                 0x1415161710111213,
-                                                 0x0c0d0e0f08090a0b,
+    const __m256i shuff_bswap32 = mm256_set2_64( 0x0c0d0e0f08090a0b,
                                                 0x0405060700010203 );

    state->rembytes = inlen & 0x1F;
@@ -1033,13 +1012,13 @@ int luffa_2way_update_close( luffa_2way_context *state,
    {
       // padding of partial block
       msg[0] = _mm256_shuffle_epi8( vdata[ 0 ], shuff_bswap32 );
-       msg[1] = m256_const1_i128( 0x0000000080000000 );
+       msg[1] = mm256_bcast128lo_64( 0x0000000080000000 );
       rnd512_2way( state, msg );
    }
    else
    {
       // empty pad block
-       msg[0] = m256_const1_i128( 0x0000000080000000 );
+       msg[0] = mm256_bcast128lo_64( 0x0000000080000000 );
       msg[1] = m256_zero;
       rnd512_2way( state, msg );
    }
--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -354,11 +354,11 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // 16 byte partial block exists for 80 byte len
    if ( state->rembytes  )
       // padding of partial block
-       rnd512( state, m128_const_i128(  0x80000000 ),
+       rnd512( state, mm128_mov64_128(  0x80000000 ),
                      mm128_bswap_32( cast_m128i( data ) ) );
    else
       // empty pad block
-       rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) );
+       rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );

    finalization512( state, (uint32*) output );
    if ( state->hashbitlen > 512 )
@@ -403,11 +403,11 @@ int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
    // 16 byte partial block exists for 80 byte len
    if ( state->rembytes  )
       // padding of partial block
-       rnd512( state, m128_const_i128( 0x80000000 ),
+       rnd512( state, mm128_mov64_128( 0x80000000 ),
                      mm128_bswap_32( cast_m128i( data ) ) );
    else
       // empty pad block
-       rnd512( state, m128_zero, m128_const_i128( 0x80000000 ) );
+       rnd512( state, m128_zero, mm128_mov64_128( 0x80000000 ) );

    finalization512( state, (uint32*) output );
    if ( state->hashbitlen > 512 )
@@ -596,10 +596,10 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    __m256i* chainv = (__m256i*)state->chainv;
    __m256i  t;
    const __m128i zero = m128_zero;
-    const __m256i shuff_bswap32 = m256_const_64( 0x1c1d1e1f18191a1b,
-                                                 0x1415161710111213,
-                                                 0x0c0d0e0f08090a0b,
-                                                 0x0405060700010203 );
+    const __m256i shuff_bswap32 = _mm256_set_epi64x( 0x1c1d1e1f18191a1b,
+                                                     0x1415161710111213,
+                                                     0x0c0d0e0f08090a0b,
+                                                     0x0405060700010203 );

    rnd512( state, zero, zero );

--- a/algo/lyra2/sponge-2way.c
+++ b/algo/lyra2/sponge-2way.c
@@ -85,10 +85,10 @@ inline void absorbBlockBlake2Safe_2way( uint64_t *State, const uint64_t *In,

  state0 = 
  state1 = m512_zero;
-  state2 = m512_const4_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
-                           0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
-  state3 = m512_const4_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
-                           0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state2 = _mm512_set4_epi64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
+                              0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state3 = _mm512_set4_epi64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
+                              0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );

  for ( int i = 0; i < nBlocks; i++ )
  { 
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -41,17 +41,17 @@
 inline void initState( uint64_t State[/*16*/] )
 {

-   /*
+/*
 #if defined (__AVX2__)

  __m256i* state = (__m256i*)State;
  const __m256i zero = m256_zero; 
  state[0] = zero;
  state[1] = zero;
-  state[2] = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
-                            0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
-  state[3] = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
-                            0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state[2] = _mm256_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
+                                0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state[3] = _mm256_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
+                                0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );

 #elif defined (__SSE2__)

@@ -271,10 +271,10 @@ inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,

  state0 = 
  state1 = m256_zero;
-  state2 = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
-                          0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
-  state3 = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
-                          0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state2 = _mm256_set_epi64x( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
+                              0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state3 = _mm256_set_epi64x( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
+                              0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );

  for ( int i = 0; i < nBlocks; i++ )
  { 
--- a/algo/sha/sha256dt.c
+++ b/algo/sha/sha256dt.c
@@ -36,31 +36,31 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
   __m512i *noncev = vdata + 19; 
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m512i last_byte = m512_const1_32( 0x80000000 );
-   const __m512i sixteen = m512_const1_32( 16 );
+   const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
+   const __m512i sixteen = _mm512_set1_epi32( 16 );

   for ( int i = 0; i < 19; i++ )
-      vdata[i] = mm512_bcast_i32( pdata[i] );
+      vdata[i] = _mm512_set1_epi32( pdata[i] );

   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_512( vdata+16 + 5, 10 );
-   vdata[16+15] = mm512_bcast_i32( 0x480 ); 
+   vdata[16+15] = _mm512_set1_epi32( 0x480 ); 
   
   block[ 8] = last_byte;
   memset_zero_512( block + 9, 6 );
-   block[15] = mm512_bcast_i32( 0x300 ); 
+   block[15] = _mm512_set1_epi32( 0x300 ); 
   
-   initstate[0] = mm512_bcast_i64( 0xdfa9bf2cdfa9bf2c );
-   initstate[1] = mm512_bcast_i64( 0xb72074d4b72074d4 );
-   initstate[2] = mm512_bcast_i64( 0x6bb011226bb01122 );
-   initstate[3] = mm512_bcast_i64( 0xd338e869d338e869 );
-   initstate[4] = mm512_bcast_i64( 0xaa3ff126aa3ff126 );
-   initstate[5] = mm512_bcast_i64( 0x475bbf30475bbf30 );
-   initstate[6] = mm512_bcast_i64( 0x8fd52e5b8fd52e5b );
-   initstate[7] = mm512_bcast_i64( 0x9f75c9ad9f75c9ad );
+   initstate[0] = _mm512_set1_epi64( 0xdfa9bf2cdfa9bf2c );
+   initstate[1] = _mm512_set1_epi64( 0xb72074d4b72074d4 );
+   initstate[2] = _mm512_set1_epi64( 0x6bb011226bb01122 );
+   initstate[3] = _mm512_set1_epi64( 0xd338e869d338e869 );
+   initstate[4] = _mm512_set1_epi64( 0xaa3ff126aa3ff126 );
+   initstate[5] = _mm512_set1_epi64( 0x475bbf30475bbf30 );
+   initstate[6] = _mm512_set1_epi64( 0x8fd52e5b8fd52e5b );
+   initstate[7] = _mm512_set1_epi64( 0x9f75c9ad9f75c9ad );

   sha256_16way_transform_le( midstate1, vdata, initstate );
   
@@ -118,31 +118,31 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
   __m256i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m256i last_byte = m256_const1_32( 0x80000000 );
-   const __m256i eight = m256_const1_32( 8 );
+   const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
+   const __m256i eight = _mm256_set1_epi32( 8 );

   for ( int i = 0; i < 19; i++ )
-      vdata[i] = mm256_bcast_i32( pdata[i] );
+      vdata[i] = _mm256_set1_epi32( pdata[i] );

   *noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_256( vdata+16 + 5, 10 );
-   vdata[16+15] = mm256_bcast_i32( 0x480 );
+   vdata[16+15] = _mm256_set1_epi32( 0x480 );

   block[ 8] = last_byte;
   memset_zero_256( block + 9, 6 );
-   block[15] = mm256_bcast_i32( 0x300 ); 
+   block[15] = _mm256_set1_epi32( 0x300 ); 
   
   // initialize state
-   initstate[0] = mm256_bcast_i64( 0xdfa9bf2cdfa9bf2c );
-   initstate[1] = mm256_bcast_i64( 0xb72074d4b72074d4 );
-   initstate[2] = mm256_bcast_i64( 0x6bb011226bb01122 );
-   initstate[3] = mm256_bcast_i64( 0xd338e869d338e869 );
-   initstate[4] = mm256_bcast_i64( 0xaa3ff126aa3ff126 );
-   initstate[5] = mm256_bcast_i64( 0x475bbf30475bbf30 );
-   initstate[6] = mm256_bcast_i64( 0x8fd52e5b8fd52e5b );
-   initstate[7] = mm256_bcast_i64( 0x9f75c9ad9f75c9ad );
+   initstate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
+   initstate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
+   initstate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
+   initstate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
+   initstate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
+   initstate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
+   initstate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
+   initstate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );

   sha256_8way_transform_le( midstate1, vdata, initstate );

@@ -198,31 +198,31 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
   __m128i *noncev = vdata + 19;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const __m128i last_byte = m128_const1_32( 0x80000000 );
-   const __m128i four = m128_const1_32( 4 );
+   const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
+   const __m128i four = _mm_set1_epi32( 4 );

   for ( int i = 0; i < 19; i++ )
-       vdata[i] = mm128_bcast_i32( pdata[i] );
+       vdata[i] = _mm_set1_epi32( pdata[i] );

   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );

   vdata[16+4] = last_byte;
   memset_zero_128( vdata+16 + 5, 10 );
-   vdata[16+15] = mm128_bcast_i32( 0x480 );
+   vdata[16+15] = _mm_set1_epi32( 0x480 );

   block[ 8] = last_byte;
   memset_zero_128( block + 9, 6 );
-   block[15] = mm128_bcast_i32( 0x300 );
+   block[15] = _mm_set1_epi32( 0x300 );
   
   // initialize state
-   initstate[0] = mm128_bcast_i64( 0xdfa9bf2cdfa9bf2c );
-   initstate[1] = mm128_bcast_i64( 0xb72074d4b72074d4 );
-   initstate[2] = mm128_bcast_i64( 0x6bb011226bb01122 );
-   initstate[3] = mm128_bcast_i64( 0xd338e869d338e869 );
-   initstate[4] = mm128_bcast_i64( 0xaa3ff126aa3ff126 );
-   initstate[5] = mm128_bcast_i64( 0x475bbf30475bbf30 );
-   initstate[6] = mm128_bcast_i64( 0x8fd52e5b8fd52e5b );
-   initstate[7] = mm128_bcast_i64( 0x9f75c9ad9f75c9ad );
+   initstate[0] = _mm_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
+   initstate[1] = _mm_set1_epi64x( 0xb72074d4b72074d4 );
+   initstate[2] = _mm_set1_epi64x( 0x6bb011226bb01122 );
+   initstate[3] = _mm_set1_epi64x( 0xd338e869d338e869 );
+   initstate[4] = _mm_set1_epi64x( 0xaa3ff126aa3ff126 );
+   initstate[5] = _mm_set1_epi64x( 0x475bbf30475bbf30 );
+   initstate[6] = _mm_set1_epi64x( 0x8fd52e5b8fd52e5b );
+   initstate[7] = _mm_set1_epi64x( 0x9f75c9ad9f75c9ad );

   // hash first 64 bytes of data
   sha256_4way_transform_le( midstate, vdata, initstate );
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -155,14 +155,14 @@ sha512_8way_round( sha512_8way_context *ctx,  __m512i *in, __m512i r[8] )
   }
   else
   {
-      A = m512_const1_64( 0x6A09E667F3BCC908 );
-      B = m512_const1_64( 0xBB67AE8584CAA73B );
-      C = m512_const1_64( 0x3C6EF372FE94F82B );
-      D = m512_const1_64( 0xA54FF53A5F1D36F1 );
-      E = m512_const1_64( 0x510E527FADE682D1 );
-      F = m512_const1_64( 0x9B05688C2B3E6C1F );
-      G = m512_const1_64( 0x1F83D9ABFB41BD6B );
-      H = m512_const1_64( 0x5BE0CD19137E2179 );
+      A = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
+      B = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
+      C = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
+      D = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
+      E = _mm512_set1_epi64( 0x510E527FADE682D1 );
+      F = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
+      G = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
+      H = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
   }

   for ( i = 0; i < 80; i += 8 )
@@ -191,14 +191,14 @@ sha512_8way_round( sha512_8way_context *ctx,  __m512i *in, __m512i r[8] )
   else
   {
      ctx->initialized = true;
-      r[0] = _mm512_add_epi64( A, m512_const1_64( 0x6A09E667F3BCC908 ) );
-      r[1] = _mm512_add_epi64( B, m512_const1_64( 0xBB67AE8584CAA73B ) );
-      r[2] = _mm512_add_epi64( C, m512_const1_64( 0x3C6EF372FE94F82B ) );
-      r[3] = _mm512_add_epi64( D, m512_const1_64( 0xA54FF53A5F1D36F1 ) );
-      r[4] = _mm512_add_epi64( E, m512_const1_64( 0x510E527FADE682D1 ) );
-      r[5] = _mm512_add_epi64( F, m512_const1_64( 0x9B05688C2B3E6C1F ) );
-      r[6] = _mm512_add_epi64( G, m512_const1_64( 0x1F83D9ABFB41BD6B ) );
-      r[7] = _mm512_add_epi64( H, m512_const1_64( 0x5BE0CD19137E2179 ) );
+      r[0] = _mm512_add_epi64( A, _mm512_set1_epi64( 0x6A09E667F3BCC908 ) );
+      r[1] = _mm512_add_epi64( B, _mm512_set1_epi64( 0xBB67AE8584CAA73B ) );
+      r[2] = _mm512_add_epi64( C, _mm512_set1_epi64( 0x3C6EF372FE94F82B ) );
+      r[3] = _mm512_add_epi64( D, _mm512_set1_epi64( 0xA54FF53A5F1D36F1 ) );
+      r[4] = _mm512_add_epi64( E, _mm512_set1_epi64( 0x510E527FADE682D1 ) );
+      r[5] = _mm512_add_epi64( F, _mm512_set1_epi64( 0x9B05688C2B3E6C1F ) );
+      r[6] = _mm512_add_epi64( G, _mm512_set1_epi64( 0x1F83D9ABFB41BD6B ) );
+      r[7] = _mm512_add_epi64( H, _mm512_set1_epi64( 0x5BE0CD19137E2179 ) );
   }
 }

@@ -239,11 +239,8 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
    unsigned ptr;
    const int buf_size = 128;
    const int pad = buf_size - 16;
-    const __m512i shuff_bswap64 = m512_const_64(
-                                    0x38393a3b3c3d3e3f, 0x3031323334353637,
-                                    0x28292a2b2c2d2e2f, 0x2021222324252627,
-                                    0x18191a1b1c1d1e1f, 0x1011121314151617,
-                                    0x08090a0b0c0d0e0f, 0x0001020304050607 );
+    const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
+                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

    ptr = (unsigned)sc->count & (buf_size - 1U);
    sc->buf[ ptr>>3 ] = m512_const1_64( 0x80 );
@@ -440,10 +437,8 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
    unsigned ptr;
    const int buf_size = 128;
    const int pad = buf_size - 16;
-    const __m256i shuff_bswap64 = m256_const_64( 0x18191a1b1c1d1e1f,
-                                                 0x1011121314151617,
-                                                 0x08090a0b0c0d0e0f,
-                                                 0x0001020304050607 );
+    const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
+                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

    ptr = (unsigned)sc->count & (buf_size - 1U);
    sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
--- a/algo/sha/sha512256d-4way.c
+++ b/algo/sha/sha512256d-4way.c
@@ -15,14 +15,14 @@ static void sha512256d_8way_init( sha512_8way_context *ctx )
 {
  ctx->count = 0;
  ctx->initialized = true;
-  ctx->val[0] = mm512_bcast_i64( 0x22312194FC2BF72C );
-  ctx->val[1] = mm512_bcast_i64( 0x9F555FA3C84C64C2 );
-  ctx->val[2] = mm512_bcast_i64( 0x2393B86B6F53B151 );
-  ctx->val[3] = mm512_bcast_i64( 0x963877195940EABD );
-  ctx->val[4] = mm512_bcast_i64( 0x96283EE2A88EFFE3 );
-  ctx->val[5] = mm512_bcast_i64( 0xBE5E1E2553863992 );
-  ctx->val[6] = mm512_bcast_i64( 0x2B0199FC2C85B8AA );
-  ctx->val[7] = mm512_bcast_i64( 0x0EB72DDC81C52CA2 );
+  ctx->val[0] = _mm512_set1_epi64( 0x22312194FC2BF72C );
+  ctx->val[1] = _mm512_set1_epi64( 0x9F555FA3C84C64C2 );
+  ctx->val[2] = _mm512_set1_epi64( 0x2393B86B6F53B151 );
+  ctx->val[3] = _mm512_set1_epi64( 0x963877195940EABD );
+  ctx->val[4] = _mm512_set1_epi64( 0x96283EE2A88EFFE3 );
+  ctx->val[5] = _mm512_set1_epi64( 0xBE5E1E2553863992 );
+  ctx->val[6] = _mm512_set1_epi64( 0x2B0199FC2C85B8AA );
+  ctx->val[7] = _mm512_set1_epi64( 0x0EB72DDC81C52CA2 );
 }

 int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
@@ -42,7 +42,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
    __m512i  *noncev = (__m512i*)vdata + 9;
    const int thr_id = mythr->id;
    const bool bench = opt_benchmark;
-    const __m512i eight = mm512_bcast_i64( 0x0000000800000000 );
+    const __m512i eight = _mm512_set1_epi64( 0x0000000800000000 );

    mm512_bswap32_intrlv80_8x64( vdata, pdata );
    *noncev = mm512_intrlv_blend_32(
@@ -83,14 +83,14 @@ static void sha512256d_4way_init( sha512_4way_context *ctx )
 {
  ctx->count = 0;
  ctx->initialized = true;
-  ctx->val[0] = mm256_bcast_i64( 0x22312194FC2BF72C );
-  ctx->val[1] = mm256_bcast_i64( 0x9F555FA3C84C64C2 );
-  ctx->val[2] = mm256_bcast_i64( 0x2393B86B6F53B151 );
-  ctx->val[3] = mm256_bcast_i64( 0x963877195940EABD );
-  ctx->val[4] = mm256_bcast_i64( 0x96283EE2A88EFFE3 );
-  ctx->val[5] = mm256_bcast_i64( 0xBE5E1E2553863992 );
-  ctx->val[6] = mm256_bcast_i64( 0x2B0199FC2C85B8AA );
-  ctx->val[7] = mm256_bcast_i64( 0x0EB72DDC81C52CA2 );
+  ctx->val[0] = _mm256_set1_epi64x( 0x22312194FC2BF72C );
+  ctx->val[1] = _mm256_set1_epi64x( 0x9F555FA3C84C64C2 );
+  ctx->val[2] = _mm256_set1_epi64x( 0x2393B86B6F53B151 );
+  ctx->val[3] = _mm256_set1_epi64x( 0x963877195940EABD );
+  ctx->val[4] = _mm256_set1_epi64x( 0x96283EE2A88EFFE3 );
+  ctx->val[5] = _mm256_set1_epi64x( 0xBE5E1E2553863992 );
+  ctx->val[6] = _mm256_set1_epi64x( 0x2B0199FC2C85B8AA );
+  ctx->val[7] = _mm256_set1_epi64x( 0x0EB72DDC81C52CA2 );
 }

 int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
@@ -110,7 +110,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
    __m256i  *noncev = (__m256i*)vdata + 9;
    const int thr_id = mythr->id;
    const bool bench = opt_benchmark;
-    const __m256i four = mm256_bcast_i64( 0x0000000400000000 );
+    const __m256i four = _mm256_set1_epi64x( 0x0000000400000000 );

    mm256_bswap32_intrlv80_4x64( vdata, pdata );
    *noncev = mm256_intrlv_blend_32(
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -276,6 +276,11 @@ do { \
   A1 = _mm256_xor_si256( A1, _mm256_set1_epi32( Whigh ) ); \
 } while (0)

+#define mm256_swap512_256( v1, v2 ) \
+   v1 = _mm256_xor_si256( v1, v2 ); \
+   v2 = _mm256_xor_si256( v1, v2 ); \
+   v1 = _mm256_xor_si256( v1, v2 );
+
 #define SWAP_BC8 \
 do { \
    mm256_swap512_256( B0, C0 ); \
@@ -866,6 +871,11 @@ do { \
   A1 = _mm_xor_si128( A1, _mm_set1_epi32( Whigh ) ); \
 } while (0)

+#define mm128_swap256_128( v1, v2 ) \
+   v1 = _mm_xor_si128( v1, v2 ); \
+   v2 = _mm_xor_si128( v1, v2 ); \
+   v1 = _mm_xor_si128( v1, v2 );
+
 #define SWAP_BC \
 do { \
    mm128_swap256_128( B0, C0 ); \
--- a/algo/shavite/shavite-hash-2way.c
+++ b/algo/shavite/shavite-hash-2way.c
@@ -18,14 +18,6 @@ static const uint32_t IV512[] =
        0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
 };

-/*
-#define mm256_ror2x256hi_1x32( a, b ) \
-   _mm256_blend_epi32( mm256_shuflr128_32( a ), \
-                       mm256_shuflr128_32( b ), 0x88 )
-*/
-
-//#define mm256_ror2x256hi_1x32( a, b ) _mm256_alignr_epi8( b, a, 4 )
-
 #if defined(__VAES__)

 #define mm256_aesenc_2x128( x, k ) \
@@ -34,8 +26,9 @@ static const uint32_t IV512[] =
 #else

 #define mm256_aesenc_2x128( x, k ) \
-   mm256_concat_128( _mm_aesenc_si128( mm128_extr_hi128_256( x ), k ), \
-                     _mm_aesenc_si128( mm128_extr_lo128_256( x ), k ) )
+   _mm256_inserti128_si256( _mm256_castsi128_si256( \
+            _mm_aesenc_si128( _mm256_castsi256_si128(   x ),    k ) ), \
+            _mm_aesenc_si128( _mm256_extracti128_si256( x, 1 ), k ), 1 )

 #endif

@@ -257,10 +250,10 @@ void shavite512_2way_init( shavite512_2way_context *ctx )
    __m256i *h = (__m256i*)ctx->h;
    __m128i *iv = (__m128i*)IV512;
   
-   h[0] = m256_const1_128( iv[0] );
-   h[1] = m256_const1_128( iv[1] );
-   h[2] = m256_const1_128( iv[2] );
-   h[3] = m256_const1_128( iv[3] );
+   h[0] = mm256_bcast_m128( iv[0] );
+   h[1] = mm256_bcast_m128( iv[1] );
+   h[2] = mm256_bcast_m128( iv[2] );
+   h[3] = mm256_bcast_m128( iv[3] );

   ctx->ptr    = 0;
   ctx->count0 = 0;
@@ -320,7 +313,7 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
    uint32_t vp = ctx->ptr>>5;

    // Terminating byte then zero pad
-    casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
+    casti_m256i( buf, vp++ ) = mm256_bcast128lo_64( 0x0000000000000080 );

    // Zero pad full vectors up to count
    for ( ; vp < 6; vp++ )      
@@ -334,9 +327,9 @@ void shavite512_2way_close( shavite512_2way_context *ctx, void *dst )
    count.u32[2] = ctx->count2;
    count.u32[3] = ctx->count3;

-    casti_m256i( buf, 6 ) = m256_const1_128(
+    casti_m256i( buf, 6 ) = mm256_bcast_m128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); 
-    casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
+    casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
                  0x0200,       count.u16[7], count.u16[6], count.u16[5],
                  count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
                
@@ -400,19 +393,19 @@ void shavite512_2way_update_close( shavite512_2way_context *ctx, void *dst,

   if ( vp == 0 )    // empty buf, xevan.
   { 
-      casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 );
+      casti_m256i( buf, 0 ) = mm256_bcast128lo_64( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else     // half full buf, everyone else.
   {
-    casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
+    casti_m256i( buf, vp++ ) = mm256_bcast128lo_64( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + vp, 6 - vp );
   }

-    casti_m256i( buf, 6 ) = m256_const1_128(
+    casti_m256i( buf, 6 ) = mm256_bcast_m128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); 
-    casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
+    casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
                  0x0200,       count.u16[7], count.u16[6], count.u16[5],
                  count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

@@ -430,10 +423,10 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,
    __m256i *h = (__m256i*)ctx->h;
    __m128i *iv = (__m128i*)IV512;

-   h[0] = m256_const1_128( iv[0] );
-   h[1] = m256_const1_128( iv[1] );
-   h[2] = m256_const1_128( iv[2] );
-   h[3] = m256_const1_128( iv[3] );
+   h[0] = mm256_bcast_m128( iv[0] );
+   h[1] = mm256_bcast_m128( iv[1] );
+   h[2] = mm256_bcast_m128( iv[2] );
+   h[3] = mm256_bcast_m128( iv[3] );

   ctx->ptr    =
   ctx->count0 =
@@ -490,19 +483,19 @@ void shavite512_2way_full( shavite512_2way_context *ctx, void *dst,

   if ( vp == 0 )    // empty buf, xevan.
   {
-      casti_m256i( buf, 0 ) = m256_const1_i128( 0x0000000000000080 );
+      casti_m256i( buf, 0 ) = mm256_bcast128lo_64( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else     // half full buf, everyone else.
   {
-    casti_m256i( buf, vp++ ) = m256_const1_i128( 0x0000000000000080 );
+    casti_m256i( buf, vp++ ) = mm256_bcast128lo_64( 0x0000000000000080 );
      memset_zero_256( (__m256i*)buf + vp, 6 - vp );
   }

-    casti_m256i( buf, 6 ) = m256_const1_128(
+    casti_m256i( buf, 6 ) = mm256_bcast_m128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
-    casti_m256i( buf, 7 ) = m256_const1_128( _mm_set_epi16(
+    casti_m256i( buf, 7 ) = mm256_bcast_m128( _mm_set_epi16(
                  0x0200,       count.u16[7], count.u16[6], count.u16[5],
                  count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

--- a/algo/shavite/shavite-hash-4way.c
+++ b/algo/shavite/shavite-hash-4way.c
@@ -227,10 +227,10 @@ void shavite512_4way_init( shavite512_4way_context *ctx )
    __m512i *h = (__m512i*)ctx->h;
    __m128i *iv = (__m128i*)IV512;
   
-   h[0] = m512_const1_128( iv[0] );
-   h[1] = m512_const1_128( iv[1] );
-   h[2] = m512_const1_128( iv[2] );
-   h[3] = m512_const1_128( iv[3] );
+   h[0] = mm512_bcast_m128( iv[0] );
+   h[1] = mm512_bcast_m128( iv[1] );
+   h[2] = mm512_bcast_m128( iv[2] );
+   h[3] = mm512_bcast_m128( iv[3] );

   ctx->ptr    = 0;
   ctx->count0 = 0;
@@ -290,7 +290,7 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
    uint32_t vp = ctx->ptr>>6;

    // Terminating byte then zero pad
-    casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
+    casti_m512i( buf, vp++ ) = mm512_bcast128lo_64( 0x0000000000000080 );

    // Zero pad full vectors up to count
    for ( ; vp < 6; vp++ )      
@@ -304,9 +304,9 @@ void shavite512_4way_close( shavite512_4way_context *ctx, void *dst )
    count.u32[2] = ctx->count2;
    count.u32[3] = ctx->count3;

-    casti_m512i( buf, 6 ) = m512_const1_128(
+    casti_m512i( buf, 6 ) = mm512_bcast_m128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); 
-    casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
+    casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
                  0x0200,       count.u16[7], count.u16[6], count.u16[5],
                  count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );
                
@@ -370,19 +370,19 @@ void shavite512_4way_update_close( shavite512_4way_context *ctx, void *dst,

   if ( vp == 0 )    // empty buf, xevan.
   { 
-      casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 );
+      casti_m512i( buf, 0 ) = mm512_bcast128lo_64( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else     // half full buf, everyone else.
   {
-    casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
+    casti_m512i( buf, vp++ ) = mm512_bcast128lo_64( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + vp, 6 - vp );
   }

-    casti_m512i( buf, 6 ) = m512_const1_128(
+    casti_m512i( buf, 6 ) = mm512_bcast_m128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) ); 
-    casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
+    casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
                  0x0200,       count.u16[7], count.u16[6], count.u16[5],
                  count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

@@ -401,10 +401,10 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,
    __m512i *h = (__m512i*)ctx->h;
    __m128i *iv = (__m128i*)IV512;

-   h[0] = m512_const1_128( iv[0] );
-   h[1] = m512_const1_128( iv[1] );
-   h[2] = m512_const1_128( iv[2] );
-   h[3] = m512_const1_128( iv[3] );
+   h[0] = mm512_bcast_m128( iv[0] );
+   h[1] = mm512_bcast_m128( iv[1] );
+   h[2] = mm512_bcast_m128( iv[2] );
+   h[3] = mm512_bcast_m128( iv[3] );

   ctx->ptr    = 
   ctx->count0 = 
@@ -461,19 +461,19 @@ void shavite512_4way_full( shavite512_4way_context *ctx, void *dst,

   if ( vp == 0 )    // empty buf, xevan.
   {
-      casti_m512i( buf, 0 ) = m512_const1_i128( 0x0000000000000080 );
+      casti_m512i( buf, 0 ) = mm512_bcast128lo_64( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + 1, 5 );
      ctx->count0 = ctx->count1 = ctx->count2 = ctx->count3 = 0;
   }
   else     // half full buf, everyone else.
   {
-    casti_m512i( buf, vp++ ) = m512_const1_i128( 0x0000000000000080 );
+    casti_m512i( buf, vp++ ) = mm512_bcast128lo_64( 0x0000000000000080 );
      memset_zero_512( (__m512i*)buf + vp, 6 - vp );
   }

-    casti_m512i( buf, 6 ) = m512_const1_128(
+    casti_m512i( buf, 6 ) = mm512_bcast_m128(
                  _mm_insert_epi16( m128_zero, count.u16[0], 7 ) );
-    casti_m512i( buf, 7 ) = m512_const1_128( _mm_set_epi16(
+    casti_m512i( buf, 7 ) = mm512_bcast_m128( _mm_set_epi16(
                  0x0200,       count.u16[7], count.u16[6], count.u16[5],
                  count.u16[4], count.u16[3], count.u16[2], count.u16[1] ) );

--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -484,14 +484,7 @@ do { \
 #undef BUTTERFLY_0
 #undef BUTTERFLY_N

-// twiddle is hard coded  T[0] = m512_const2_64( {128,64,32,16}, {8,4,2,1} )  
  // Multiply by twiddle factors
-//  X(6) = _mm512_mullo_epi16( X(6), m512_const2_64( 0x0080004000200010,
-//                                                   0x0008000400020001 );
-//  X(5) = _mm512_mullo_epi16( X(5), m512_const2_64( 0xffdc0008ffef0004,
-//                                                   0x00780002003c0001 );
-
-
  X(6) = _mm512_mullo_epi16( X(6), FFT64_Twiddle4w[0].v512 );
  X(5) = _mm512_mullo_epi16( X(5), FFT64_Twiddle4w[1].v512 );
  X(4) = _mm512_mullo_epi16( X(4), FFT64_Twiddle4w[2].v512 );
--- a/algo/sm3/sm3-hash-4way.c
+++ b/algo/sm3/sm3-hash-4way.c
@@ -74,6 +74,10 @@
   _mm256_or_si256( _mm256_and_si256( x, y ), \
                    _mm256_andnot_si256( x, z ) )

+#define mm256_rol_var_32( v, c ) \
+   _mm256_or_si256( _mm256_slli_epi32( v, c ), \
+                    _mm256_srli_epi32( v, 32-(c) ) )
+
 void sm3_8way_compress( __m256i *digest, __m256i *block )
 {
   __m256i W[68], W1[64];
@@ -251,6 +255,9 @@ void sm3_8way_close( void *cc, void *dst )
                                 _mm_andnot_si128( x, z ) )


+#define mm128_rol_var_32( v, c ) \
+   _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) )
+
 void sm3_4way_compress( __m128i *digest, __m128i *block )
 {
   __m128i W[68], W1[64];