Jay D Dee
2023-09-28 18:43:18 -04:00
parent be88afc349
commit bc5a5c6df8
88 changed files with 5526 additions and 3361 deletions

View File

@@ -6,13 +6,12 @@
#include "sha256-hash.h"
#include "compat.h"
/*
static const uint32_t H256[8] =
static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
{
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
*/
static const uint32_t K256[64] =
{
@@ -83,7 +82,7 @@ static const uint32_t K256[64] =
#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
do { \
__m128i T1, T2; \
__m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
__m128i K = v128_32( K256[( (j)+(i) )] ); \
T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
K, W[i] ) ); \
T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
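The bulk of this commit is a mechanical substitution: every raw _mm_set1_epi32 / _mm256_set1_epi32 / _mm512_set1_epi32 broadcast becomes a width-suffixed vNNN_32 (or vNNN_64) wrapper. A minimal sketch of what these wrappers are assumed to expand to, inferred from the one-for-one replacements in the diff (the real definitions live in the project's simd-utils headers):
#include <immintrin.h>
#include <stdint.h>
// Illustrative only -- inferred from the diff, not copied from simd-utils.
// Each broadcasts one scalar across every lane of the vector.
#define v128_32(x)  _mm_set1_epi32( (int)(x) )
#define v128_64(x)  _mm_set1_epi64x( (int64_t)(x) )
#define v256_32(x)  _mm256_set1_epi32( (int)(x) )
#define v256_64(x)  _mm256_set1_epi64x( (int64_t)(x) )
#define v512_32(x)  _mm512_set1_epi32( (int)(x) )
#define v512_64(x)  _mm512_set1_epi64( (int64_t)(x) )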
@@ -358,19 +357,19 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H, 8, 48 );
SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G, 9, 48 );
T0 = _mm_add_epi32( _mm_set1_epi32( K256[58] ),
T0 = _mm_add_epi32( v128_32( K256[58] ),
mm128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
B = _mm_add_epi32( B, T0 );
T1 = _mm_add_epi32( _mm_set1_epi32( K256[59] ),
T1 = _mm_add_epi32( v128_32( K256[59] ),
mm128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
A = _mm_add_epi32( A, T1 );
T2 = _mm_add_epi32( _mm_set1_epi32( K256[60] ),
T2 = _mm_add_epi32( v128_32( K256[60] ),
mm128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
H = _mm_add_epi32( H, T2 );
targ = _mm_set1_epi32( target[7] );
targ = v128_32( target[7] );
hash = mm128_bswap_32( _mm_add_epi32( H, IV7 ) );
flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ mm128_movmask_32( hash );
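SSE2 has no unsigned 32-bit compare, and mm128_movmask_32 collects sign bits, so the flip term converts the signed _mm_cmpgt_epi32 result into an unsigned comparison: XORing a per-lane "sign bits differ" mask with the signed greater-than mask yields the unsigned greater-than mask. A scalar model of one lane (a sketch, not from the source):
#include <stdint.h>
// sign_diff ^ signed_gt == unsigned_gt: when the sign bits match, signed
// and unsigned compares agree; when they differ, the operand with the top
// bit set is the larger unsigned value, exactly the case where the signed
// compare is wrong.
static inline int u32_gt_via_signed( uint32_t a, uint32_t b )
{
    int sign_diff = ( (int32_t)a < 0 ) ^ ( (int32_t)b < 0 );
    return sign_diff ^ ( (int32_t)a > (int32_t)b );
}
In the vector code flip precomputes sign(target[7]) ^ sign(hash) for all four lanes at once, so 0xf == ( flip ^ movmask( cmpgt ) ) means every lane's hash word 7 is strictly above the target and the whole nonce batch can be rejected.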
@@ -386,13 +385,13 @@ int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
// round 61 part 1
W[13] = SHA2s_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm_add_epi32( _mm_set1_epi32( K256[61] ),
T0 = _mm_add_epi32( v128_32( K256[61] ),
mm128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
G = _mm_add_epi32( G, T0 );
if ( t6_mask )
{
targ = _mm_and_si128( vmask, _mm_set1_epi32( target[6] ) );
targ = _mm_and_si128( vmask, v128_32( target[6] ) );
hash = mm128_bswap_32( _mm_add_epi32( G, IV6 ) );
if ( ( 0 != ( t6_mask & mm128_movmask_32(
@@ -440,14 +439,14 @@ return 1;
void sha256_4way_init( sha256_4way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
sc->val[0] = v128_32( sha256_iv[0] );
sc->val[1] = v128_32( sha256_iv[1] );
sc->val[2] = v128_32( sha256_iv[2] );
sc->val[3] = v128_32( sha256_iv[3] );
sc->val[4] = v128_32( sha256_iv[4] );
sc->val[5] = v128_32( sha256_iv[5] );
sc->val[6] = v128_32( sha256_iv[6] );
sc->val[7] = v128_32( sha256_iv[7] );
}
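The init rewrite is equivalence-preserving: broadcasting the 32-bit IV word produces the same byte pattern as the old doubled 64-bit constant. A self-contained check, assuming v128_32 wraps _mm_set1_epi32:
#include <immintrin.h>
#include <string.h>
#include <assert.h>
int main(void)
{
    __m128i a = _mm_set1_epi64x( 0x6A09E6676A09E667 );  // old style
    __m128i b = _mm_set1_epi32( 0x6A09E667 );           // new style
    assert( 0 == memcmp( &a, &b, sizeof a ) );          // identical lanes
    return 0;
}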
void sha256_4way_update( sha256_4way_context *sc, const void *data, size_t len )
@@ -490,7 +489,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm_set1_epi64x( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v128_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -506,8 +505,8 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm_set1_epi32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = _mm_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v128_32( bswap_32( high ) );
sc->buf[( pad+4 ) >> 2 ] = v128_32( bswap_32( low ) );
sha256_4way_transform_be( sc->val, sc->buf, sc->val );
mm128_block_bswap_32( dst, sc->val );
@@ -580,7 +579,7 @@ void sha256_4way_full( void *dst, const void *data, size_t len )
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i) ] ), \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i) ] ), \
W[ i ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
@@ -614,7 +613,7 @@ do { \
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
@@ -634,7 +633,7 @@ do { \
#define SHA256_8WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m256i T1 = mm256_add4_32( H, BSG2_1x(E), CHx(E, F, G), \
_mm256_set1_epi32( K256[(i)+(j)] ) ); \
v256_32( K256[(i)+(j)] ) ); \
__m256i T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
Y_xor_Z = X_xor_Y; \
D = _mm256_add_epi32( D, T1 ); \
@@ -643,7 +642,7 @@ do { \
#define SHA256_8WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[(j)+(i)] ), W[i] ); \
__m256i T0 = _mm256_add_epi32( v256_32( K256[(j)+(i)] ), W[i] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
T0 = _mm256_add_epi32( T0, CHx( E, F, G ) ); \
@@ -666,7 +665,7 @@ do { \
#define SHA256_8WAY_2ROUNDS( A, B, C, D, E, F, G, H, i0, i1, j ) \
do { \
__m256i T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i0) ] ), \
__m256i T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i0) ] ), \
W[ i0 ] ); \
__m256i T1 = BSG2_1x( E ); \
__m256i T2 = BSG2_0x( A ); \
@@ -677,7 +676,7 @@ do { \
D = _mm256_add_epi32( D, T1 ); \
H = _mm256_add_epi32( T1, T2 ); \
\
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[ (j)+(i1) ] ), \
T0 = _mm256_add_epi32( v256_32( K256[ (j)+(i1) ] ), \
W[ (i1) ] ); \
T1 = BSG2_1x( D ); \
T2 = BSG2_0x( H ); \
@@ -790,7 +789,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
// round 3 part 1, avoid nonces W[3]
T1 = mm256_add4_32( E, BSG2_1x(B), CHx(B, C, D),
_mm256_set1_epi32( K256[3] ) );
v256_32( K256[3] ) );
A = _mm256_add_epi32( A, T1 );
E = _mm256_add_epi32( T1, _mm256_add_epi32( BSG2_0x(F),
MAJx(F, G, H) ) );
@@ -910,12 +909,11 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
const __m256i *state_in, const uint32_t *target )
{
__m256i A, B, C, D, E, F, G, H, T0, T1, T2;
int flip;
int t6_mask;
__m256i vmask, targ, hash;
__m256i W[16]; memcpy_256( W, data, 16 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
uint8_t flip, t6_mask;
A = _mm256_load_si256( state_in );
B = _mm256_load_si256( state_in+1 );
@@ -991,26 +989,28 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
SHA256_8WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// round 58 to 60 part 1
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[58] ),
T0 = _mm256_add_epi32( v256_32( K256[58] ),
mm256_add4_32( BSG2_1x( C ), CHx( C, D, E ), W[10], F ) );
B = _mm256_add_epi32( B, T0 );
T1 = _mm256_add_epi32( _mm256_set1_epi32( K256[59] ),
T1 = _mm256_add_epi32( v256_32( K256[59] ),
mm256_add4_32( BSG2_1x( B ), CHx( B, C, D ), W[11], E ) );
A = _mm256_add_epi32( A, T1 );
T2 = _mm256_add_epi32( _mm256_set1_epi32( K256[60] ),
T2 = _mm256_add_epi32( v256_32( K256[60] ),
mm256_add4_32( BSG2_1x( A ), CHx( A, B, C ), W[12], D ) );
H = _mm256_add_epi32( H, T2 );
// Got H, test it.
targ = _mm256_set1_epi32( target[7] );
targ = v256_32( target[7] );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
if ( target[7] )
{
flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
if ( likely( 0xff == ( flip ^
mm256_movmask_32( _mm256_cmpgt_epi32( hash, targ ) ) ) ))
return 0;
return 0;
}
t6_mask = mm256_movmask_32( vmask = _mm256_cmpeq_epi32( hash, targ ) );
// round 58 part 2
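The *_transform_le_short variants cut the final rounds short once the hash is provably above or below the target. The per-lane decision ladder, modeled in scalar form (hypothetical helper name, not in the source):
#include <stdint.h>
// hash7/hash6: the two most significant words of the byte-swapped hash;
// targ7/targ6: the corresponding target words.
static inline int lane_may_be_valid( uint32_t hash7, uint32_t hash6,
                                     uint32_t targ7, uint32_t targ6 )
{
    if ( hash7 > targ7 ) return 0;   // definitely above target: reject
    if ( hash7 < targ7 ) return 1;   // definitely below: finish the hash
    return hash6 <= targ6;           // tie on word 7: word 6 decides
}
t6_mask marks the tied lanes; only when it is non-zero does the code go on to compute G and test word 6.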
@@ -1018,14 +1018,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
MAJx( G, H, A ) ) );
// round 61 part 1
W[13] = SHA256_8WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm256_add_epi32( _mm256_set1_epi32( K256[61] ),
T0 = _mm256_add_epi32( v256_32( K256[61] ),
mm256_add4_32( BSG2_1x( H ), CHx( H, A, B ), W[13], C ) );
G = _mm256_add_epi32( G, T0 );
if ( t6_mask )
{
// Testing H was inconclusive: hash7 == target7, need to test G
targ = _mm256_and_si256( vmask, _mm256_set1_epi32( target[6] ) );
targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
if ( likely( 0 == ( t6_mask & mm256_movmask_32(
@@ -1078,14 +1078,14 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
void sha256_8way_init( sha256_8way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
sc->val[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
sc->val[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
sc->val[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
sc->val[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
sc->val[0] = v256_32( sha256_iv[0] );
sc->val[1] = v256_32( sha256_iv[1] );
sc->val[2] = v256_32( sha256_iv[2] );
sc->val[3] = v256_32( sha256_iv[3] );
sc->val[4] = v256_32( sha256_iv[4] );
sc->val[5] = v256_32( sha256_iv[5] );
sc->val[6] = v256_32( sha256_iv[6] );
sc->val[7] = v256_32( sha256_iv[7] );
}
// need to handle odd byte length for yespower.
@@ -1131,7 +1131,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm256_set1_epi64x( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v256_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1147,8 +1147,8 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm256_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm256_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v256_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );
sha256_8way_transform_be( sc->val, sc->buf, sc->val );
@@ -1210,7 +1210,7 @@ void sha256_8way_full( void *dst, const void *data, size_t len )
#define SHA256_16WAY_ROUND( A, B, C, D, E, F, G, H, i, j ) \
do { \
__m512i T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[(j)+(i)] ), W[i] ); \
__m512i T0 = _mm512_add_epi32( v512_32( K256[(j)+(i)] ), W[i] ); \
__m512i T1 = BSG2_1x16( E ); \
__m512i T2 = BSG2_0x16( A ); \
T0 = _mm512_add_epi32( T0, CHx16( E, F, G ) ); \
@@ -1224,7 +1224,7 @@ do { \
#define SHA256_16WAY_ROUND_NOMSG( A, B, C, D, E, F, G, H, i, j ) \
{ \
__m512i T1 = mm512_add4_32( H, BSG2_1x16(E), CHx16(E, F, G), \
_mm512_set1_epi32( K256[(i)+(j)] ) ); \
v512_32( K256[(i)+(j)] ) ); \
__m512i T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
D = _mm512_add_epi32( D, T1 ); \
H = _mm512_add_epi32( T1, T2 ); \
@@ -1234,7 +1234,7 @@ do { \
#define SHA256_16WAY_ROUND(A, B, C, D, E, F, G, H, i, j) \
do { \
__m512i T1, T2; \
__m512i K = _mm512_set1_epi32( K256[( (j)+(i) )] ); \
__m512i K = v512_32( K256[( (j)+(i) )] ); \
T1 = _mm512_add_epi32( H, mm512_add4_32( BSG2_1x16(E), CHx16(E, F, G), \
K, W[i] ) ); \
T2 = _mm512_add_epi32( BSG2_0x16(A), MAJx16(A, B, C) ); \
@@ -1345,7 +1345,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
// round 3 part 1, avoid nonces W[3]
T1 = mm512_add4_32( E, BSG2_1x16(B), CHx16(B, C, D),
_mm512_set1_epi32( K256[3] ) );
v512_32( K256[3] ) );
A = _mm512_add_epi32( A, T1 );
E = _mm512_add_epi32( T1, _mm512_add_epi32( BSG2_0x16(F),
MAJx16(F, G, H) ) );
@@ -1566,21 +1566,22 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
SHA256_16WAY_ROUND( H, A, B, C, D, E, F, G, 9, 48 );
// rounds 58 to 60 part 1
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[58] ),
T0 = _mm512_add_epi32( v512_32( K256[58] ),
mm512_add4_32( BSG2_1x16( C ), CHx16( C, D, E ), W[10], F ) );
B = _mm512_add_epi32( B, T0 );
T1 = _mm512_add_epi32( _mm512_set1_epi32( K256[59] ),
T1 = _mm512_add_epi32( v512_32( K256[59] ),
mm512_add4_32( BSG2_1x16( B ), CHx16( B, C, D ), W[11], E ) );
A = _mm512_add_epi32( A, T1 );
T2 = _mm512_add_epi32( _mm512_set1_epi32( K256[60] ),
T2 = _mm512_add_epi32( v512_32( K256[60] ),
mm512_add4_32( BSG2_1x16( A ), CHx16( A, B, C ), W[12], D ) );
H = _mm512_add_epi32( H, T2 );
// got H, test it against target[7]
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
targ = _mm512_set1_epi32( target[7] );
hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
targ = v512_32( target[7] );
if ( target[7] )
if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
return 0;
t6_mask = _mm512_cmpeq_epi32_mask( hash, targ );
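On AVX-512 the sign-flip trick disappears: _mm512_cmple_epu32_mask is a native unsigned compare that returns a 16-bit k-mask, so the rejection test is a plain integer check. The same test in isolation:
__mmask16 le = _mm512_cmple_epu32_mask( hash, targ );  // 1 bit per lane
if ( le == 0 ) return 0;   // no lane's hash word 7 is <= the target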
@@ -1591,15 +1592,15 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
// round 61 part 1
W[13] = SHA256_16WAY_MEXP( W[11], W[ 6], W[14], W[13] );
T0 = _mm512_add_epi32( _mm512_set1_epi32( K256[61] ),
T0 = _mm512_add_epi32( v512_32( K256[61] ),
mm512_add4_32( BSG2_1x16( H ), CHx16( H, A, B ), W[13], C ) );
G = _mm512_add_epi32( G, T0 );
// got G, test it against target[6] if indicated
if ( t6_mask != 0 )
if ( (uint16_t)t6_mask )
{
hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
targ = _mm512_set1_epi32( target[6] );
targ = v512_32( target[6] );
if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
return 0;
}
@@ -1637,14 +1638,14 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
void sha256_16way_init( sha256_16way_context *sc )
{
sc->count_high = sc->count_low = 0;
sc->val[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
sc->val[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
sc->val[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
sc->val[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
sc->val[4] = _mm512_set1_epi64( 0x510E527F510E527F );
sc->val[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
sc->val[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
sc->val[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
sc->val[0] = v512_32( sha256_iv[0] );
sc->val[1] = v512_32( sha256_iv[1] );
sc->val[2] = v512_32( sha256_iv[2] );
sc->val[3] = v512_32( sha256_iv[3] );
sc->val[4] = v512_32( sha256_iv[4] );
sc->val[5] = v512_32( sha256_iv[5] );
sc->val[6] = v512_32( sha256_iv[6] );
sc->val[7] = v512_32( sha256_iv[7] );
}
void sha256_16way_update( sha256_16way_context *sc, const void *data,
@@ -1688,7 +1689,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
const int pad = buf_size - 8;
ptr = (unsigned)sc->count_low & (buf_size - 1U);
sc->buf[ ptr>>2 ] = _mm512_set1_epi64( 0x0000008000000080 );
sc->buf[ ptr>>2 ] = v512_64( 0x0000008000000080 );
ptr += 4;
if ( ptr > pad )
@@ -1704,8 +1705,8 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
high = (sc->count_high << 3) | (low >> 29);
low = low << 3;
sc->buf[ pad >> 2 ] = _mm512_set1_epi32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = _mm512_set1_epi32( bswap_32( low ) );
sc->buf[ pad >> 2 ] = v512_32( bswap_32( high ) );
sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );
sha256_16way_transform_be( sc->val, sc->buf, sc->val );

File diff suppressed because it is too large

View File

@@ -118,10 +118,10 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -130,42 +130,42 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
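// The 3-round prehash is sound because SHA-256 round i consumes message
// word W[i] and the nonces occupy only W[3] (buf[3] above); W[0..2] come
// from pdata[16..18], constant across the whole batch, so rounds 0-2 (and
// part of the message expansion) are hashed once here instead of once per
// loop iteration.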
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
istate[0] = v512_32( sha256_iv[0] );
istate[1] = v512_32( sha256_iv[1] );
istate[2] = v512_32( sha256_iv[2] );
istate[3] = v512_32( sha256_iv[3] );
istate[4] = v512_32( sha256_iv[4] );
istate[5] = v512_32( sha256_iv[5] );
istate[6] = v512_32( sha256_iv[6] );
istate[7] = v512_32( sha256_iv[7] );
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
block[15] = v512_32( 32*8 ); // bit count
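// Shape of the sha256d scan loop below (annotation, not source): the
// 80-byte header splits into two 64-byte SHA-256 blocks.
//   phash = sha256_block( header[0..63] )                -- once, scalar
//   per batch of 16 nonces:
//     h1 = sha256_block( header[64..79] | nonce | pad )   with iv = phash
//     h2 = sha256_block( h1 | pad )                       with iv = sha256_iv
//     test h2 against the target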
do
{
@@ -216,33 +216,33 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 );
vdata[16+15] = v256_32( 80*8 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 );
block[15] = v256_32( 32*8 );
// initialize state for second hash
istate[0] = _mm256_set1_epi32( sha256_iv[0] );
istate[1] = _mm256_set1_epi32( sha256_iv[1] );
istate[2] = _mm256_set1_epi32( sha256_iv[2] );
istate[3] = _mm256_set1_epi32( sha256_iv[3] );
istate[4] = _mm256_set1_epi32( sha256_iv[4] );
istate[5] = _mm256_set1_epi32( sha256_iv[5] );
istate[6] = _mm256_set1_epi32( sha256_iv[6] );
istate[7] = _mm256_set1_epi32( sha256_iv[7] );
istate[0] = v256_32( sha256_iv[0] );
istate[1] = v256_32( sha256_iv[1] );
istate[2] = v256_32( sha256_iv[2] );
istate[3] = v256_32( sha256_iv[3] );
istate[4] = v256_32( sha256_iv[4] );
istate[5] = v256_32( sha256_iv[5] );
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -298,31 +298,31 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 );
vdata[16+15] = v128_32( 80*8 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 );
block[15] = v128_32( 32*8 );
// initialize state
istate[0] = _mm_set1_epi32( sha256_iv[0] );
istate[1] = _mm_set1_epi32( sha256_iv[1] );
istate[2] = _mm_set1_epi32( sha256_iv[2] );
istate[3] = _mm_set1_epi32( sha256_iv[3] );
istate[4] = _mm_set1_epi32( sha256_iv[4] );
istate[5] = _mm_set1_epi32( sha256_iv[5] );
istate[6] = _mm_set1_epi32( sha256_iv[6] );
istate[7] = _mm_set1_epi32( sha256_iv[7] );
istate[0] = v128_32( sha256_iv[0] );
istate[1] = v128_32( sha256_iv[1] );
istate[2] = v128_32( sha256_iv[2] );
istate[3] = v128_32( sha256_iv[3] );
istate[4] = v128_32( sha256_iv[4] );
istate[5] = v128_32( sha256_iv[5] );
istate[6] = v128_32( sha256_iv[6] );
istate[7] = v128_32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( mstate, vdata, istate );

View File

@@ -51,8 +51,7 @@ int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
// fill & pad second bock without nonce
memcpy( block1a, pdata + 16, 12 );
memcpy( block1b, pdata + 16, 12 );
block1a[ 3] = 0;
block1b[ 3] = 0;
block1a[ 3] = block1b[ 3] = 0;
block1a[ 4] = block1b[ 4] = 0x80000000;
memset( block1a + 5, 0, 40 );
memset( block1b + 5, 0, 40 );
@@ -128,10 +127,10 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
const uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -140,42 +139,42 @@ int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256dt_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt funky bit count
buf[15] = v512_32( 0x480 ); // sha256dt funky bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for second hash
istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
istate[0] = v512_32( sha256dt_iv[0] );
istate[1] = v512_32( sha256dt_iv[1] );
istate[2] = v512_32( sha256dt_iv[2] );
istate[3] = v512_32( sha256dt_iv[3] );
istate[4] = v512_32( sha256dt_iv[4] );
istate[5] = v512_32( sha256dt_iv[5] );
istate[6] = v512_32( sha256dt_iv[6] );
istate[7] = v512_32( sha256dt_iv[7] );
// initialize padding for second hash
block[ 8] = last_byte;
memset_zero_512( block+9, 6 );
block[15] = _mm512_set1_epi32( 0x300 ); // bit count
block[15] = v512_32( 0x300 ); // bit count
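// Decoding the "funky" sha256dt length fields (plain arithmetic):
//   0x480 = 1152 bits = 144 bytes, where standard padding of the 80-byte
//           header would encode 80*8 = 640 = 0x280;
//   0x300 =  768 bits =  96 bytes, versus 32*8 = 256 = 0x100 for a
//           standard 32-byte second hash.
// Both counts are inflated by exactly 0x200 = 512 bits = one 64-byte block.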
do
{
@@ -226,33 +225,33 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 0x480 );
vdata[16+15] = v256_32( 0x480 );
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 0x300 );
block[15] = v256_32( 0x300 );
// initialize state for swecond hash
istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
// initialize state for second hash
istate[0] = v256_32( sha256dt_iv[0] );
istate[1] = v256_32( sha256dt_iv[1] );
istate[2] = v256_32( sha256dt_iv[2] );
istate[3] = v256_32( sha256dt_iv[3] );
istate[4] = v256_32( sha256dt_iv[4] );
istate[5] = v256_32( sha256dt_iv[5] );
istate[6] = v256_32( sha256dt_iv[6] );
istate[7] = v256_32( sha256dt_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -308,31 +307,31 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 0x480 );
vdata[16+15] = v128_32( 0x480 );
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 0x300 );
block[15] = v128_32( 0x300 );
// initialize state
initstate[0] = _mm_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
initstate[1] = _mm_set1_epi64x( 0xb72074d4b72074d4 );
initstate[2] = _mm_set1_epi64x( 0x6bb011226bb01122 );
initstate[3] = _mm_set1_epi64x( 0xd338e869d338e869 );
initstate[4] = _mm_set1_epi64x( 0xaa3ff126aa3ff126 );
initstate[5] = _mm_set1_epi64x( 0x475bbf30475bbf30 );
initstate[6] = _mm_set1_epi64x( 0x8fd52e5b8fd52e5b );
initstate[7] = _mm_set1_epi64x( 0x9f75c9ad9f75c9ad );
initstate[0] = v128_32( sha256dt_iv[0] );
initstate[1] = v128_32( sha256dt_iv[1] );
initstate[2] = v128_32( sha256dt_iv[2] );
initstate[3] = v128_32( sha256dt_iv[3] );
initstate[4] = v128_32( sha256dt_iv[4] );
initstate[5] = v128_32( sha256dt_iv[5] );
initstate[6] = v128_32( sha256dt_iv[6] );
initstate[7] = v128_32( sha256dt_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( midstate, vdata, initstate );
@@ -342,21 +341,18 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
sha256_4way_transform_le( block, vdata+16, midstate );
sha256_4way_transform_le( hash32, block, initstate );
// if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
// {
mm128_block_bswap_32( hash32, hash32 );
mm128_block_bswap_32( hash32, hash32 );
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
for ( int lane = 0; lane < 4; lane++ )
if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
extr_lane_4x32( lane_hash, hash32, lane, 256 );
if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
{
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
pdata[19] = n + lane;
submit_solution( work, lane_hash, mythr );
}
// }
}
*noncev = _mm_add_epi32( *noncev, four );
n += 4;
} while ( (n < last_nonce) && !work_restart[thr_id].restart );

View File

@@ -30,10 +30,10 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
const uint32_t targ32_d7 = ptarget[7];
const uint32_t first_nonce = pdata[19];
const uint32_t last_nonce = max_nonce - 16;
const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
const __m512i last_byte = v512_32( 0x80000000 );
uint32_t n = first_nonce;
const int thr_id = mythr->id;
const __m512i sixteen = _mm512_set1_epi32( 16 );
const __m512i sixteen = v512_32( 16 );
const bool bench = opt_benchmark;
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
@@ -42,42 +42,42 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
sha256_transform_le( phash, pdata, sha256_iv );
// vectorize block 0 hash for second block
mstate1[0] = _mm512_set1_epi32( phash[0] );
mstate1[1] = _mm512_set1_epi32( phash[1] );
mstate1[2] = _mm512_set1_epi32( phash[2] );
mstate1[3] = _mm512_set1_epi32( phash[3] );
mstate1[4] = _mm512_set1_epi32( phash[4] );
mstate1[5] = _mm512_set1_epi32( phash[5] );
mstate1[6] = _mm512_set1_epi32( phash[6] );
mstate1[7] = _mm512_set1_epi32( phash[7] );
mstate1[0] = v512_32( phash[0] );
mstate1[1] = v512_32( phash[1] );
mstate1[2] = v512_32( phash[2] );
mstate1[3] = v512_32( phash[3] );
mstate1[4] = v512_32( phash[4] );
mstate1[5] = v512_32( phash[5] );
mstate1[6] = v512_32( phash[6] );
mstate1[7] = v512_32( phash[7] );
// second message block data, with nonce & padding
buf[0] = _mm512_set1_epi32( pdata[16] );
buf[1] = _mm512_set1_epi32( pdata[17] );
buf[2] = _mm512_set1_epi32( pdata[18] );
buf[0] = v512_32( pdata[16] );
buf[1] = v512_32( pdata[17] );
buf[2] = v512_32( pdata[18] );
buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+ 1, n );
buf[4] = last_byte;
memset_zero_512( buf+5, 10 );
buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
buf[15] = v512_32( 80*8 ); // bit count
// partially pre-expand & prehash second message block, avoiding the nonces
sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
// vectorize IV for 2nd & 3rd sha256
istate[0] = _mm512_set1_epi32( sha256_iv[0] );
istate[1] = _mm512_set1_epi32( sha256_iv[1] );
istate[2] = _mm512_set1_epi32( sha256_iv[2] );
istate[3] = _mm512_set1_epi32( sha256_iv[3] );
istate[4] = _mm512_set1_epi32( sha256_iv[4] );
istate[5] = _mm512_set1_epi32( sha256_iv[5] );
istate[6] = _mm512_set1_epi32( sha256_iv[6] );
istate[7] = _mm512_set1_epi32( sha256_iv[7] );
istate[0] = v512_32( sha256_iv[0] );
istate[1] = v512_32( sha256_iv[1] );
istate[2] = v512_32( sha256_iv[2] );
istate[3] = v512_32( sha256_iv[3] );
istate[4] = v512_32( sha256_iv[4] );
istate[5] = v512_32( sha256_iv[5] );
istate[6] = v512_32( sha256_iv[6] );
istate[7] = v512_32( sha256_iv[7] );
// initialize padding for 2nd & 3rd sha256
block[ 8] = last_byte;
memset_zero_512( block + 9, 6 );
block[15] = _mm512_set1_epi32( 32*8 ); // bit count
block[15] = v512_32( 32*8 ); // bit count
do
{
@@ -222,33 +222,33 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
__m256i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
const __m256i eight = _mm256_set1_epi32( 8 );
const __m256i last_byte = v256_32( 0x80000000 );
const __m256i eight = v256_32( 8 );
const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm256_set1_epi32( pdata[i] );
vdata[i] = v256_32( pdata[i] );
*noncev = _mm256_set_epi32( n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_256( vdata+16 + 5, 10 );
vdata[16+15] = _mm256_set1_epi32( 80*8 ); // bit count
vdata[16+15] = v256_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_256( block + 9, 6 );
block[15] = _mm256_set1_epi32( 32*8 ); // bit count
block[15] = v256_32( 32*8 ); // bit count
// initialize state
istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = v256_32( sha256_iv[0] );
istate[1] = v256_32( sha256_iv[1] );
istate[2] = v256_32( sha256_iv[2] );
istate[3] = v256_32( sha256_iv[3] );
istate[4] = v256_32( sha256_iv[4] );
istate[5] = v256_32( sha256_iv[5] );
istate[6] = v256_32( sha256_iv[6] );
istate[7] = v256_32( sha256_iv[7] );
sha256_8way_transform_le( mstate1, vdata, istate );
@@ -313,31 +313,31 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
__m128i *noncev = vdata + 19;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
const __m128i four = _mm_set1_epi32( 4 );
const __m128i last_byte = v128_32( 0x80000000 );
const __m128i four = v128_32( 4 );
for ( int i = 0; i < 19; i++ )
vdata[i] = _mm_set1_epi32( pdata[i] );
vdata[i] = v128_32( pdata[i] );
*noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
vdata[16+4] = last_byte;
memset_zero_128( vdata+16 + 5, 10 );
vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
vdata[16+15] = v128_32( 80*8 ); // bit count
block[ 8] = last_byte;
memset_zero_128( block + 9, 6 );
block[15] = _mm_set1_epi32( 32*8 ); // bit count
block[15] = v128_32( 32*8 ); // bit count
// initialize state
istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
istate[0] = v128_32( sha256_iv[0] );
istate[1] = v128_32( sha256_iv[1] );
istate[2] = v128_32( sha256_iv[2] );
istate[3] = v128_32( sha256_iv[3] );
istate[4] = v128_32( sha256_iv[4] );
istate[5] = v128_32( sha256_iv[5] );
istate[6] = v128_32( sha256_iv[6] );
istate[7] = v128_32( sha256_iv[7] );
// hash first 64 bytes of data
sha256_4way_transform_le( mstate, vdata, istate );

View File

@@ -39,57 +39,429 @@
/*
static const uint64_t H512[8] =
{
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
*/
static const uint64_t K512[80] =
{
0x428A2F98D728AE22, 0x7137449123EF65CD,
0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019,
0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE,
0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1,
0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3,
0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483,
0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210,
0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725,
0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926,
0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8,
0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001,
0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910,
0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53,
0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB,
0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60,
0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9,
0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207,
0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6,
0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493,
0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A,
0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
#if defined(__AVX2__) && defined(__SHA512__)
// SHA-512 implemented using SHA512 CPU extension.
// Experimental. Not tested. Not reviewed. Compile tested only.
// Needs GCC-13 for compilation.
// Needs Intel Lunar Lake or Arrow Lake CPU, or AMD Zen-{5,6}? for execution.
// Modelled after noloader sha256 implementation.
// It's not clear how SHA512 will be supported before AVX10, considering how
// dependent it is on _mm256_alignr_epi64, which is only available with
// AVX512VL until AVX10-256.
#if defined(__AVX512VL__)
#define mm256_alignr_1x64( v1, v0 ) _mm256_alignr_epi64( v1, v0, 1 )
#else
// Ugly workaround to make it work with AVX2
static const __m256i mask __attribute__ ((aligned (32)))
= { 0xffffffffffffffffull, 0ull, 0ull, 0ull };
#define mm256_alignr_1x64( v1, v0 ) \
_mm256_or_si256( _mm256_and_si256( mm256_shuflr_64( v1 ), mask ), \
_mm256_and_si256( mm256_shuflr_64( v0 ), mm256_not(mask) ) )
#endif
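For reference, the semantics both branches of the macro must reproduce: _mm256_alignr_epi64( v1, v0, 1 ) (VALIGNQ, imm = 1) shifts the 512-bit concatenation v1:v0 right by one 64-bit element and keeps the low 256 bits. A scalar reference the AVX2 fallback could be tested against (a sketch):
#include <stdint.h>
// dst.q[0..3] = { v0.q[1], v0.q[2], v0.q[3], v1.q[0] }
static inline void alignr_1x64_ref( uint64_t d[4], const uint64_t v1[4],
                                    const uint64_t v0[4] )
{
    d[0] = v0[1]; d[1] = v0[2]; d[2] = v0[3]; d[3] = v1[0];
}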
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP, BSWAP64;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
0x0001020304050607 ) );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
// Save initial state
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+32) );
TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+64) );
TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+96) );
TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
// Save state
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
}
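A hypothetical usage sketch (not part of the commit): hashing one 128-byte big-endian block starting from the standard SHA-512 IV shown in the H512 comment at the top of the file; block is assumed to point at 128 bytes of input.
uint64_t state[8] __attribute__ ((aligned (32))) =
{
    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
    0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
sha512_opt_transform_be( state, block, state );   // one 128-byte block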
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in )
{
__m256i STATE0, STATE1;
__m256i MSG, TMP;
__m256i TMSG0, TMSG1, TMSG2, TMSG3;
__m256i ABEF_SAVE, CDGH_SAVE;
// Load initial values
TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
TMP = _mm256_permute4x64_epi64( TMP, 0xB1 ); // CDAB
STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B ); // EFGH
STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
STATE1 = _mm256_blend_epi32( STATE1, TMP, 0xF0 ); // CDGH
// Save initial state
ABEF_SAVE = STATE0;
CDGH_SAVE = STATE1;
// Rounds 0-3
TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 4-7
TMSG1 = _mm256_load_si256( (const __m256i*) (input+32) );
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 8-11
TMSG2 = _mm256_load_si256( (const __m256i*) (input+64) );
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 12-15
TMSG3 = _mm256_load_si256( (const __m256i*) (input+96) );
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = _mm256_shuffle2_64( TMSG3, TMSG2, 1 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 16-19
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 4 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 20-23
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 5 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 24-27
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 6 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 28-31
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 7 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 32-35
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 8 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 36-39
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 9 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG0 = _mm256_sha512msg1_epi64( TMSG0, TMSG1 );
// Rounds 40-43
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 10 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG1 = _mm256_sha512msg1_epi64( TMSG1, TMSG2 );
// Rounds 44-47
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 11 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG3, TMSG2 );
TMSG0 = _mm256_add_epi64( TMSG0, TMP );
TMSG0 = _mm256_sha512msg2_epi64( TMSG0, TMSG3 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG2 = _mm256_sha512msg1_epi64( TMSG2, TMSG3 );
// Rounds 48-51
MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 12 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG0, TMSG3 );
TMSG1 = _mm256_add_epi64( TMSG1, TMP );
TMSG1 = _mm256_sha512msg2_epi64( TMSG1, TMSG0 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
TMSG3 = _mm256_sha512msg1_epi64( TMSG3, TMSG0 );
// Rounds 52-55
MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 13 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG1, TMSG0 );
TMSG2 = _mm256_add_epi64( TMSG2, TMP );
TMSG2 = _mm256_sha512msg2_epi64( TMSG2, TMSG1 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 56-59
MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 14 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
TMP = mm256_alignr_1x64( TMSG2, TMSG1 );
TMSG3 = _mm256_add_epi64( TMSG3, TMP );
TMSG3 = _mm256_sha512msg2_epi64( TMSG3, TMSG2 );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Rounds 60-63
MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 15 ) );
STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0, MSG );
MSG = _mm256_permute4x64_epi64( MSG, 0x0E );
STATE0 = _mm256_sha512rnds2_epi64( STATE0, STATE1, MSG );
// Add initial state
STATE0 = _mm256_add_epi64( STATE0, ABEF_SAVE );
STATE1 = _mm256_add_epi64( STATE1, CDGH_SAVE );
TMP = _mm256_permute4x64_epi64( STATE0, 0x1B ); // FEBA
STATE1 = _mm256_permute4x64_epi64( STATE1, 0xB1 ); // DCHG
STATE0 = _mm256_blend_epi32( TMP, STATE1, 0xF0 ); // DCBA
STATE1 = _mm256_permute2x128_si256( STATE1, TMP, 0x21 ); // ABEF
// Save state
_mm256_store_si256((__m256i*) &state_out[0], STATE0 );
_mm256_store_si256((__m256i*) &state_out[4], STATE1 );
}
#endif
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
@@ -117,7 +489,7 @@ static const uint64_t K512[80] =
#define SHA3_8WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m512i T0 = _mm512_add_epi64( _mm512_set1_epi64( K512[i] ), W[ i ] ); \
__m512i T0 = _mm512_add_epi64( v512_64( K512[i] ), W[ i ] ); \
__m512i T1 = BSG8W_5_1( E ); \
__m512i T2 = BSG8W_5_0( A ); \
T0 = _mm512_add_epi64( T0, CH8W( E, F, G ) ); \
@@ -155,14 +527,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
}
else
{
A = _mm512_set1_epi64( 0x6A09E667F3BCC908 );
B = _mm512_set1_epi64( 0xBB67AE8584CAA73B );
C = _mm512_set1_epi64( 0x3C6EF372FE94F82B );
D = _mm512_set1_epi64( 0xA54FF53A5F1D36F1 );
E = _mm512_set1_epi64( 0x510E527FADE682D1 );
F = _mm512_set1_epi64( 0x9B05688C2B3E6C1F );
G = _mm512_set1_epi64( 0x1F83D9ABFB41BD6B );
H = _mm512_set1_epi64( 0x5BE0CD19137E2179 );
A = v512_64( 0x6A09E667F3BCC908 );
B = v512_64( 0xBB67AE8584CAA73B );
C = v512_64( 0x3C6EF372FE94F82B );
D = v512_64( 0xA54FF53A5F1D36F1 );
E = v512_64( 0x510E527FADE682D1 );
F = v512_64( 0x9B05688C2B3E6C1F );
G = v512_64( 0x1F83D9ABFB41BD6B );
H = v512_64( 0x5BE0CD19137E2179 );
}
for ( i = 0; i < 80; i += 8 )
@@ -191,14 +563,14 @@ sha512_8way_round( sha512_8way_context *ctx, __m512i *in, __m512i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm512_add_epi64( A, _mm512_set1_epi64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, _mm512_set1_epi64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, _mm512_set1_epi64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, _mm512_set1_epi64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, _mm512_set1_epi64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, _mm512_set1_epi64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, _mm512_set1_epi64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, _mm512_set1_epi64( 0x5BE0CD19137E2179 ) );
r[0] = _mm512_add_epi64( A, v512_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm512_add_epi64( B, v512_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm512_add_epi64( C, v512_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm512_add_epi64( D, v512_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm512_add_epi64( E, v512_64( 0x510E527FADE682D1 ) );
r[5] = _mm512_add_epi64( F, v512_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm512_add_epi64( G, v512_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm512_add_epi64( H, v512_64( 0x5BE0CD19137E2179 ) );
}
}
@@ -243,7 +615,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm512_set1_epi64( 0x80 );
sc->buf[ ptr>>3 ] = v512_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
@@ -255,9 +627,9 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count >> 61 ), shuff_bswap64 );
v512_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
_mm512_set1_epi64( sc->count << 3 ), shuff_bswap64 );
v512_64( sc->count << 3 ), shuff_bswap64 );
sha512_8way_round( sc, sc->buf, sc->val );
mm512_block_bswap_64( dst, sc->val );
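The two words stored just above encode SHA-512's 128-bit big-endian bit length: sc->count holds bytes, so the low word is count << 3 and the high word is count >> 61. A scalar sketch of the same split (helper name illustrative):

#include <stdint.h>

// Split a byte count into the 128-bit bit length SHA-512 appends.
static inline void sha512_bitlen( uint64_t count_bytes,
                                  uint64_t *hi, uint64_t *lo )
{
   *hi = count_bytes >> 61;   // upper 64 bits of 8 * count_bytes
   *lo = count_bytes << 3;    // lower 64 bits of 8 * count_bytes
}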
@@ -295,7 +667,7 @@ void sha512_8way_close( sha512_8way_context *sc, void *dst )
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -317,7 +689,7 @@ do { \
#define SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i ) \
do { \
__m256i T0 = _mm256_add_epi64( _mm256_set1_epi64x( K512[i] ), W[i] ); \
__m256i T0 = _mm256_add_epi64( v256_64( K512[i] ), W[i] ); \
__m256i T1 = BSG5_1( E ); \
__m256i T2 = BSG5_0( A ); \
T0 = _mm256_add_epi64( T0, CH( E, F, G ) ); \
@@ -364,14 +736,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
}
else
{
A = _mm256_set1_epi64x( 0x6A09E667F3BCC908 );
B = _mm256_set1_epi64x( 0xBB67AE8584CAA73B );
C = _mm256_set1_epi64x( 0x3C6EF372FE94F82B );
D = _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 );
E = _mm256_set1_epi64x( 0x510E527FADE682D1 );
F = _mm256_set1_epi64x( 0x9B05688C2B3E6C1F );
G = _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B );
H = _mm256_set1_epi64x( 0x5BE0CD19137E2179 );
A = v256_64( 0x6A09E667F3BCC908 );
B = v256_64( 0xBB67AE8584CAA73B );
C = v256_64( 0x3C6EF372FE94F82B );
D = v256_64( 0xA54FF53A5F1D36F1 );
E = v256_64( 0x510E527FADE682D1 );
F = v256_64( 0x9B05688C2B3E6C1F );
G = v256_64( 0x1F83D9ABFB41BD6B );
H = v256_64( 0x5BE0CD19137E2179 );
}
#if !defined(__AVX512VL__)
@@ -405,14 +777,14 @@ sha512_4way_round( sha512_4way_context *ctx, __m256i *in, __m256i r[8] )
else
{
ctx->initialized = true;
r[0] = _mm256_add_epi64( A, _mm256_set1_epi64x( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, _mm256_set1_epi64x( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, _mm256_set1_epi64x( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, _mm256_set1_epi64x( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, _mm256_set1_epi64x( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, _mm256_set1_epi64x( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, _mm256_set1_epi64x( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, _mm256_set1_epi64x( 0x5BE0CD19137E2179 ) );
r[0] = _mm256_add_epi64( A, v256_64( 0x6A09E667F3BCC908 ) );
r[1] = _mm256_add_epi64( B, v256_64( 0xBB67AE8584CAA73B ) );
r[2] = _mm256_add_epi64( C, v256_64( 0x3C6EF372FE94F82B ) );
r[3] = _mm256_add_epi64( D, v256_64( 0xA54FF53A5F1D36F1 ) );
r[4] = _mm256_add_epi64( E, v256_64( 0x510E527FADE682D1 ) );
r[5] = _mm256_add_epi64( F, v256_64( 0x9B05688C2B3E6C1F ) );
r[6] = _mm256_add_epi64( G, v256_64( 0x1F83D9ABFB41BD6B ) );
r[7] = _mm256_add_epi64( H, v256_64( 0x5BE0CD19137E2179 ) );
}
}
@@ -457,7 +829,7 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
ptr = (unsigned)sc->count & (buf_size - 1U);
sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
sc->buf[ ptr>>3 ] = v256_64( 0x80 );
ptr += 8;
if ( ptr > pad )
{
@@ -469,9 +841,9 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
_mm256_set1_epi64x( sc->count >> 61 ), shuff_bswap64 );
v256_64( sc->count >> 61 ), shuff_bswap64 );
sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8(
_mm256_set1_epi64x( sc->count << 3 ), shuff_bswap64 );
v256_64( sc->count << 3 ), shuff_bswap64 );
sha512_4way_round( sc, sc->buf, sc->val );
mm256_block_bswap_64( dst, sc->val );

View File

@@ -5,11 +5,32 @@
#include "simd-utils.h"
#include "sph_sha2.h"
#if defined(__SHA512__) && defined(__AVX2__)
// Experimental, untested.
// Intended as a drop-in replacement for sph_sha512.
typedef struct
{
uint64_t buf[128>>3];
uint64_t val[8];
uint64_t count;
} sha512_context __attribute__ ((aligned (64)));
void sha512_opt_transform_be( uint64_t *state_out, const void *input,
const uint64_t *state_in );
void sha512_opt_transform_le( uint64_t *state_out, const void *input,
const uint64_t *state_in );
#endif
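A minimal usage sketch for the transform declared above, assuming the caller supplies one fully padded 128-byte block; sha512_iv is the standard SHA-512 initial state, and the function and buffer names are illustrative:

#include <stdint.h>

static const uint64_t sha512_iv[8] =
{
   0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
   0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
   0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
   0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};

void example_one_block( const uint8_t block[128] )
{
   uint64_t state[8];
   // state receives the chaining value after one compression.
   sha512_opt_transform_be( state, block, sha512_iv );
}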
#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
// SHA-512 8 way
typedef struct {
typedef struct
{
__m512i buf[128>>3];
__m512i val[8];
uint64_t count;
@@ -28,7 +49,8 @@ void sha512_8way_full( void *dst, const void *data, size_t len );
// SHA-512 4 way
typedef struct {
typedef struct
{
__m256i buf[128>>3];
__m256i val[8];
uint64_t count;

View File

@@ -16,14 +16,14 @@ static void sha512256d_8way_init( sha512_8way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = _mm512_set1_epi64( 0x22312194FC2BF72C );
ctx->val[1] = _mm512_set1_epi64( 0x9F555FA3C84C64C2 );
ctx->val[2] = _mm512_set1_epi64( 0x2393B86B6F53B151 );
ctx->val[3] = _mm512_set1_epi64( 0x963877195940EABD );
ctx->val[4] = _mm512_set1_epi64( 0x96283EE2A88EFFE3 );
ctx->val[5] = _mm512_set1_epi64( 0xBE5E1E2553863992 );
ctx->val[6] = _mm512_set1_epi64( 0x2B0199FC2C85B8AA );
ctx->val[7] = _mm512_set1_epi64( 0x0EB72DDC81C52CA2 );
ctx->val[0] = v512_64( 0x22312194FC2BF72C );
ctx->val[1] = v512_64( 0x9F555FA3C84C64C2 );
ctx->val[2] = v512_64( 0x2393B86B6F53B151 );
ctx->val[3] = v512_64( 0x963877195940EABD );
ctx->val[4] = v512_64( 0x96283EE2A88EFFE3 );
ctx->val[5] = v512_64( 0xBE5E1E2553863992 );
ctx->val[6] = v512_64( 0x2B0199FC2C85B8AA );
ctx->val[7] = v512_64( 0x0EB72DDC81C52CA2 );
}
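The eight constants above are the SHA-512/256 initial values from FIPS 180-4, and the trailing d denotes a double hash. A scalar sketch of what each vector lane computes, with sha512_256() standing in for any SHA-512/256 implementation:

#include <stddef.h>
#include <stdint.h>

// Hypothetical primitive: any conforming SHA-512/256 implementation.
void sha512_256( uint8_t out[32], const void *in, size_t len );

void sha512256d( uint8_t out[32], const void *in, size_t len )
{
   uint8_t mid[32];
   sha512_256( mid, in, len );   // first pass
   sha512_256( out, mid, 32 );   // second pass over the 32-byte digest
}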
int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
@@ -43,7 +43,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
__m512i *noncev = (__m512i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m512i eight = _mm512_set1_epi64( 0x0000000800000000 );
const __m512i eight = v512_64( 0x0000000800000000 );
mm512_bswap32_intrlv80_8x64( vdata, pdata );
*noncev = mm512_intrlv_blend_32(
@@ -84,14 +84,14 @@ static void sha512256d_4way_init( sha512_4way_context *ctx )
{
ctx->count = 0;
ctx->initialized = true;
ctx->val[0] = _mm256_set1_epi64x( 0x22312194FC2BF72C );
ctx->val[1] = _mm256_set1_epi64x( 0x9F555FA3C84C64C2 );
ctx->val[2] = _mm256_set1_epi64x( 0x2393B86B6F53B151 );
ctx->val[3] = _mm256_set1_epi64x( 0x963877195940EABD );
ctx->val[4] = _mm256_set1_epi64x( 0x96283EE2A88EFFE3 );
ctx->val[5] = _mm256_set1_epi64x( 0xBE5E1E2553863992 );
ctx->val[6] = _mm256_set1_epi64x( 0x2B0199FC2C85B8AA );
ctx->val[7] = _mm256_set1_epi64x( 0x0EB72DDC81C52CA2 );
ctx->val[0] = v256_64( 0x22312194FC2BF72C );
ctx->val[1] = v256_64( 0x9F555FA3C84C64C2 );
ctx->val[2] = v256_64( 0x2393B86B6F53B151 );
ctx->val[3] = v256_64( 0x963877195940EABD );
ctx->val[4] = v256_64( 0x96283EE2A88EFFE3 );
ctx->val[5] = v256_64( 0xBE5E1E2553863992 );
ctx->val[6] = v256_64( 0x2B0199FC2C85B8AA );
ctx->val[7] = v256_64( 0x0EB72DDC81C52CA2 );
}
int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
@@ -111,7 +111,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
__m256i *noncev = (__m256i*)vdata + 9;
const int thr_id = mythr->id;
const bool bench = opt_benchmark;
const __m256i four = _mm256_set1_epi64x( 0x0000000400000000 );
const __m256i four = v256_64( 0x0000000400000000 );
mm256_bswap32_intrlv80_4x64( vdata, pdata );
*noncev = mm256_intrlv_blend_32(