v25.4

2025-09-17 23:44:27 +00:00 · 2025-06-20 20:31:41 -04:00
parent dd99580a4c
commit 66191db93c
86 changed files with 2701 additions and 4322 deletions
--- a/algo/sha/hmac-sha256-hash-4way.c
+++ b/algo/sha/hmac-sha256-hash-4way.c
@@ -31,7 +31,7 @@
 #include "hmac-sha256-hash-4way.h"
 #include "compat.h"

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)
-// HMAC 4-way SSE2
+// HMAC 4-way, x86_64 with SSE2 or AArch64 with NEON

 /**
@@ -62,30 +62,30 @@ hmac_sha256_4way_init( hmac_sha256_4way_context *ctx, const void *_K,
 	/* If Klen > 64, the key is really SHA256(K). */
 	if ( Klen > 64 )
   {
-		sha256_4way_init( &ctx->ictx );
-		sha256_4way_update( &ctx->ictx, K, Klen );
-		sha256_4way_close( &ctx->ictx, khash );
+		sha256_4x32_init( &ctx->ictx );
+		sha256_4x32_update( &ctx->ictx, K, Klen );
+		sha256_4x32_close( &ctx->ictx, khash );
 		K = khash;
 		Klen = 32;
 	}

 	/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-   sha256_4way_init( &ctx->ictx );
+   sha256_4x32_init( &ctx->ictx );
 	memset( pad, 0x36, 64*4 );

   for ( i = 0; i < Klen; i++ )
-		casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
-                                               casti_v128u32( K, i ) );
+		casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
+                                          casti_v128u32( K, i ) );

-   sha256_4way_update( &ctx->ictx, pad, 64 );
+   sha256_4x32_update( &ctx->ictx, pad, 64 );

 	/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-	sha256_4way_init( &ctx->octx );
+	sha256_4x32_init( &ctx->octx );
 	memset( pad, 0x5c, 64*4 );
 	for ( i = 0; i < Klen/4; i++ )
-		casti_v128u32( pad, i ) = _mm_xor_si128( casti_v128u32( pad, i ),
-                                               casti_v128u32( K, i ) );
-	sha256_4way_update( &ctx->octx, pad, 64 );
+		casti_v128u32( pad, i ) = v128_xor( casti_v128u32( pad, i ),
+                                          casti_v128u32( K, i ) );
+	sha256_4x32_update( &ctx->octx, pad, 64 );
 }

 /* Add bytes to the HMAC-SHA256 operation. */
@@ -94,7 +94,7 @@ hmac_sha256_4way_update( hmac_sha256_4way_context *ctx, const void *in,
                         size_t len )
 {
 	/* Feed data to the inner SHA256 operation. */
-	sha256_4way_update( &ctx->ictx, in, len );
+	sha256_4x32_update( &ctx->ictx, in, len );
 }

 /* Finish an HMAC-SHA256 operation. */
@@ -104,13 +104,13 @@ hmac_sha256_4way_close( hmac_sha256_4way_context *ctx, void *digest )
 	unsigned char ihash[32*4] __attribute__ ((aligned (64)));

 	/* Finish the inner SHA256 operation. */
-	sha256_4way_close( &ctx->ictx, ihash );
+	sha256_4x32_close( &ctx->ictx, ihash );

 	/* Feed the inner hash to the outer SHA256 operation. */
-	sha256_4way_update( &ctx->octx, ihash, 32 );
+	sha256_4x32_update( &ctx->octx, ihash, 32 );

 	/* Finish the outer SHA256 operation. */
-	sha256_4way_close( &ctx->octx, digest );
+	sha256_4x32_close( &ctx->octx, digest );
 }

 /**
@@ -126,7 +126,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
 	hmac_sha256_4way_context PShctx, hctx;
 	uint8_t _ALIGN(128) T[32*4];
 	uint8_t _ALIGN(128) U[32*4];
-   __m128i ivec;
+   v128u32_t ivec;
   size_t i, clen;
 	uint64_t j;
 	int k;
@@ -139,7 +139,7 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,
 	for ( i = 0; i * 32 < dkLen; i++ )
   {
 		/* Generate INT(i + 1). */
-      ivec = _mm_set1_epi32( bswap_32( i+1 ) ); 
+      ivec = v128_32( bswap_32( i+1 ) ); 

 		/* Compute U_1 = PRF(P, S || INT(i)). */
 		memcpy( &hctx, &PShctx, sizeof(hmac_sha256_4way_context) );
@@ -158,8 +158,8 @@ pbkdf2_sha256_4way( uint8_t *buf, size_t dkLen,

 			/* ... xor U_j ... */
 			for ( k = 0; k < 8; k++ )
-				casti_v128u32( T, k ) = _mm_xor_si128( casti_v128u32( T, k ),
-                                                   casti_v128u32( U, k ) );
+				casti_v128u32( T, k ) = v128_xor( casti_v128u32( T, k ),
+                                              casti_v128u32( U, k ) );
 		}

 		/* Copy as many bytes as necessary into buf. */
@@ -199,30 +199,30 @@ hmac_sha256_8way_init( hmac_sha256_8way_context *ctx, const void *_K,
   /* If Klen > 64, the key is really SHA256(K). */
   if ( Klen > 64 )
   {
-      sha256_8way_init( &ctx->ictx );
-      sha256_8way_update( &ctx->ictx, K, Klen );
-      sha256_8way_close( &ctx->ictx, khash );
+      sha256_8x32_init( &ctx->ictx );
+      sha256_8x32_update( &ctx->ictx, K, Klen );
+      sha256_8x32_close( &ctx->ictx, khash );
      K = khash;
      Klen = 32;
   }

   /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-   sha256_8way_init( &ctx->ictx );
+   sha256_8x32_init( &ctx->ictx );
   memset( pad, 0x36, 64*8);

   for ( i = 0; i < Klen/4; i++ )
      casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
                                                casti_m256i( K, i ) );

-   sha256_8way_update( &ctx->ictx, pad, 64 );
+   sha256_8x32_update( &ctx->ictx, pad, 64 );

   /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
-   sha256_8way_init( &ctx->octx );
+   sha256_8x32_init( &ctx->octx );
   memset( pad, 0x5c, 64*8 );
   for ( i = 0; i < Klen/4; i++ )
      casti_m256i( pad, i ) = _mm256_xor_si256( casti_m256i( pad, i ),
                                                casti_m256i( K, i ) );
-   sha256_8way_update( &ctx->octx, pad, 64 );
+   sha256_8x32_update( &ctx->octx, pad, 64 );
 }

 void
@@ -230,7 +230,7 @@ hmac_sha256_8way_update( hmac_sha256_8way_context *ctx, const void *in,
                         size_t len )
 {
   /* Feed data to the inner SHA256 operation. */
-   sha256_8way_update( &ctx->ictx, in, len );
+   sha256_8x32_update( &ctx->ictx, in, len );
 }

 /* Finish an HMAC-SHA256 operation. */
@@ -240,13 +240,13 @@ hmac_sha256_8way_close( hmac_sha256_8way_context *ctx, void *digest )
   unsigned char ihash[32*8] __attribute__ ((aligned (128)));

   /* Finish the inner SHA256 operation. */
-   sha256_8way_close( &ctx->ictx, ihash );
+   sha256_8x32_close( &ctx->ictx, ihash );

   /* Feed the inner hash to the outer SHA256 operation. */
-   sha256_8way_update( &ctx->octx, ihash, 32 );
+   sha256_8x32_update( &ctx->octx, ihash, 32 );

   /* Finish the outer SHA256 operation. */
-   sha256_8way_close( &ctx->octx, digest );
+   sha256_8x32_close( &ctx->octx, digest );
 }

 /**
@@ -332,21 +332,21 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
   /* If Klen > 64, the key is really SHA256(K). */
   if ( Klen > 64 )
   {
-      sha256_16way_init( &ctx->ictx );
-      sha256_16way_update( &ctx->ictx, K, Klen );
-      sha256_16way_close( &ctx->ictx, khash );
+      sha256_16x32_init( &ctx->ictx );
+      sha256_16x32_update( &ctx->ictx, K, Klen );
+      sha256_16x32_close( &ctx->ictx, khash );
      K = khash;
      Klen = 32;
   }

   /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
-   sha256_16way_init( &ctx->ictx );
+   sha256_16x32_init( &ctx->ictx );
   memset( pad, 0x36, 64*16 );

   for ( i = 0; i < Klen; i++ )
      casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
                                                casti_m512i( K, i ) );
-   sha256_16way_update( &ctx->ictx, pad, 64 );
+   sha256_16x32_update( &ctx->ictx, pad, 64 );

   /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
   sha256_16way_init( &ctx->octx );
@@ -354,7 +354,7 @@ hmac_sha256_16way_init( hmac_sha256_16way_context *ctx, const void *_K,
   for ( i = 0; i < Klen/4; i++ )
      casti_m512i( pad, i ) = _mm512_xor_si512( casti_m512i( pad, i ),
                                             casti_m512i( K, i ) );
-   sha256_16way_update( &ctx->octx, pad, 64 );
+   sha256_16x32_update( &ctx->octx, pad, 64 );
 }
   
 void
@@ -362,7 +362,7 @@ hmac_sha256_16way_update( hmac_sha256_16way_context *ctx, const void *in,
                         size_t len )
 {
   /* Feed data to the inner SHA256 operation. */
-   sha256_16way_update( &ctx->ictx, in, len );
+   sha256_16x32_update( &ctx->ictx, in, len );
 }

 /* Finish an HMAC-SHA256 operation. */
@@ -372,13 +372,13 @@ hmac_sha256_16way_close( hmac_sha256_16way_context *ctx, void *digest )
   unsigned char ihash[32*16] __attribute__ ((aligned (128)));

   /* Finish the inner SHA256 operation. */
-   sha256_16way_close( &ctx->ictx, ihash );
+   sha256_16x32_close( &ctx->ictx, ihash );

   /* Feed the inner hash to the outer SHA256 operation. */
-   sha256_16way_update( &ctx->octx, ihash, 32 );
+   sha256_16x32_update( &ctx->octx, ihash, 32 );

   /* Finish the outer SHA256 operation. */
-   sha256_16way_close( &ctx->octx, digest );
+   sha256_16x32_close( &ctx->octx, digest );
 }

 /**
--- a/algo/sha/hmac-sha256-hash-4way.h
+++ b/algo/sha/hmac-sha256-hash-4way.h
@@ -1,6 +1,6 @@
 /*-
 * Copyright 2005,2007,2009 Colin Percival
- * Copyright 2020 JayDDee@gmailcom
+ * Copyright 2020 JayDDee246@gmailcom
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
@@ -38,11 +38,12 @@
 #include "simd-utils.h"
 #include "sha256-hash.h"

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)
+
 typedef struct _hmac_sha256_4way_context
 {
-   sha256_4way_context ictx;
-   sha256_4way_context octx;
+   sha256_4x32_context ictx;
+   sha256_4x32_context octx;
 } hmac_sha256_4way_context;

 //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -67,8 +68,8 @@ void pbkdf2_sha256_4way( uint8_t *, size_t, const uint8_t *, size_t,

 typedef struct _hmac_sha256_8way_context
 {
-   sha256_8way_context ictx;
-   sha256_8way_context octx;
+   sha256_8x32_context ictx;
+   sha256_8x32_context octx;
 } hmac_sha256_8way_context;

 //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
@@ -88,8 +89,8 @@ void pbkdf2_sha256_8way( uint8_t *, size_t, const uint8_t *, size_t,

 typedef struct _hmac_sha256_16way_context
 {
-   sha256_16way_context ictx;
-   sha256_16way_context octx;
+   sha256_16x32_context ictx;
+   sha256_16x32_context octx;
 } hmac_sha256_16way_context;

 //void SHA256_Buf( const void *, size_t len, uint8_t digest[32] );
--- a/algo/sha/sha256-hash-4way.c
+++ b/algo/sha/sha256-hash-4way.c
@@ -30,6 +30,7 @@ static const uint32_t K256[64] =
   0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
 };

+#if defined(__SSE2__) || defined(__ARM_NEON)
-// SHA-256 4 way SSE2
+// SHA-256 4 way, x86_64 with SSE2 or AArch64 with NEON

 #define CHs(X, Y, Z) \
@@ -309,142 +310,6 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
   v128_store( state_out + 7,  H );
 }

-
-# if 0
-
-// Working correctly but still slower
-int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
-                            const v128_t *state_in, const uint32_t *target )
-{
-   v128_t A, B, C, D, E, F, G, H, T0, T1, T2;
-   v128_t vmask, targ, hash;
-   int t6_mask, flip;
-   v128_t W[16];      v128_memcpy( W, data, 16 );
-
-   A = v128_load( state_in   );
-   B = v128_load( state_in+1 );
-   C = v128_load( state_in+2 );
-   D = v128_load( state_in+3 );
-   E = v128_load( state_in+4 );
-   F = v128_load( state_in+5 );
-   G = v128_load( state_in+6 );
-   H = v128_load( state_in+7 );
-
-   const v128_t IV7 = H;
-   const v128_t IV6 = G;
-
-   SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 0 );
-   SHA256_4X32_MSG_EXPANSION( W );
-   SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 16 );
-   SHA256_4X32_MSG_EXPANSION( W );
-   SHA256_4X32_16ROUNDS( A, B, C, D, E, F, G, H, 32 );
-
-   W[ 0] = SHA256_4X32_MEXP( W[14], W[ 9], W[ 1], W[ 0] );
-   W[ 1] = SHA256_4X32_MEXP( W[15], W[10], W[ 2], W[ 1] );
-   W[ 2] = SHA256_4X32_MEXP( W[ 0], W[11], W[ 3], W[ 2] );
-   W[ 3] = SHA256_4X32_MEXP( W[ 1], W[12], W[ 4], W[ 3] );
-   W[ 4] = SHA256_4X32_MEXP( W[ 2], W[13], W[ 5], W[ 4] );
-   W[ 5] = SHA256_4X32_MEXP( W[ 3], W[14], W[ 6], W[ 5] );
-   W[ 6] = SHA256_4X32_MEXP( W[ 4], W[15], W[ 7], W[ 6] );
-   W[ 7] = SHA256_4X32_MEXP( W[ 5], W[ 0], W[ 8], W[ 7] );
-   W[ 8] = SHA256_4X32_MEXP( W[ 6], W[ 1], W[ 9], W[ 8] );
-   W[ 9] = SHA256_4X32_MEXP( W[ 7], W[ 2], W[10], W[ 9] );
-   W[10] = SHA256_4X32_MEXP( W[ 8], W[ 3], W[11], W[10] );
-   W[11] = SHA256_4X32_MEXP( W[ 9], W[ 4], W[12], W[11] );
-   W[12] = SHA256_4X32_MEXP( W[10], W[ 5], W[13], W[12] );
-
-   v128_t X_xor_Y, Y_xor_Z = v128_xor( B, C );
-   
-   SHA256_4X32_ROUND( A, B, C, D, E, F, G, H,  0, 48 );
-   SHA256_4X32_ROUND( H, A, B, C, D, E, F, G,  1, 48 );
-   SHA256_4X32_ROUND( G, H, A, B, C, D, E, F,  2, 48 );
-   SHA256_4X32_ROUND( F, G, H, A, B, C, D, E,  3, 48 );
-   SHA256_4X32_ROUND( E, F, G, H, A, B, C, D,  4, 48 );
-   SHA256_4X32_ROUND( D, E, F, G, H, A, B, C,  5, 48 );
-   SHA256_4X32_ROUND( C, D, E, F, G, H, A, B,  6, 48 );
-   SHA256_4X32_ROUND( B, C, D, E, F, G, H, A,  7, 48 );
-   SHA256_4X32_ROUND( A, B, C, D, E, F, G, H,  8, 48 );
-   SHA256_4X32_ROUND( H, A, B, C, D, E, F, G,  9, 48 );
-
-   T0 = v128_add32( v128_32( K256[58] ),
-                   v128_add4_32( BSG2_1( C ), CHs( C, D, E ), W[10], F ) );
-   B = v128_add32( B, T0 );
-
-   T1 = v128_add32( v128_32( K256[59] ),
-                    v128_add4_32( BSG2_1( B ), CHs( B, C, D ), W[11], E ) );
-   A = v128_add32( A, T1 );
-
-   T2 = v128_add32( v128_32( K256[60] ),
-                    v128_add4_32( BSG2_1( A ), CHs( A, B, C ), W[12], D ) );
-   H = v128_add32( H, T2 );
-
-   targ = v128_32( target[7] );
-   hash = v128_bswap32( v128_add32( H, IV7 ) );
-
-   flip = ( (int)target[7] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
-
-   if ( likely(
-             0xf == ( flip ^ v128_movmask32( v128_cmpgt32( hash, targ ) ) ) ))
-   return 0;
-
-   t6_mask = v128_movmask32( vmask = v128_cmpeq32( hash, targ ) );
-
-   // round 58 part 2
-   F = v128_add32( T0, v128_add32( BSG2_0( G ), MAJs( G, H, A ) ) );
-
-   // round 61  part 1
-   W[13] = SHA256_4X32_MEXP( W[11], W[ 6], W[14], W[13] );
-   T0 = v128_add32( v128_32( K256[61] ),
-                    v128_add4_32( BSG2_1( H ), CHs( H, A, B ), W[13], C ) );
-   G = v128_add32( G, T0 );
-
-   if ( t6_mask )
-   {
-      targ = v128_and( vmask, v128_32( target[6] ) );
-      hash = v128_bswap32( v128_add32( G, IV6 ) );
-
-      if ( ( 0 != ( t6_mask & v128_movmask32( v128_cmpeq32( hash, targ ) ) ) ))
-         return 0;
-      else
-      {
-         flip = ( (int)target[6] < 0 ? 0xf : 0 ) ^ v128_movmask32( hash );
-         if ( 0 != ( t6_mask & ( flip ^ v128_movmask32(
-                                             v128_cmpgt32( hash, targ ) ) ) ) )
-            return 0;
-          else if ( target[6] == 0x80000000 )
-          {
-             if ( 0 == ( t6_mask & v128_movmask32(
-                            v128_cmpgt32( hash, v128_xor( hash, hash ) ) ) ) )
-                return 0;
-          }
-       }
-   }
-   
-   // rounds 59 to 61 part 2
-   E = v128_add32( T1, v128_add32( BSG2_0( F ), MAJs( F, G, H ) ) );
-   D = v128_add32( T2, v128_add32( BSG2_0( E ), MAJs( E, F, G ) ) );
-   C = v128_add32( T0, v128_add32( BSG2_0( D ), MAJs( D, E, F ) ) );
-
-   // rounds 62 & 63
-   W[14] = SHA256_4X32_MEXP( W[12], W[ 7], W[15], W[14] );
-   W[15] = SHA256_4X32_MEXP( W[13], W[ 8], W[ 0], W[15] );
-
-   SHA256_4X32_ROUND( C, D, E, F, G, H, A, B, 14, 48 );
-   SHA256_4X32_ROUND( B, C, D, E, F, G, H, A, 15, 48 );
-
-   state_out[0] = v128_add32( state_in[0], A );
-   state_out[1] = v128_add32( state_in[1], B );
-   state_out[2] = v128_add32( state_in[2], C );
-   state_out[3] = v128_add32( state_in[3], D );
-   state_out[4] = v128_add32( state_in[4], E );
-   state_out[5] = v128_add32( state_in[5], F );
-   state_out[6] = v128_add32( state_in[6], G );
-   state_out[7] = v128_add32( state_in[7], H );
-return 1;
-}
-
-#endif
-
 void sha256_4x32_init( sha256_4x32_context *sc )
 {
   sc->count_high = sc->count_low = 0;
@@ -529,29 +394,31 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
   sha256_4x32_close( &ctx, dst );
 }

+#endif  // SSE2 || NEON
+
 #if defined(__AVX2__)

 // SHA-256 8 way

 #define BSG2_0x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x,  2 ), \
-                                       mm256_ror_32( x, 13 ) ), \
-                                       mm256_ror_32( x, 22 ) )
+   mm256_xor3( mm256_ror_32( x,  2 ), \
+               mm256_ror_32( x, 13 ), \
+               mm256_ror_32( x, 22 ) )

 #define BSG2_1x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x,  6 ), \
-                                       mm256_ror_32( x, 11 ) ), \
-                                       mm256_ror_32( x, 25 ) )
+   mm256_xor3( mm256_ror_32( x,  6 ), \
+               mm256_ror_32( x, 11 ), \
+               mm256_ror_32( x, 25 ) )

 #define SSG2_0x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x,  7 ), \
-                                       mm256_ror_32( x, 18 ) ), \
-                                       _mm256_srli_epi32( x, 3 ) ) 
+   mm256_xor3( mm256_ror_32( x,  7 ), \
+               mm256_ror_32( x, 18 ), \
+               _mm256_srli_epi32( x, 3 ) ) 

 #define SSG2_1x(x) \
-   _mm256_xor_si256( _mm256_xor_si256( mm256_ror_32( x, 17 ), \
-                                       mm256_ror_32( x, 19 ) ), \
-                                       _mm256_srli_epi32( x, 10 ) )
+   mm256_xor3( mm256_ror_32( x, 17 ), \
+               mm256_ror_32( x, 19 ), \
+               _mm256_srli_epi32( x, 10 ) )

 #define SHA256_8WAY_MEXP( a, b, c, d ) \
     mm256_add4_32( SSG2_1x( a ), b, SSG2_0x( c ), d );
@@ -574,13 +441,8 @@ void sha256_4x32_full( void *dst, const void *data, size_t len )
      W[14] = SHA256_8WAY_MEXP( W[12], W[ 7], W[15], W[14] ); \
      W[15] = SHA256_8WAY_MEXP( W[13], W[ 8], W[ 0], W[15] ); 

-
-// With AVX512VL ternary logic optimizations are available.
-// If not optimize by forwarding the result of X^Y in MAJ to the next round
-// to avoid recalculating it as Y^Z. This optimization is not applicable
-// when MAJ is optimized with ternary logic.
-
 #if defined(VL256)
+// AVX512 or AVX10-256

 #define CHx(X, Y, Z)    _mm256_ternarylogic_epi32( X, Y, Z, 0xca )

@@ -745,7 +607,7 @@ static inline void SHA256_8WAY_TRANSFORM( __m256i *out, __m256i *W,
 }

 // accepts LE input data
-void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
+void sha256_8x32_transform_le( __m256i *state_out, const __m256i *data,
                               const __m256i *state_in )
 {
   __m256i W[16];
@@ -754,7 +616,7 @@ void sha256_8way_transform_le( __m256i *state_out, const __m256i *data,
 }

 // Accepts BE input data, need to bswap
-void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
+void sha256_8x32_transform_be( __m256i *state_out, const __m256i *data,
                               const __m256i *state_in )
 {
   __m256i W[16];
@@ -764,7 +626,7 @@ void sha256_8way_transform_be( __m256i *state_out, const __m256i *data,
 }

 // Aggressive prehashing, LE byte order
-void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
+void sha256_8x32_prehash_3rounds( __m256i *state_mid, __m256i *X,
                                  const __m256i *W, const __m256i *state_in )
 {
   __m256i A, B, C, D, E, F, G, H, T1;
@@ -813,7 +675,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
   _mm256_store_si256( state_mid + 7, H );
 }

-void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
+void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
          const __m256i *state_in, const __m256i *state_mid, const __m256i *X )
 {
   __m256i A, B, C, D, E, F, G, H;
@@ -914,14 +776,12 @@ void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
   _mm256_store_si256( state_out + 7,  H );
 }

-int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
+int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
                           const __m256i *state_in, const uint32_t *target )
 {
   __m256i A, B, C, D, E, F, G, H, T0, T1, T2;
   __m256i vmask, targ, hash;
   __m256i W[16];  memcpy_256( W, data, 16 );
-   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
-                              0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
   uint8_t flip, t6_mask;

   A = _mm256_load_si256( state_in   );
@@ -1012,7 +872,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,

   // Got H, test it.
   targ = v256_32( target[7] );
-   hash = _mm256_shuffle_epi8( _mm256_add_epi32( H, IV7 ), bswap_shuf );
+   hash = mm256_bswap_32( _mm256_add_epi32( H, IV7 ) );
   if ( target[7] )
   {
      flip = ( (int)target[7] < 0 ? -1 : 0 ) ^ mm256_movmask_32( hash );
@@ -1035,7 +895,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
   { 
      // Testing H was inconclusive: hash7 == target7, need to test G
      targ = _mm256_and_si256( vmask, v256_32( target[6] ) );
-      hash = _mm256_shuffle_epi8( _mm256_add_epi32( G, IV6 ), bswap_shuf );
+      hash = mm256_bswap_32( _mm256_add_epi32( G, IV6 ) );

      if ( likely( 0 == ( t6_mask & mm256_movmask_32(
                                      _mm256_cmpeq_epi32( hash, targ ) ) ) ))
@@ -1083,8 +943,7 @@ int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
   return 1;
 }

-
-void sha256_8way_init( sha256_8way_context *sc )
+void sha256_8x32_init( sha256_8x32_context *sc )
 {
   sc->count_high = sc->count_low = 0;
   sc->val[0] = v256_32( sha256_iv[0] );
@@ -1100,7 +959,7 @@ void sha256_8way_init( sha256_8way_context *sc )
 // need to handle odd byte length for yespower.
 // Assume only last update is odd.

-void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
+void sha256_8x32_update( sha256_8x32_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
@@ -1121,7 +980,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
      len -= clen;
      if ( ptr == buf_size )
      {
-         sha256_8way_transform_be( sc->val, sc->buf, sc->val );
+         sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
         ptr = 0;
      }
      clow = sc->count_low;
@@ -1132,7 +991,7 @@ void sha256_8way_update( sha256_8way_context *sc, const void *data, size_t len )
   }
 }

-void sha256_8way_close( sha256_8way_context *sc, void *dst )
+void sha256_8x32_close( sha256_8x32_context *sc, void *dst )
 {
    unsigned ptr;
    uint32_t low, high;
@@ -1146,7 +1005,7 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
    if ( ptr > pad )
    {
         memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
-         sha256_8way_transform_be( sc->val, sc->buf, sc->val );
+         sha256_8x32_transform_be( sc->val, sc->buf, sc->val );
         memset_zero_256( sc->buf, pad >> 2 );
    }
    else
@@ -1159,17 +1018,17 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
    sc->buf[   pad     >> 2 ] = v256_32( bswap_32( high ) );
    sc->buf[ ( pad+4 ) >> 2 ] = v256_32( bswap_32( low ) );

-    sha256_8way_transform_be( sc->val, sc->buf, sc->val );
+    sha256_8x32_transform_be( sc->val, sc->buf, sc->val );

    mm256_block_bswap_32( dst, sc->val );
 }

-void sha256_8way_full( void *dst, const void *data, size_t len )
+void sha256_8x32_full( void *dst, const void *data, size_t len )
 {
-   sha256_8way_context ctx;
-   sha256_8way_init( &ctx );
-   sha256_8way_update( &ctx, data, len );
-   sha256_8way_close( &ctx, dst );
+   sha256_8x32_context ctx;
+   sha256_8x32_init( &ctx );
+   sha256_8x32_update( &ctx, data, len );
+   sha256_8x32_close( &ctx, dst );
 }

 #if defined(SIMD512)
@@ -1302,7 +1161,7 @@ static inline void SHA256_16WAY_TRANSFORM( __m512i *out, __m512i *W,
 }

 // accepts LE input data
-void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
+void sha256_16x32_transform_le( __m512i *state_out, const __m512i *data,
                                const __m512i *state_in )
 {
   __m512i W[16];
@@ -1311,7 +1170,7 @@ void sha256_16way_transform_le( __m512i *state_out, const __m512i *data,
 }

 // Accepts BE input data, need to bswap
-void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
+void sha256_16x32_transform_be( __m512i *state_out, const __m512i *data,
                                const __m512i *state_in )
 {
   __m512i W[16];
@@ -1321,7 +1180,7 @@ void sha256_16way_transform_be( __m512i *state_out, const __m512i *data,
 }
 
 // Aggressive prehashing, LE byte order
-void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X, 
+void sha256_16x32_prehash_3rounds( __m512i *state_mid, __m512i *X, 
                                   const __m512i *W, const __m512i *state_in )
 {
   __m512i A, B, C, D, E, F, G, H, T1;
@@ -1369,7 +1228,7 @@ void sha256_16way_prehash_3rounds( __m512i *state_mid, __m512i *X,
   _mm512_store_si512( state_mid + 7, H );
 }   

-void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
+void sha256_16x32_final_rounds( __m512i *state_out, const __m512i *data,
          const __m512i *state_in, const __m512i *state_mid, const __m512i *X )
 {
   __m512i A, B, C, D, E, F, G, H;
@@ -1470,15 +1329,13 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,

 // returns 0 if hash aborted early and invalid,
 // returns 1 for completed hash with at least one valid candidate.
-int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
+int sha256_16x32_transform_le_short( __m512i *state_out, const __m512i *data,
                              const __m512i *state_in, const uint32_t *target )
 {
   __m512i A, B, C, D, E, F, G, H, hash, targ;
   __m512i T0, T1, T2;
   __m512i W[16];      memcpy_512( W, data, 16 );
   __mmask16 t6_mask;
-   const __m512i bswap_shuf = mm512_bcast_m128( _mm_set_epi64x(
-                              0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); 

   A = _mm512_load_si512( state_in   );
   B = _mm512_load_si512( state_in+1 );
@@ -1588,7 +1445,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
   H = _mm512_add_epi32( H, T2 );

   // got H, test it against target[7]
-   hash = _mm512_shuffle_epi8( _mm512_add_epi32( H , IV7 ), bswap_shuf );
+   hash = mm512_bswap_32( _mm512_add_epi32( H , IV7 ) );
   targ = v512_32( target[7] );
   if ( target[7] )
   if ( likely( 0 == _mm512_cmple_epu32_mask( hash, targ ) ))
@@ -1608,7 +1465,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
   // got G, test it against target[6] if indicated
   if ( (uint16_t)t6_mask )
   {
-      hash = _mm512_shuffle_epi8( _mm512_add_epi32( G, IV6 ), bswap_shuf );
+      hash = mm512_bswap_32( _mm512_add_epi32( G, IV6 ) );
      targ = v512_32( target[6] );
      if ( likely( 0 == _mm512_mask_cmple_epu32_mask( t6_mask, hash, targ ) ))
          return 0;
@@ -1644,7 +1501,7 @@ int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
   return 1;
 }

-void sha256_16way_init( sha256_16way_context *sc )
+void sha256_16x32_init( sha256_16x32_context *sc )
 {
   sc->count_high = sc->count_low = 0;
   sc->val[0] = v512_32( sha256_iv[0] );
@@ -1657,7 +1514,7 @@ void sha256_16way_init( sha256_16way_context *sc )
   sc->val[7] = v512_32( sha256_iv[7] );
 }

-void sha256_16way_update( sha256_16way_context *sc, const void *data,
+void sha256_16x32_update( sha256_16x32_context *sc, const void *data,
                           size_t len )
 {
   __m512i *vdata = (__m512i*)data;
@@ -1679,7 +1536,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
      len -= clen;
      if ( ptr == buf_size )
      {
-         sha256_16way_transform_be( sc->val, sc->buf, sc->val );
+         sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
         ptr = 0;
      }
      clow = sc->count_low;
@@ -1690,7 +1547,7 @@ void sha256_16way_update( sha256_16way_context *sc, const void *data,
   }
 }

-void sha256_16way_close( sha256_16way_context *sc, void *dst )
+void sha256_16x32_close( sha256_16x32_context *sc, void *dst )
 {
    unsigned ptr;
    uint32_t low, high;
@@ -1704,7 +1561,7 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
    if ( ptr > pad )
    {
         memset_zero_512( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
-         sha256_16way_transform_be( sc->val, sc->buf, sc->val );
+         sha256_16x32_transform_be( sc->val, sc->buf, sc->val );
         memset_zero_512( sc->buf, pad >> 2 );
    }
    else
@@ -1717,17 +1574,17 @@ void sha256_16way_close( sha256_16way_context *sc, void *dst )
    sc->buf[   pad     >> 2 ] = v512_32( bswap_32( high ) );
    sc->buf[ ( pad+4 ) >> 2 ] = v512_32( bswap_32( low ) );

-    sha256_16way_transform_be( sc->val, sc->buf, sc->val );
+    sha256_16x32_transform_be( sc->val, sc->buf, sc->val );

    mm512_block_bswap_32( dst, sc->val );
 }

-void sha256_16way_full( void *dst, const void *data, size_t len )
+void sha256_16x32_full( void *dst, const void *data, size_t len )
 {
-   sha256_16way_context ctx;
-   sha256_16way_init( &ctx );
-   sha256_16way_update( &ctx, data, len );
-   sha256_16way_close( &ctx, dst );
+   sha256_16x32_context ctx;
+   sha256_16x32_init( &ctx );
+   sha256_16x32_update( &ctx, data, len );
+   sha256_16x32_close( &ctx, dst );
 }

 #undef CH
--- a/algo/sha/sha256-hash.h
+++ b/algo/sha/sha256-hash.h
@@ -180,20 +180,9 @@ void sha256_8x32_final_rounds( __m256i *state_out, const __m256i *data,
 int sha256_8x32_transform_le_short( __m256i *state_out, const __m256i *data,
                             const __m256i *state_in, const uint32_t *target );

-// Temporary API during naming transition
-#define sha256_8way_context               sha256_8x32_context
-#define sha256_8way_init                  sha256_8x32_init
-#define sha256_8way_update                sha256_8x32_update
-#define sha256_8way_close                 sha256_8x32_close
-#define sha256_8way_full                  sha256_8x32_full
-#define sha256_8way_transform_le          sha256_8x32_transform_le
-#define sha256_8way_transform_be          sha256_8x32_transform_be
-#define sha256_8way_prehash_3rounds       sha256_8x32_prehash_3rounds
-#define sha256_8way_final_rounds          sha256_8x32_final_rounds
-#define sha256_8way_transform_le_short    sha256_8x32_transform_le_short
-
 #endif  // AVX2

+#if defined(__SSE2__) || defined(__ARM_NEON)
 // SHA-256 4 way x86_64 with SSE2 or AArch64 with NEON

 typedef struct
@@ -219,16 +208,5 @@ void sha256_4x32_final_rounds( v128_t *state_out, const v128_t *data,
 int sha256_4x32_transform_le_short( v128_t *state_out, const v128_t *data,
                            const v128_t *state_in, const uint32_t *target );

-// Temporary API during naming transition
-#define sha256_4way_context              sha256_4x32_context
-#define sha256_4way_init                 sha256_4x32_init
-#define sha256_4way_update               sha256_4x32_update
-#define sha256_4way_close                sha256_4x32_close
-#define sha256_4way_full                 sha256_4x32_full
-#define sha256_4way_transform_le         sha256_4x32_transform_le
-#define sha256_4way_transform_be         sha256_4x32_transform_be
-#define sha256_4way_prehash_3rounds      sha256_4x32_prehash_3rounds
-#define sha256_4way_final_rounds         sha256_4x32_final_rounds
-#define sha256_4way_transform_le_short   sha256_4x32_transform_le_short
-
-#endif
+#endif // SSE2 || NEON
+#endif // SHA256_HASH_H__
--- a/algo/sha/sha256d-4way.c
+++ b/algo/sha/sha256d-4way.c
@@ -32,8 +32,6 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
-   const v128_t shuf_bswap32 =
-           v128_set64( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );

   // hash first 64 byte block of data
   sha256_transform_le( mstatea, pdata, sha256_iv );
@@ -69,10 +67,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,

      if ( unlikely( bswap_32( hasha[7] ) <= ptarget[7] ) )
      {
-          casti_v128( hasha, 0 ) =
-               _mm_shuffle_epi8( casti_v128( hasha, 0 ), shuf_bswap32 );
-          casti_v128( hasha, 1 ) =
-               _mm_shuffle_epi8( casti_v128( hasha, 1 ), shuf_bswap32 );
+          casti_v128( hasha, 0 ) = v128_bswap32( casti_v128( hasha, 0 ) );
+          casti_v128( hasha, 1 ) = v128_bswap32( casti_v128( hasha, 1 ) );
          if ( likely( valid_hash( hasha, ptarget ) && !bench ) )
          {
             pdata[19] = n;
@@ -81,10 +77,8 @@ int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
      }
      if ( unlikely( bswap_32( hashb[7] ) <= ptarget[7] ) )
      {
-         casti_v128( hashb, 0 ) =
-               _mm_shuffle_epi8( casti_v128( hashb, 0 ), shuf_bswap32 );
-         casti_v128( hashb, 1 ) =
-               _mm_shuffle_epi8( casti_v128( hashb, 1 ), shuf_bswap32 );
+         casti_v128( hashb, 0 ) = v128_bswap32( casti_v128( hashb, 0 ) );
+         casti_v128( hashb, 1 ) = v128_bswap32( casti_v128( hashb, 1 ) );
         if ( likely( valid_hash( hashb, ptarget ) && !bench ) )
         {
            pdata[19] = n+1;
@@ -204,8 +198,6 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
   const int thr_id = mythr->id;
   const __m512i sixteen = v512_32( 16 );
   const bool bench = opt_benchmark;
-   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
-                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   // prehash first block directly from pdata
   sha256_transform_le( phash, pdata, sha256_iv );
@@ -231,7 +223,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
   buf[15] = v512_32( 80*8 );  // bit count

   // partially pre-expand & prehash second message block, avoiding the nonces
-   sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
+   sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );

   // vectorize IV for second hash
   istate[0] = v512_32( sha256_iv[0] );
@@ -250,15 +242,14 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,

   do
   {
-      sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
-      if ( unlikely( sha256_16way_transform_le_short(
+      sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
+      if ( unlikely( sha256_16x32_transform_le_short(
                                  hash32, block, istate, ptarget ) ) )
      {
         for ( int lane = 0; lane < 16; lane++ )
         {
            extr_lane_16x32( phash, hash32, lane, 256 );
-            casti_m256i( phash, 0 ) =
-                   _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); 
+            casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) ); 
            if ( likely( valid_hash( phash, ptarget ) && !bench ) )
            {
              pdata[19] = n + lane;
@@ -299,8 +290,6 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i last_byte = v256_32( 0x80000000 );
   const __m256i eight = v256_32( 8 );
-   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
-                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = v256_32( pdata[i] );
@@ -325,22 +314,22 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
   istate[6] = v256_32( sha256_iv[6] );
   istate[7] = v256_32( sha256_iv[7] );

-   sha256_8way_transform_le( mstate1, vdata, istate );
+   sha256_8x32_transform_le( mstate1, vdata, istate );

   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
+   sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
   
   do
   {
-      sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
-      if ( unlikely( sha256_8way_transform_le_short( hash32, block,
+      sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2, mexp_pre );
+      if ( unlikely( sha256_8x32_transform_le_short( hash32, block,
                                                     istate, ptarget ) ) )
      {
         for ( int lane = 0; lane < 8; lane++ )
         {
            extr_lane_8x32( lane_hash, hash32, lane, 256 );
            casti_m256i( lane_hash, 0 ) =
-               _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
+                                mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
            {
               pdata[19] = n + lane;
--- a/algo/sha/sha256d.h
+++ b/algo/sha/sha256d.h
@@ -12,7 +12,7 @@
  #define SHA256D_NEON_SHA2 1
 #elif defined(__AVX2__)
  #define SHA256D_8WAY 1
-#else
+#elif defined(__SSE2__) || defined(__ARM_NEON) 
  #define SHA256D_4WAY 1
 #endif

--- a/algo/sha/sha256dt.c
+++ b/algo/sha/sha256dt.c
@@ -17,7 +17,6 @@
 #elif defined (__SSE2__) || defined(__ARM_NEON) 
  #define SHA256DT_4X32 1
 #endif
-// else ref, should never happen

 static const uint32_t sha256dt_iv[8]  __attribute__ ((aligned (32))) =
 {
@@ -205,8 +204,6 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
   const int thr_id = mythr->id;
   const __m512i sixteen = v512_32( 16 );
   const bool bench = opt_benchmark;
-   const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
-                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   // prehash first block directly from pdata
   sha256_transform_le( phash, pdata, sha256dt_iv );
@@ -258,8 +255,7 @@ int scanhash_sha256dt_16x32( struct work *work, const uint32_t max_nonce,
         for ( int lane = 0; lane < 16; lane++ )
         {
            extr_lane_16x32( phash, hash32, lane, 256 );
-            casti_m256i( phash, 0 ) =
-                   _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); 
+            casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) ); 
            if ( likely( valid_hash( phash, ptarget ) && !bench ) )
            {
              pdata[19] = n + lane;
@@ -298,8 +294,6 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i last_byte = v256_32( 0x80000000 );
   const __m256i eight = v256_32( 8 );
-   const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
-                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = v256_32( pdata[i] );
@@ -339,7 +333,7 @@ int scanhash_sha256dt_8x32( struct work *work, const uint32_t max_nonce,
         {
            extr_lane_8x32( lane_hash, hash32, lane, 256 );
            casti_m256i( lane_hash, 0 ) =
-               _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
+                                mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
            {
               pdata[19] = n + lane;
@@ -406,7 +400,6 @@ int scanhash_sha256dt_4x32( struct work *work, const uint32_t max_nonce,
   do
   {
      sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
-//      sha256_4x32_transform_le( block, vdata+16, mhash1 );
      sha256_4x32_transform_le( hash32, block, iv );

      for ( int lane = 0; lane < 4; lane++ )
--- a/algo/sha/sha256q-4way.c
+++ b/algo/sha/sha256q-4way.c
@@ -7,28 +7,28 @@

 #if defined(SHA256T_16WAY)

-static __thread sha256_16way_context sha256_ctx16 __attribute__ ((aligned (64)));
+static __thread sha256_16x32_context sha256_ctx16 __attribute__ ((aligned (64)));

 void sha256q_16way_hash( void* output, const void* input )
 {
   uint32_t vhash[8*16] __attribute__ ((aligned (64)));
-   sha256_16way_context ctx;
+   sha256_16x32_context ctx;
   memcpy( &ctx, &sha256_ctx16, sizeof ctx );

-   sha256_16way_update( &ctx, input + (64<<4), 16 );
-   sha256_16way_close( &ctx, vhash );
+   sha256_16x32_update( &ctx, input + (64<<4), 16 );
+   sha256_16x32_close( &ctx, vhash );

-   sha256_16way_init( &ctx );
-   sha256_16way_update( &ctx, vhash, 32 );
-   sha256_16way_close( &ctx, vhash );
+   sha256_16x32_init( &ctx );
+   sha256_16x32_update( &ctx, vhash, 32 );
+   sha256_16x32_close( &ctx, vhash );

-   sha256_16way_init( &ctx );
-   sha256_16way_update( &ctx, vhash, 32 );
-   sha256_16way_close( &ctx, vhash );
+   sha256_16x32_init( &ctx );
+   sha256_16x32_update( &ctx, vhash, 32 );
+   sha256_16x32_close( &ctx, vhash );

-   sha256_16way_init( &ctx );
-   sha256_16way_update( &ctx, vhash, 32 );
-   sha256_16way_close( &ctx, output );
+   sha256_16x32_init( &ctx );
+   sha256_16x32_update( &ctx, vhash, 32 );
+   sha256_16x32_close( &ctx, output );
 }

 int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
@@ -51,8 +51,8 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,
   mm512_bswap32_intrlv80_16x32( vdata, pdata );
   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
-   sha256_16way_init( &sha256_ctx16 );
-   sha256_16way_update( &sha256_ctx16, vdata, 64 );
+   sha256_16x32_init( &sha256_ctx16 );
+   sha256_16x32_update( &sha256_ctx16, vdata, 64 );

   do
   {
@@ -80,28 +80,28 @@ int scanhash_sha256q_16way( struct work *work, const uint32_t max_nonce,

 #if defined(SHA256T_8WAY)

-static __thread sha256_8way_context sha256_ctx8 __attribute__ ((aligned (64)));
+static __thread sha256_8x32_context sha256_ctx8 __attribute__ ((aligned (64)));

 void sha256q_8way_hash( void* output, const void* input )
 {
   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-   sha256_8way_context ctx;
+   sha256_8x32_context ctx;
   memcpy( &ctx, &sha256_ctx8, sizeof ctx );

-   sha256_8way_update( &ctx, input + (64<<3), 16 );
-   sha256_8way_close( &ctx, vhash );
+   sha256_8x32_update( &ctx, input + (64<<3), 16 );
+   sha256_8x32_close( &ctx, vhash );

-   sha256_8way_init( &ctx );
-   sha256_8way_update( &ctx, vhash, 32 );
-   sha256_8way_close( &ctx, vhash );
+   sha256_8x32_init( &ctx );
+   sha256_8x32_update( &ctx, vhash, 32 );
+   sha256_8x32_close( &ctx, vhash );

-   sha256_8way_init( &ctx );
-   sha256_8way_update( &ctx, vhash, 32 );
-   sha256_8way_close( &ctx, vhash );
+   sha256_8x32_init( &ctx );
+   sha256_8x32_update( &ctx, vhash, 32 );
+   sha256_8x32_close( &ctx, vhash );

-   sha256_8way_init( &ctx );
-   sha256_8way_update( &ctx, vhash, 32 );
-   sha256_8way_close( &ctx, output );
+   sha256_8x32_init( &ctx );
+   sha256_8x32_update( &ctx, vhash, 32 );
+   sha256_8x32_close( &ctx, output );
 }

 int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,
@@ -123,8 +123,8 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
-   sha256_8way_init( &sha256_ctx8 );
-   sha256_8way_update( &sha256_ctx8, vdata, 64 );
+   sha256_8x32_init( &sha256_ctx8 );
+   sha256_8x32_update( &sha256_ctx8, vdata, 64 );

   do
   {
@@ -152,28 +152,28 @@ int scanhash_sha256q_8way( struct work *work, const uint32_t max_nonce,

 #if defined(SHA256T_4WAY)

-static __thread sha256_4way_context sha256_ctx4 __attribute__ ((aligned (64)));
+static __thread sha256_4x32_context sha256_ctx4 __attribute__ ((aligned (64)));

 void sha256q_4way_hash( void* output, const void* input )
 {
   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-   sha256_4way_context ctx;
+   sha256_4x32_context ctx;
   memcpy( &ctx, &sha256_ctx4, sizeof ctx );

-   sha256_4way_update( &ctx, input + (64<<2), 16 );
-   sha256_4way_close( &ctx, vhash );
+   sha256_4x32_update( &ctx, input + (64<<2), 16 );
+   sha256_4x32_close( &ctx, vhash );

-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, vhash );
+   sha256_4x32_init( &ctx );
+   sha256_4x32_update( &ctx, vhash, 32 );
+   sha256_4x32_close( &ctx, vhash );

-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, vhash );
+   sha256_4x32_init( &ctx );
+   sha256_4x32_update( &ctx, vhash, 32 );
+   sha256_4x32_close( &ctx, vhash );

-   sha256_4way_init( &ctx );
-   sha256_4way_update( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, output );
+   sha256_4x32_init( &ctx );
+   sha256_4x32_update( &ctx, vhash, 32 );
+   sha256_4x32_close( &ctx, output );
 }

 int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
@@ -205,8 +205,8 @@ int scanhash_sha256q_4way( struct work *work, uint32_t max_nonce,
                                        0 };

   v128_bswap32_intrlv80_4x32( vdata, pdata );
-   sha256_4way_init( &sha256_ctx4 );
-   sha256_4way_update( &sha256_ctx4, vdata, 64 );
+   sha256_4x32_init( &sha256_ctx4 );
+   sha256_4x32_update( &sha256_ctx4, vdata, 64 );

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -35,8 +35,6 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
   const int thr_id = mythr->id;
   const __m512i sixteen = v512_32( 16 );
   const bool bench = opt_benchmark;
-   const __m256i bswap_shuf = mm256_bcast_m128( v128_set64(
-                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   // prehash first block directly from pdata
   sha256_transform_le( phash, pdata, sha256_iv );
@@ -62,7 +60,7 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
   buf[15] = v512_32( 80*8 ); // bit count

   // partially pre-expand & prehash second message block, avoiding the nonces
-   sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
+   sha256_16x32_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );

   // vectorize IV for 2nd & 3rd sha256
   istate[0] = v512_32( sha256_iv[0] );
@@ -81,18 +79,17 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,

   do
   {
-      sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
+      sha256_16x32_final_rounds( block, buf, mstate1, mstate2, mexp_pre );

-      sha256_16way_transform_le( block, block, istate );
+      sha256_16x32_transform_le( block, block, istate );

-      if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
+      if ( sha256_16x32_transform_le_short( hash32, block, istate, ptarget ) )
      {
         for ( int lane = 0; lane < 16; lane++ )
         if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
         {
            extr_lane_16x32( phash, hash32, lane, 256 );
-            casti_m256i( phash, 0 ) =
-                _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
+            casti_m256i( phash, 0 ) = mm256_bswap_32( casti_m256i( phash, 0 ) );
            if ( likely( valid_hash( phash, ptarget ) && !bench ) )
            {
               pdata[19] = n + lane;
@@ -301,8 +298,6 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i last_byte = v256_32( 0x80000000 );
   const __m256i eight = v256_32( 8 );
-   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
-                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = v256_32( pdata[i] );
@@ -327,29 +322,29 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
   istate[6] = v256_32( sha256_iv[6] );
   istate[7] = v256_32( sha256_iv[7] );

-   sha256_8way_transform_le( mstate1, vdata, istate );
+   sha256_8x32_transform_le( mstate1, vdata, istate );

   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
+   sha256_8x32_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );

   do
   {
      // 1. final 16 bytes of data, with padding
-      sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
+      sha256_8x32_final_rounds( block, vdata+16, mstate1, mstate2,
                                mexp_pre );

      // 2. 32 byte hash from 1.
-      sha256_8way_transform_le( block, block, istate );
+      sha256_8x32_transform_le( block, block, istate );

      // 3. 32 byte hash from 2.
-      if ( unlikely( sha256_8way_transform_le_short(
+      if ( unlikely( sha256_8x32_transform_le_short(
                                    hash32, block, istate, ptarget ) ) )
      {
         for ( int lane = 0; lane < 8; lane++ )
         {
            extr_lane_8x32( lane_hash, hash32, lane, 256 );
            casti_m256i( lane_hash, 0 ) =
-             _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
+                             mm256_bswap_32( casti_m256i( lane_hash, 0 ) );
            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
            {
               pdata[19] = n + lane;
@@ -419,8 +414,8 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
   do
   {
      sha256_4x32_final_rounds( block, vdata+16, mhash1, mhash2, mexp_pre );
-      sha256_4way_transform_le( block,  block, iv );
-      sha256_4way_transform_le( hash32, block, iv );
+      sha256_4x32_transform_le( block,  block, iv );
+      sha256_4x32_transform_le( hash32, block, iv );

      for ( int lane = 0; lane < 4; lane++ )
      {
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -83,15 +83,13 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,
                              const uint64_t *state_in )
 {
    __m256i STATE0, STATE1;
-    __m256i MSG, TMP, BSWAP64;
+    __m256i MSG, TMP;
    __m256i TMSG0, TMSG1, TMSG2, TMSG3;
    __m256i ABEF_SAVE, CDGH_SAVE;

    // Load initial values
    TMP = _mm256_load_si256( (__m256i*) &state_in[0] );
    STATE1 = _mm256_load_si256( (__m256i*) &state_in[4] );
-    BSWAP64 = mm256_bcast_m128( _mm_set_epi64x( 0x08090a0b0c0d0e0f,
-                                                0x0001020304050607 ) );
    TMP = _mm256_permute4x64_epi64( TMP, 0xB1 );             // CDAB
    STATE1 = _mm256_permute4x64_epi64( STATE1, 0x1B );       // EFGH
    STATE0 = _mm256_permute2x128_si256( TMP, STATE1, 0x21 ); // ABEF
@@ -103,7 +101,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,

    // Rounds 0-3
    TMSG0 = _mm256_load_si256( (const __m256i*) (input+0) );
-    TMSG0 = _mm256_shuffle_epi8( TMSG0, BSWAP64 );
+    TMSG0 = mm256_bswap_64( TMSG0 );
    MSG = _mm256_add_epi64( TMSG0, casti_m256i( K512, 0 ) );
    STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                       _mm256_castsi256_si128 (MSG ) );
@@ -113,7 +111,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,

    // Rounds 4-7
    TMSG1 = _mm256_load_si256( (const __m256i*) (input+16) );
-    TMSG1 = _mm256_shuffle_epi8( TMSG1, BSWAP64 );
+    TMSG1 = mm256_bswap_64( TMSG1 );
    MSG = _mm256_add_epi64( TMSG1, casti_m256i( K512, 1 ) );
    STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                        _mm256_castsi256_si128( MSG ) );
@@ -124,7 +122,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,

    // Rounds 8-11
    TMSG2 = _mm256_load_si256( (const __m256i*) (input+32) );
-    TMSG2 = _mm256_shuffle_epi8( TMSG2, BSWAP64 );
+    TMSG2 = mm256_bswap_64( TMSG2 );
    MSG = _mm256_add_epi64( TMSG2, casti_m256i( K512, 2 ) );
    STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                       _mm256_castsi256_si128( MSG ) );
@@ -135,7 +133,7 @@ void sha512_opt_transform_be( uint64_t *state_out, const void *input,

    // Rounds 12-15
    TMSG3 = _mm256_load_si256( (const __m256i*) (input+48) );
-    TMSG3 = _mm256_shuffle_epi8( TMSG3, BSWAP64 );
+    TMSG3 = mm256_bswap_64( TMSG3 );
    MSG = _mm256_add_epi64( TMSG3, casti_m256i( K512, 3 ) );
    STATE1 = _mm256_sha512rnds2_epi64( STATE1, STATE0,
                                       _mm256_castsi256_si128( MSG ) );
@@ -735,8 +733,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
    unsigned ptr;
    const int buf_size = 128;
    const int pad = buf_size - 16;
-    const __m512i shuff_bswap64 = mm512_bcast_m128( _mm_set_epi64x(
-                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

    ptr = (unsigned)sc->count & (buf_size - 1U);
    sc->buf[ ptr>>3 ] = v512_64( 0x80 );
@@ -750,10 +746,8 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst )
    else
         memset_zero_512( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

-    sc->buf[ pad >> 3 ] = _mm512_shuffle_epi8(
-                       v512_64( sc->count >> 61 ), shuff_bswap64 );
-    sc->buf[ ( pad+8 ) >> 3 ] = _mm512_shuffle_epi8(
-                       v512_64( sc->count <<  3 ), shuff_bswap64 );
+    sc->buf[ pad >> 3 ] = v512_64( bswap_64( sc->count >> 61 ) );
+    sc->buf[ ( pad+8 ) >> 3 ] = v512_64( bswap_64( sc->count <<  3 ) );
    sha512_8x64_round( sc, sc->buf, sc->val );

    mm512_block_bswap_64( dst, sc->val );
@@ -957,8 +951,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
    unsigned ptr;
    const int buf_size = 128;
    const int pad = buf_size - 16;
-    const __m256i shuff_bswap64 = mm256_bcast_m128( _mm_set_epi64x(
-                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

    ptr = (unsigned)sc->count & (buf_size - 1U);
    sc->buf[ ptr>>3 ] = v256_64( 0x80 );
@@ -972,10 +964,8 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst )
    else
         memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

-    sc->buf[ pad >> 3 ] = _mm256_shuffle_epi8(
-                       v256_64( sc->count >> 61 ), shuff_bswap64 );
-    sc->buf[ ( pad+8 ) >> 3 ] = _mm256_shuffle_epi8( 
-                       v256_64( sc->count <<  3 ), shuff_bswap64 );
+    sc->buf[ pad >> 3 ] = v256_64( bswap_64( sc->count >> 61 ) );
+    sc->buf[ ( pad+8 ) >> 3 ] = v256_64( bswap_64( sc->count <<  3 ) );
    sha512_4x64_round( sc, sc->buf, sc->val );

    mm256_block_bswap_64( dst, sc->val );
@@ -1138,8 +1128,8 @@ void sha512_2x64_close( sha512_2x64_context *sc, void *dst )
    else
         v128_memset_zero( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

-    sc->buf[ pad >> 3 ] = v128_bswap64( v128_64( sc->count >> 61 ) );
-    sc->buf[ ( pad+8 ) >> 3 ] = v128_bswap64( v128_64( sc->count << 3 ) );
+    sc->buf[ pad >> 3 ] = v128_64( bswap_64( sc->count >> 61 ) );
+    sc->buf[ ( pad+8 ) >> 3 ] = v128_64( bswap_64( sc->count << 3 ) );
    sha512_2x64_round( sc, sc->buf, sc->val );

    v128_block_bswap64( castp_v128u64( dst ), sc->val );
--- a/algo/sha/sha512-hash.h
+++ b/algo/sha/sha512-hash.h
@@ -36,7 +36,6 @@ typedef struct
   uint64_t count;
   bool initialized;
 } sha512_8x64_context __attribute__ ((aligned (128)));
-#define sha512_8way_context sha512_8x64_context

 void sha512_8x64_init( sha512_8x64_context *sc);
 void sha512_8x64_update( sha512_8x64_context *sc, const void *data, 
@@ -45,10 +44,6 @@ void sha512_8x64_close( sha512_8x64_context *sc, void *dst );
 void sha512_8x64_ctx( sha512_8x64_context *sc, void *dst, const void *data,
                      size_t len );

-#define sha512_8way_init     sha512_8x64_init
-#define sha512_8way_update   sha512_8x64_update
-#define sha512_8way_close    sha512_8x64_close
-
 #endif  // AVX512

 #if defined (__AVX2__)
@@ -62,7 +57,6 @@ typedef struct
   uint64_t count;
   bool initialized;
 } sha512_4x64_context __attribute__ ((aligned (64)));
-#define sha512_4way_context sha512_4x64_context

 void sha512_4x64_init( sha512_4x64_context *sc);
 void sha512_4x64_update( sha512_4x64_context *sc, const void *data,
@@ -71,10 +65,6 @@ void sha512_4x64_close( sha512_4x64_context *sc, void *dst );
 void sha512_4x64_ctx( sha512_4x64_context *sc, void *dst, const void *data,
                       size_t len );

-#define sha512_4way_init     sha512_4x64_init
-#define sha512_4way_update   sha512_4x64_update
-#define sha512_4way_close    sha512_4x64_close
-
 #endif  // AVX2

 typedef struct
--- a/algo/sha/sha512256d-4way.c
+++ b/algo/sha/sha512256d-4way.c
@@ -14,7 +14,7 @@

 #if defined(SHA512256D_8WAY)

-static void sha512256d_8way_init( sha512_8way_context *ctx )
+static void sha512256d_8x64_init( sha512_8x64_context *ctx )
 {
  ctx->count = 0;
  ctx->initialized = true;
@@ -33,7 +33,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
 {
    uint64_t hash[8*8] __attribute__ ((aligned (128)));
    uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-    sha512_8way_context ctx; 
+    sha512_8x64_context ctx; 
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
    uint64_t *hash_q3 = &(hash[3*8]);
    uint32_t *pdata = work->data;
@@ -53,13 +53,13 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,
                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ), *noncev );
    do
    {
-       sha512256d_8way_init( &ctx );
-       sha512_8way_update( &ctx, vdata, 80 );
-       sha512_8way_close( &ctx, hash );        
+       sha512256d_8x64_init( &ctx );
+       sha512_8x64_update( &ctx, vdata, 80 );
+       sha512_8x64_close( &ctx, hash );        

-       sha512256d_8way_init( &ctx );
-       sha512_8way_update( &ctx, hash, 32 );
-       sha512_8way_close( &ctx, hash );
+       sha512256d_8x64_init( &ctx );
+       sha512_8x64_update( &ctx, hash, 32 );
+       sha512_8x64_close( &ctx, hash );

       for ( int lane = 0; lane < 8; lane++ )
       if ( unlikely( hash_q3[ lane ] <= targ_q3 && !bench ) )
@@ -82,7 +82,7 @@ int scanhash_sha512256d_8way( struct work *work, uint32_t max_nonce,

 #elif defined(SHA512256D_4WAY)

-static void sha512256d_4way_init( sha512_4way_context *ctx )
+static void sha512256d_4x64_init( sha512_4x64_context *ctx )
 {
  ctx->count = 0;
  ctx->initialized = true;
@@ -101,7 +101,7 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
 {
    uint64_t hash[8*4] __attribute__ ((aligned (64)));
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-    sha512_4way_context ctx;
+    sha512_4x64_context ctx;
    uint32_t lane_hash[8] __attribute__ ((aligned (32)));
    uint64_t *hash_q3 = &(hash[3*4]);
    uint32_t *pdata = work->data;
@@ -119,13 +119,13 @@ int scanhash_sha512256d_4way( struct work *work, uint32_t max_nonce,
                     n+3, 0, n+2, 0, n+1, 0, n, 0 ), casti_m256i( vdata,9 ) );
    do
    {
-       sha512256d_4way_init( &ctx );
-       sha512_4way_update( &ctx, vdata, 80 );
-       sha512_4way_close( &ctx, hash );
+       sha512256d_4x64_init( &ctx );
+       sha512_4x64_update( &ctx, vdata, 80 );
+       sha512_4x64_close( &ctx, hash );

-       sha512256d_4way_init( &ctx );
-       sha512_4way_update( &ctx, hash, 32 );
-       sha512_4way_close( &ctx, hash );
+       sha512256d_4x64_init( &ctx );
+       sha512_4x64_update( &ctx, hash, 32 );
+       sha512_4x64_close( &ctx, hash );

       for ( int lane = 0; lane < 4; lane++ )
       if ( hash_q3[ lane ] <= targ_q3 )