v25.4

2025-09-17 23:44:27 +00:00 · 2025-06-20 20:31:41 -04:00
parent dd99580a4c
commit 66191db93c
86 changed files with 2701 additions and 4322 deletions
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -6,15 +6,15 @@

 #if defined (BLAKE_4WAY)

-blake256r14_4way_context blake_4w_ctx;
+blake256r14_4x32_context blake_4w_ctx;

 void blakehash_4way(void *state, const void *input)
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-     blake256r14_4way_context ctx;
+     blake256r14_4x32_context ctx;
     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
-     blake256r14_4way_update( &ctx, input + (64<<2), 16 );
-     blake256r14_4way_close( &ctx, vhash );
+     blake256r14_4x32_update( &ctx, input + (64<<2), 16 );
+     blake256r14_4x32_close( &ctx, vhash );
     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

@@ -35,8 +35,8 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,
      HTarget = 0x7f;

   v128_bswap32_intrlv80_4x32( vdata, pdata );
-   blake256r14_4way_init( &blake_4w_ctx );
-   blake256r14_4way_update( &blake_4w_ctx, vdata, 64 );
+   blake256r14_4x32_init( &blake_4w_ctx );
+   blake256r14_4x32_update( &blake_4w_ctx, vdata, 64 );

   do {
      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
@@ -61,15 +61,15 @@ int scanhash_blake_4way( struct work *work, uint32_t max_nonce,

 #if defined(BLAKE_8WAY)

-blake256r14_8way_context blake_8w_ctx;
+blake256r14_8x32_context blake_8w_ctx;

 void blakehash_8way( void *state, const void *input )
 {
     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
-     blake256r14_8way_context ctx;
+     blake256r14_8x32_context ctx;
     memcpy( &ctx, &blake_8w_ctx, sizeof ctx );
-     blake256r14_8way( &ctx, input + (64<<3), 16 );
-     blake256r14_8way_close( &ctx, vhash );
+     blake256r14_8x32( &ctx, input + (64<<3), 16 );
+     blake256r14_8x32_close( &ctx, vhash );
     _dintrlv_8x32( state,     state+ 32, state+ 64, state+ 96,
                    state+128, state+160, state+192, state+224,
                    vhash, 256 );
@@ -93,8 +93,8 @@ int scanhash_blake_8way( struct work *work, uint32_t max_nonce,

   mm256_bswap32_intrlv80_8x32( vdata, pdata );

-   blake256r14_8way_init( &blake_8w_ctx );
-   blake256r14_8way( &blake_8w_ctx, vdata, 64 );
+   blake256r14_8x32_init( &blake_8w_ctx );
+   blake256r14_8x32( &blake_8w_ctx, vdata, 64 );

   do {
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
--- a/algo/blake/blake256-hash.c
+++ b/algo/blake/blake256-hash.c
@@ -423,33 +423,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 		(state)->T1 = T1; \
 	} while (0)

-
-#if defined(__SSSE3__)
-
-#define BLAKE256_4X32_BLOCK_BSWAP32 \
-{ \
-   v128_t shuf_bswap32 = v128_set64( 0x0c0d0e0f08090a0b, \
-                                     0x0405060700010203 ); \
-   M0 = _mm_shuffle_epi8( buf[ 0], shuf_bswap32 ); \
-   M1 = _mm_shuffle_epi8( buf[ 1], shuf_bswap32 ); \
-   M2 = _mm_shuffle_epi8( buf[ 2], shuf_bswap32 ); \
-   M3 = _mm_shuffle_epi8( buf[ 3], shuf_bswap32 ); \
-   M4 = _mm_shuffle_epi8( buf[ 4], shuf_bswap32 ); \
-   M5 = _mm_shuffle_epi8( buf[ 5], shuf_bswap32 ); \
-   M6 = _mm_shuffle_epi8( buf[ 6], shuf_bswap32 ); \
-   M7 = _mm_shuffle_epi8( buf[ 7], shuf_bswap32 ); \
-   M8 = _mm_shuffle_epi8( buf[ 8], shuf_bswap32 ); \
-   M9 = _mm_shuffle_epi8( buf[ 9], shuf_bswap32 ); \
-   MA = _mm_shuffle_epi8( buf[10], shuf_bswap32 ); \
-   MB = _mm_shuffle_epi8( buf[11], shuf_bswap32 ); \
-   MC = _mm_shuffle_epi8( buf[12], shuf_bswap32 ); \
-   MD = _mm_shuffle_epi8( buf[13], shuf_bswap32 ); \
-   ME = _mm_shuffle_epi8( buf[14], shuf_bswap32 ); \
-   MF = _mm_shuffle_epi8( buf[15], shuf_bswap32 ); \
-}
-
-#else  // SSE2
-
 #define BLAKE256_4X32_BLOCK_BSWAP32 \
 { \
   M0 = v128_bswap32( buf[0] ); \
@@ -470,8 +443,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
   MF = v128_bswap32( buf[15] ); \
 }

-#endif  // SSSE3 else SSE2
-
 #define COMPRESS32_4X32( rounds ) \
 { \
   v128_t M0, M1, M2, M3, M4, M5, M6, M7; \
@@ -926,22 +897,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
      ROUND_S_4X32_3;
   }

-#if defined(__SSSE3__)
-
-   const v128_t shuf_bswap32 =
-                      v128_set64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
-
-   H[0] = _mm_shuffle_epi8( v128_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm_shuffle_epi8( v128_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm_shuffle_epi8( v128_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm_shuffle_epi8( v128_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm_shuffle_epi8( v128_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm_shuffle_epi8( v128_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm_shuffle_epi8( v128_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm_shuffle_epi8( v128_xor3( VF, V7, h[7] ), shuf_bswap32 );
-
-#else
-
   H[0] = v128_bswap32( v128_xor3( V8, V0, h[0] ) );
   H[1] = v128_bswap32( v128_xor3( V9, V1, h[1] ) );
   H[2] = v128_bswap32( v128_xor3( VA, V2, h[2] ) );
@@ -950,8 +905,6 @@ void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
   H[5] = v128_bswap32( v128_xor3( VD, V5, h[5] ) );
   H[6] = v128_bswap32( v128_xor3( VE, V6, h[6] ) );
   H[7] = v128_bswap32( v128_xor3( VF, V7, h[7] ) );
-
-#endif
 }

 #if defined (__AVX2__)
@@ -1291,24 +1244,22 @@ do { \
   VD = v256_32( T0 ^ 0x299F31D0 ); \
   VE = v256_32( T1 ^ 0x082EFA98 ); \
   VF = v256_32( T1 ^ 0xEC4E6C89 ); \
-   const __m256i shuf_bswap32 = mm256_set2_64( \
-                               0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
-   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
-   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
-   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
-   M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
-   M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
-   M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
-   M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
-   M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
-   M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
-   M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
-   MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
-   MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
-   MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
-   MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
-   ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
-   MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
+   M0 = mm256_bswap_32( * buf     ); \
+   M1 = mm256_bswap_32( *(buf+ 1) ); \
+   M2 = mm256_bswap_32( *(buf+ 2) ); \
+   M3 = mm256_bswap_32( *(buf+ 3) ); \
+   M4 = mm256_bswap_32( *(buf+ 4) ); \
+   M5 = mm256_bswap_32( *(buf+ 5) ); \
+   M6 = mm256_bswap_32( *(buf+ 6) ); \
+   M7 = mm256_bswap_32( *(buf+ 7) ); \
+   M8 = mm256_bswap_32( *(buf+ 8) ); \
+   M9 = mm256_bswap_32( *(buf+ 9) ); \
+   MA = mm256_bswap_32( *(buf+10) ); \
+   MB = mm256_bswap_32( *(buf+11) ); \
+   MC = mm256_bswap_32( *(buf+12) ); \
+   MD = mm256_bswap_32( *(buf+13) ); \
+   ME = mm256_bswap_32( *(buf+14) ); \
+   MF = mm256_bswap_32( *(buf+15) ); \
   ROUND_S_8WAY(0); \
   ROUND_S_8WAY(1); \
   ROUND_S_8WAY(2); \
@@ -1401,7 +1352,7 @@ do { \
   H7 = mm256_xor3( VF, V7, H7 ); \
 }

-void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
+void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data )
 {
   __m256i *M = (__m256i*)data;
@@ -1491,7 +1442,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
                         _mm256_xor_si256( v256_32( CSE ), M[15] ) );
 }

-void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
                     const void *midhash, const void *data, const int rounds )
 {
   __m256i *H = (__m256i*)final_hash;
@@ -1596,17 +1547,14 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
      ROUND256_8WAY_3;
   }

-   const __m256i shuf_bswap32 =
-                  mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
-
-   H[0] = _mm256_shuffle_epi8( mm256_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm256_shuffle_epi8( mm256_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm256_shuffle_epi8( mm256_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm256_shuffle_epi8( mm256_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm256_shuffle_epi8( mm256_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm256_shuffle_epi8( mm256_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm256_shuffle_epi8( mm256_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm256_shuffle_epi8( mm256_xor3( VF, V7, h[7] ), shuf_bswap32 );
+   H[0] = mm256_bswap_32( mm256_xor3( V8, V0, h[0] ) );
+   H[1] = mm256_bswap_32( mm256_xor3( V9, V1, h[1] ) );
+   H[2] = mm256_bswap_32( mm256_xor3( VA, V2, h[2] ) );
+   H[3] = mm256_bswap_32( mm256_xor3( VB, V3, h[3] ) );
+   H[4] = mm256_bswap_32( mm256_xor3( VC, V4, h[4] ) );
+   H[5] = mm256_bswap_32( mm256_xor3( VD, V5, h[5] ) );
+   H[6] = mm256_bswap_32( mm256_xor3( VE, V6, h[6] ) );
+   H[7] = mm256_bswap_32( mm256_xor3( VF, V7, h[7] ) );
 }

 #endif
@@ -1933,8 +1881,6 @@ do { \
   __m512i M8, M9, MA, MB, MC, MD, ME, MF; \
   __m512i V0, V1, V2, V3, V4, V5, V6, V7; \
   __m512i V8, V9, VA, VB, VC, VD, VE, VF; \
-   const __m512i shuf_bswap32 = mm512_bcast_m128( v128_set64( \
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) ); \
   V0 = H0; \
   V1 = H1; \
   V2 = H2; \
@@ -1951,22 +1897,22 @@ do { \
   VD = v512_32( T0 ^ 0x299F31D0 ); \
   VE = v512_32( T1 ^ 0x082EFA98 ); \
   VF = v512_32( T1 ^ 0xEC4E6C89 ); \
-   M0 = _mm512_shuffle_epi8( * buf    , shuf_bswap32 ); \
-   M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
-   M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
-   M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap32 ); \
-   M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap32 ); \
-   M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap32 ); \
-   M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap32 ); \
-   M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap32 ); \
-   M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap32 ); \
-   M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap32 ); \
-   MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap32 ); \
-   MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap32 ); \
-   MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap32 ); \
-   MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap32 ); \
-   ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap32 ); \
-   MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap32 ); \
+   M0 = mm512_bswap_32( * buf     ); \
+   M1 = mm512_bswap_32( *(buf+ 1) ); \
+   M2 = mm512_bswap_32( *(buf+ 2) ); \
+   M3 = mm512_bswap_32( *(buf+ 3) ); \
+   M4 = mm512_bswap_32( *(buf+ 4) ); \
+   M5 = mm512_bswap_32( *(buf+ 5) ); \
+   M6 = mm512_bswap_32( *(buf+ 6) ); \
+   M7 = mm512_bswap_32( *(buf+ 7) ); \
+   M8 = mm512_bswap_32( *(buf+ 8) ); \
+   M9 = mm512_bswap_32( *(buf+ 9) ); \
+   MA = mm512_bswap_32( *(buf+10) ); \
+   MB = mm512_bswap_32( *(buf+11) ); \
+   MC = mm512_bswap_32( *(buf+12) ); \
+   MD = mm512_bswap_32( *(buf+13) ); \
+   ME = mm512_bswap_32( *(buf+14) ); \
+   MF = mm512_bswap_32( *(buf+15) ); \
   ROUND_S_16WAY(0); \
   ROUND_S_16WAY(1); \
   ROUND_S_16WAY(2); \
@@ -2063,7 +2009,7 @@ do { \
 // is constant for every nonce and only needs to be run once per job. The
 // second part is run for each nonce using the precalculated midstate and the
 // hash from the first block.
-void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
+void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
                                       void *data )
 {
   __m512i *M = (__m512i*)data;
@@ -2157,7 +2103,7 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
 }

-// Dfault is 14 rounds, blakecoin & vanilla are 8.
+// Default is 14 rounds, blakecoin & vanilla are 8.
-void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
                     const void *midhash, const void *data, const int rounds )
 {
   __m512i *H = (__m512i*)final_hash;
@@ -2274,27 +2220,23 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
   }

   // Byte swap final hash
-   const __m512i shuf_bswap32 =  mm512_bcast_m128( v128_set64( 
-                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
-   H[0] = _mm512_shuffle_epi8( mm512_xor3( V8, V0, h[0] ), shuf_bswap32 );
-   H[1] = _mm512_shuffle_epi8( mm512_xor3( V9, V1, h[1] ), shuf_bswap32 );
-   H[2] = _mm512_shuffle_epi8( mm512_xor3( VA, V2, h[2] ), shuf_bswap32 );
-   H[3] = _mm512_shuffle_epi8( mm512_xor3( VB, V3, h[3] ), shuf_bswap32 );
-   H[4] = _mm512_shuffle_epi8( mm512_xor3( VC, V4, h[4] ), shuf_bswap32 );
-   H[5] = _mm512_shuffle_epi8( mm512_xor3( VD, V5, h[5] ), shuf_bswap32 );
-   H[6] = _mm512_shuffle_epi8( mm512_xor3( VE, V6, h[6] ), shuf_bswap32 );
-   H[7] = _mm512_shuffle_epi8( mm512_xor3( VF, V7, h[7] ), shuf_bswap32 );
+   H[0] = mm512_bswap_32( mm512_xor3( V8, V0, h[0] ) );
+   H[1] = mm512_bswap_32( mm512_xor3( V9, V1, h[1] ) );
+   H[2] = mm512_bswap_32( mm512_xor3( VA, V2, h[2] ) );
+   H[3] = mm512_bswap_32( mm512_xor3( VB, V3, h[3] ) );
+   H[4] = mm512_bswap_32( mm512_xor3( VC, V4, h[4] ) );
+   H[5] = mm512_bswap_32( mm512_xor3( VD, V5, h[5] ) );
+   H[6] = mm512_bswap_32( mm512_xor3( VE, V6, h[6] ) );
+   H[7] = mm512_bswap_32( mm512_xor3( VF, V7, h[7] ) );
 }

 #endif

 // Blake-256 4 way

-static const uint32_t salt_zero_4x32_small[4] = { 0, 0, 0, 0 };
-
 static void
 blake32_4x32_init( blake_4x32_small_context *ctx, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
+                   int rounds )
 {
   casti_v128( ctx->H, 0 ) = v128_64( 0x6A09E6676A09E667 );
   casti_v128( ctx->H, 1 ) = v128_64( 0xBB67AE85BB67AE85 );
@@ -2404,11 +2346,10 @@ blake32_4x32_close( blake_4x32_small_context *ctx, unsigned ub, unsigned n,

 // Blake-256 8 way

-static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

 static void
-blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
+blake32_8way_init( blake256_8x32_context *sc, const uint32_t *iv,
+                   int rounds )
 {
   casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E6676A09E667 );
   casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE85BB67AE85 );
@@ -2424,7 +2365,7 @@ blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
 }

 static void
-blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
+blake32_8way( blake256_8x32_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   __m256i *buf;
@@ -2466,7 +2407,7 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
 }

 static void
-blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
+blake32_8way_close( blake256_8x32_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
   __m256i buf[16];
@@ -2520,7 +2461,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
 }

 static void
-blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
+blake32_8way_le( blake256_8x32_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   __m256i *buf;
@@ -2562,7 +2503,7 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
 }

 static void
-blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
+blake32_8way_close_le( blake256_8x32_context *sc, unsigned ub, unsigned n,
                       void *dst, size_t out_size_w32 )
 {
   __m256i buf[16];
@@ -2622,8 +2563,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
 //Blake-256 16 way AVX512

 static void
-blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
+blake32_16way_init( blake256_16x32_context *sc, const uint32_t *iv,
+                    int rounds )
 {
   casti_m512i( sc->H, 0 ) = v512_64( 0x6A09E6676A09E667 );
   casti_m512i( sc->H, 1 ) = v512_64( 0xBB67AE85BB67AE85 );
@@ -2639,7 +2580,7 @@ blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
 }

 static void
-blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
+blake32_16way( blake256_16x32_context *sc, const void *data, size_t len )
 {
   __m512i *vdata = (__m512i*)data;
   __m512i *buf;
@@ -2679,7 +2620,7 @@ blake32_16way( blake_16way_small_context *sc, const void *data, size_t len )
   sc->ptr = ptr;
 }
 static void
-blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
+blake32_16way_close( blake256_16x32_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
   __m512i buf[16];
@@ -2733,7 +2674,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
 }

 static void
-blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
+blake32_16way_le( blake256_16x32_context *sc, const void *data, size_t len )
 {
   __m512i *vdata = (__m512i*)data;
   __m512i *buf;
@@ -2776,7 +2717,7 @@ blake32_16way_le( blake_16way_small_context *sc, const void *data, size_t len )
 }

 static void
-blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
+blake32_16way_close_le( blake256_16x32_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
   __m512i buf[16];
@@ -2827,65 +2768,65 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
 }

 void
-blake256_16way_init(void *cc)
+blake256_16x32_init(void *cc)
 {
-   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
+   blake32_16way_init( cc, IV256, 14 );
 }

 void
-blake256_16way_update(void *cc, const void *data, size_t len)
+blake256_16x32_update(void *cc, const void *data, size_t len)
 {
        blake32_16way(cc, data, len);
 }

 void
-blake256_16way_close(void *cc, void *dst)
+blake256_16x32_close(void *cc, void *dst)
 {
        blake32_16way_close(cc, 0, 0, dst, 8);
 }

 void
-blake256_16way_update_le(void *cc, const void *data, size_t len)
+blake256_16x32_update_le(void *cc, const void *data, size_t len)
 {
   blake32_16way_le(cc, data, len);
 }

 void
-blake256_16way_close_le(void *cc, void *dst)
+blake256_16x32_close_le(void *cc, void *dst)
 {
    blake32_16way_close_le(cc, 0, 0, dst, 8);
 }

 void blake256r14_16way_init(void *cc)
 {
-   blake32_16way_init( cc, IV256, salt_zero_8way_small, 14 );
+   blake32_16way_init( cc, IV256, 14 );
 }

 void
-blake256r14_16way_update(void *cc, const void *data, size_t len)
+blake256r14_16x32_update(void *cc, const void *data, size_t len)
 {
   blake32_16way(cc, data, len);
 }

 void
-blake256r14_16way_close(void *cc, void *dst)
+blake256r14_16x32_close(void *cc, void *dst)
 {
   blake32_16way_close(cc, 0, 0, dst, 8);
 }

 void blake256r8_16way_init(void *cc)
 {
-   blake32_16way_init( cc, IV256, salt_zero_8way_small, 8 );
+   blake32_16way_init( cc, IV256, 8 );
 }

 void
-blake256r8_16way_update(void *cc, const void *data, size_t len)
+blake256r8_16x32_update(void *cc, const void *data, size_t len)
 {
   blake32_16way(cc, data, len);
 }

 void
-blake256r8_16way_close(void *cc, void *dst)
+blake256r8_16x32_close(void *cc, void *dst)
 {
   blake32_16way_close(cc, 0, 0, dst, 8);
 }
@@ -2898,7 +2839,7 @@ blake256r8_16way_close(void *cc, void *dst)
 void
 blake256_4x32_init(void *ctx)
 {
-   blake32_4x32_init( ctx, IV256, salt_zero_4x32_small, 14 );
+   blake32_4x32_init( ctx, IV256, 14 );
 }

 void
@@ -2918,31 +2859,31 @@ blake256_4x32_close(void *ctx, void *dst)
 // Blake-256 8 way

 void
-blake256_8way_init(void *cc)
+blake256_8x32_init(void *cc)
 {
-   blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
+   blake32_8way_init( cc, IV256, 14 );
 }

 void
-blake256_8way_update(void *cc, const void *data, size_t len)
+blake256_8x32_update(void *cc, const void *data, size_t len)
 {
        blake32_8way(cc, data, len);
 }

 void
-blake256_8way_close(void *cc, void *dst)
+blake256_8x32_close(void *cc, void *dst)
 {
        blake32_8way_close(cc, 0, 0, dst, 8);
 }

 void
-blake256_8way_update_le(void *cc, const void *data, size_t len)
+blake256_8x32_update_le(void *cc, const void *data, size_t len)
 {
        blake32_8way_le(cc, data, len);
 }

 void
-blake256_8way_close_le(void *cc, void *dst)
+blake256_8x32_close_le(void *cc, void *dst)
 {
        blake32_8way_close_le(cc, 0, 0, dst, 8);
 }
@@ -2952,7 +2893,7 @@ blake256_8way_close_le(void *cc, void *dst)
 // 14 rounds Blake, Decred
 void blake256r14_4x32_init(void *cc)
 {
-   blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 14 );
+   blake32_4x32_init( cc, IV256, 14 );
 }

 void
@@ -2969,19 +2910,19 @@ blake256r14_4x32_close(void *cc, void *dst)

 #if defined(__AVX2__)

-void blake256r14_8way_init(void *cc)
+void blake256r14_8x32_init(void *cc)
 {
-   blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
+   blake32_8way_init( cc, IV256, 14 );
 }

 void
-blake256r14_8way_update(void *cc, const void *data, size_t len)
+blake256r14_8x32_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }

 void
-blake256r14_8way_close(void *cc, void *dst)
+blake256r14_8x32_close(void *cc, void *dst)
 {
   blake32_8way_close(cc, 0, 0, dst, 8);
 }
@@ -2991,7 +2932,7 @@ blake256r14_8way_close(void *cc, void *dst)
 // 8 rounds Blakecoin, Vanilla
 void blake256r8_4x32_init(void *cc)
 {
-   blake32_4x32_init( cc, IV256, salt_zero_4x32_small, 8 );
+   blake32_4x32_init( cc, IV256, 8 );
 }

 void
@@ -3008,19 +2949,19 @@ blake256r8_4x32_close(void *cc, void *dst)

 #if defined (__AVX2__)

-void blake256r8_8way_init(void *cc)
+void blake256r8_8x32_init(void *cc)
 {
-   blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 );
+   blake32_8way_init( cc, IV256, 8 );
 }

 void
-blake256r8_8way_update(void *cc, const void *data, size_t len)
+blake256r8_8x32_update(void *cc, const void *data, size_t len)
 {
   blake32_8way(cc, data, len);
 }

 void
-blake256r8_8way_close(void *cc, void *dst)
+blake256r8_8x32_close(void *cc, void *dst)
 {
   blake32_8way_close(cc, 0, 0, dst, 8);
 }
--- a/algo/blake/blake256-hash.h
+++ b/algo/blake/blake256-hash.h
@@ -29,13 +29,6 @@ typedef struct

 void blake256_transform_le( uint32_t *H, const uint32_t *buf,
                            const uint32_t T0, const uint32_t T1, int rounds );
-/*
-void blake256_init( blake256_context *sc );
-void blake256_update( blake256_context *sc, const void *data, size_t len );
-void blake256_close( blake256_context *sc, void *dst );
-void blake256_full( blake256_context *sc, void *dst, const void *data,
-                    size_t len );
-*/

 //////////////////////////////////
 //
@@ -55,6 +48,10 @@ typedef blake_4x32_small_context blake256_4x32_context;
 void blake256_4x32_init(void *ctx);
 void blake256_4x32_update(void *ctx, const void *data, size_t len);
 void blake256_4x32_close(void *ctx, void *dst);
+void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
+                                      void *data );
+void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
+                     const void *midhash, const void *data, const int rounds );

 // 14 rounds
 typedef blake_4x32_small_context blake256r14_4x32_context;
@@ -68,29 +65,6 @@ void blake256r8_4x32_init(void *cc);
 void blake256r8_4x32_update(void *cc, const void *data, size_t len);
 void blake256r8_4x32_close(void *cc, void *dst);

-void blake256_4x32_round0_prehash_le( void *midstate, const void *midhash,
-                                      void *data );
-void blake256_4x32_final_rounds_le( void *final_hash, const void *midstate,
-                     const void *midhash, const void *data, const int rounds );
-
-#define blake_4way_small_context         blake256_4x32_context
-#define blake256_4way_context            blake256_4x32_context
-#define blake256_4way_init               blake256_4x32_init
-#define blake256_4way_update             blake256_4x32_update
-#define blake256_4way_close              blake256_4x32_close
-#define blake256_4way_update_le          blake256_4x32_update_le
-#define blake256_4way_close_le           blake256_4x32_close_le
-#define blake256_4way_round0_prehash_le  blake256_4x32_round0_prehash_le
-#define blake256_4way_final_rounds_le    blake256_4x32_final_rounds_le
-#define blake256r14_4way_context         blake256r14_4x32_context
-#define blake256r14_4way_init            blake256r14_4x32_init
-#define blake256r14_4way_update          blake256r14_4x32_update
-#define blake256r14_4way_close           blake256r14_4x32_close
-#define blake256r8_4way_context          blake256r14_4x32_context
-#define blake256r8_4way_init             blake256r14_4x32_init
-#define blake256r8_4way_update           blake256r14_4x32_update
-#define blake256r8_4way_close            blake256r14_4x32_close
-
 #ifdef __AVX2__

 //////////////////////////////
@@ -107,45 +81,28 @@ typedef struct
 } blake_8way_small_context;

 // Default 14 rounds
-typedef blake_8way_small_context blake256_8way_context;
-void blake256_8way_init(void *cc);
-void blake256_8way_update(void *cc, const void *data, size_t len);
-void blake256_8way_close(void *cc, void *dst);
-void blake256_8way_update_le(void *cc, const void *data, size_t len);
-void blake256_8way_close_le(void *cc, void *dst);
-void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
+typedef blake_8way_small_context blake256_8x32_context;
+void blake256_8x32_init(void *cc);
+void blake256_8x32_update(void *cc, const void *data, size_t len);
+void blake256_8x32_close(void *cc, void *dst);
+void blake256_8x32_update_le(void *cc, const void *data, size_t len);
+void blake256_8x32_close_le(void *cc, void *dst);
+void blake256_8x32_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data );
-void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_8x32_final_rounds_le( void *final_hash, const void *midstate,
                    const void *midhash, const void *data, const int rounds );

 // 14 rounds, blake, decred
-typedef blake_8way_small_context blake256r14_8way_context;
-void blake256r14_8way_init(void *cc);
-void blake256r14_8way_update(void *cc, const void *data, size_t len);
-void blake256r14_8way_close(void *cc, void *dst);
+typedef blake_8way_small_context blake256r14_8x32_context;
+void blake256r14_8x32_init(void *cc);
+void blake256r14_8x32_update(void *cc, const void *data, size_t len);
+void blake256r14_8x32_close(void *cc, void *dst);

 // 8 rounds, blakecoin, vanilla
-typedef blake_8way_small_context blake256r8_8way_context;
-void blake256r8_8way_init(void *cc);
-void blake256r8_8way_update(void *cc, const void *data, size_t len);
-void blake256r8_8way_close(void *cc, void *dst);
-
-#define blake_8x32_small_context      blake256_8way_context
-#define blake_8x32_init               blake256_8way_init
-#define blake_8x32_update             blake256_8way_update
-#define blake_8x32_close              blake256_8way_close
-#define blake_8x32_update_le          blake256_8way_update_le
-#define blake_8x32_close_le           blake256_8way_close_le
-#define blake_8x32_round0_prehash_le  blake256_8way_round0_prehash
-#define blake_8x32_final_rounds_le    blake256_8way_final_rounds_le
-#define blake256r14_8x32_context      blake256r14_8way_context
-#define blake256r14_8x32_init         blake256r14_8way_init
-#define blake256r14_8x32_update       blake256r14_8way_update
-#define blake256r14_8x32_close        blake256r14_8way_close
-#define blake256r8_8x32_context       blake256r14_8way_context
-#define blake256r8_8x32_init          blake256r14_8way_init
-#define blake256r8_8x32_update        blake256r14_8way_update
-#define blake256r8_8x32_close         blake256r14_8way_close
+typedef blake_8way_small_context blake256r8_8x32_context;
+void blake256r8_8x32_init(void *cc);
+void blake256r8_8x32_update(void *cc, const void *data, size_t len);
+void blake256r8_8x32_close(void *cc, void *dst);

 #if defined(SIMD512)

@@ -163,46 +120,29 @@ typedef struct
 } blake_16way_small_context __attribute__ ((aligned (128)));

 // Default 14 rounds
-typedef blake_16way_small_context blake256_16way_context;
-void blake256_16way_init(void *cc);
-void blake256_16way_update(void *cc, const void *data, size_t len);
-void blake256_16way_close(void *cc, void *dst);
+typedef blake_16way_small_context blake256_16x32_context;
+void blake256_16x32_init(void *cc);
+void blake256_16x32_update(void *cc, const void *data, size_t len);
+void blake256_16x32_close(void *cc, void *dst);
 // Expects data in little endian order, no byte swap needed
-void blake256_16way_update_le(void *cc, const void *data, size_t len);
-void blake256_16way_close_le(void *cc, void *dst);
-void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
+void blake256_16x32_update_le(void *cc, const void *data, size_t len);
+void blake256_16x32_close_le(void *cc, void *dst);
+void blake256_16x32_round0_prehash_le( void *midstate, const void *midhash,
                                       void *data );
-void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
+void blake256_16x32_final_rounds_le( void *final_hash, const void *midstate,
                     const void *midhash, const void *data, const int rounds );

 // 14 rounds, blake, decred
-typedef blake_16way_small_context blake256r14_16way_context;
-void blake256r14_16way_init(void *cc);
-void blake256r14_16way_update(void *cc, const void *data, size_t len);
-void blake256r14_16way_close(void *cc, void *dst);
+typedef blake_16way_small_context blake256r14_16x32_context;
+void blake256r14_16x32_init(void *cc);
+void blake256r14_16x32_update(void *cc, const void *data, size_t len);
+void blake256r14_16x32_close(void *cc, void *dst);

 // 8 rounds, blakecoin, vanilla
-typedef blake_16way_small_context blake256r8_16way_context;
-void blake256r8_16way_init(void *cc);
-void blake256r8_16way_update(void *cc, const void *data, size_t len);
-void blake256r8_16way_close(void *cc, void *dst);
-
-#define blake_16x32_small_context      blake256_16way_context
-#define blake_16x32_init               blake256_16way_init
-#define blake_16x32_update             blake256_16way_update
-#define blake_16x32_close              blake256_16way_close
-#define blake_16x32_update_le          blake256_16way_update_le
-#define blake_16x32_close_le           blake256_16way_close_le
-#define blake_16x32_round0_prehash_le  blake256_16way_round0_prehash
-#define blake_16x32_final_rounds_le    blake256_16way_final_rounds_le
-#define blake256r14_16x32_context      blake256r14_16way_context
-#define blake256r14_16x32_init         blake256r14_16way_init
-#define blake256r14_16x32_update       blake256r14_16way_update
-#define blake256r14_16x32_close        blake256r14_16way_close
-#define blake256r8_16x32_context       blake256r8_16way_context
-#define blake256r8_16x32_init          blake256r8_16way_init
-#define blake256r8_16x32_update        blake256r8_16way_update
-#define blake256r8_16x32_close         blake256r8_16way_close
+typedef blake_16way_small_context blake256r8_16x32_context;
+void blake256r8_16x32_init(void *cc);
+void blake256r8_16x32_update(void *cc, const void *data, size_t len);
+void blake256r8_16x32_close(void *cc, void *dst);

 #endif  // AVX512
 #endif  // AVX2
--- a/algo/blake/blake2b-hash.h
+++ b/algo/blake/blake2b-hash.h
@@ -14,7 +14,6 @@
 #define ALIGN(x) __attribute__((aligned(x)))
 #endif

-
 #if defined(SIMD512)

 typedef struct ALIGN( 64 ) {
@@ -30,11 +29,6 @@ void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input,
                          size_t inlen );
 void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out );

-#define blake2b_8way_ctx         blake2b_8x64_ctx
-#define blake2b_8way_init        blake2b_8x64_init
-#define blake2b_8way_update      blake2b_8x64_update
-#define blake2b_8way_final       blake2b_8x64_final
-
 #endif

 #if defined(__AVX2__)
@@ -53,11 +47,6 @@ void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input,
                          size_t inlen );
 void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out );

-#define blake2b_4way_ctx         blake2b_4x64_ctx
-#define blake2b_4way_init        blake2b_4x64_init
-#define blake2b_4way_update      blake2b_4x64_update
-#define blake2b_4way_final       blake2b_4x64_final
-
 #endif

 #endif
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -17,7 +17,7 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
   uint32_t hash[8*8] __attribute__ ((aligned (128)));;
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));;
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   blake2b_8way_ctx ctx __attribute__ ((aligned (64)));
+   blake2b_8x64_ctx ctx __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[49]);   // 3*16+1
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -35,9 +35,9 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
                _mm512_set_epi32( n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                                  n+3, 0, n+2, 0, n+1, 0, n  , 0 ) ), *noncev );

-      blake2b_8way_init( &ctx );
-      blake2b_8way_update( &ctx, vdata, 80 );
-      blake2b_8way_final( &ctx, hash );
+      blake2b_8x64_init( &ctx );
+      blake2b_8x64_update( &ctx, vdata, 80 );
+      blake2b_8x64_final( &ctx, hash );

      for ( int lane = 0; lane < 8; lane++ )
      if ( hash7[ lane<<1 ] <= Htarg )
@@ -61,10 +61,10 @@ int scanhash_blake2b_8way( struct work *work, uint32_t max_nonce,
 // Function not used, code inlined.
 void blake2b_4way_hash(void *output, const void *input)
 {
-    blake2b_4way_ctx ctx;
-    blake2b_4way_init( &ctx );
-    blake2b_4way_update( &ctx, input, 80 );
-    blake2b_4way_final( &ctx, output );
+    blake2b_4x64_ctx ctx;
+    blake2b_4x64_init( &ctx );
+    blake2b_4x64_update( &ctx, input, 80 );
+    blake2b_4x64_final( &ctx, output );
 }

 int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
@@ -73,7 +73,7 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
 	uint32_t hash[8*4] __attribute__ ((aligned (64)));;
   uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
+   blake2b_4x64_ctx ctx __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[25]);   // 3*8+1
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
@@ -90,9 +90,9 @@ int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );

-      blake2b_4way_init( &ctx ); 
-      blake2b_4way_update( &ctx, vdata, 80 );
-      blake2b_4way_final( &ctx, hash );
+      blake2b_4x64_init( &ctx ); 
+      blake2b_4x64_update( &ctx, vdata, 80 );
+      blake2b_4x64_final( &ctx, hash );

      for ( int lane = 0; lane < 4; lane++ )
      if ( hash7[ lane<<1 ] <= Htarg )
--- a/algo/blake/blake2s-hash.h
+++ b/algo/blake/blake2s-hash.h
@@ -61,6 +61,11 @@ int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
 int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
                              const void *input, uint64_t inlen );

+#define blake2s_4x32_state       blake2s_4way_state
+#define blake2s_4x32_init        blake2s_4way_init
+#define blake2s_4x32_update      blake2s_4way_update
+#define blake2s_4x32_final       blake2s_4way_final
+#define blake2s_4x32_full_blocks blake2s_4way_full_blocks

 #if defined(__AVX2__)

@@ -81,6 +86,12 @@ int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
 int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,
                              const void *input, uint64_t inlen );

+#define blake2s_8x32_state       blake2s_8way_state
+#define blake2s_8x32_init        blake2s_8way_init
+#define blake2s_8x32_update      blake2s_8way_update
+#define blake2s_8x32_final       blake2s_8way_final
+#define blake2s_8x32_full_blocks blake2s_8way_full_blocks
+
 #endif

 #if defined(SIMD512)
@@ -100,6 +111,11 @@ int blake2s_16way_update( blake2s_16way_state *S, const void *in,
                         uint64_t inlen );
 int blake2s_16way_final( blake2s_16way_state *S, void *out, uint8_t outlen );

+#define blake2s_16x32_state       blake2s_16way_state
+#define blake2s_16x32_init        blake2s_16way_init
+#define blake2s_16x32_update      blake2s_16way_update
+#define blake2s_16x32_final       blake2s_16way_final
+
 #endif

 #if 0
--- a/algo/blake/blake512-hash.c
+++ b/algo/blake/blake512-hash.c
@@ -617,24 +617,22 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
  VD = v512_64( CB5 ^ T0 ); \
  VE = v512_64( CB6 ^ T1 ); \
  VF = v512_64( CB7 ^ T1 ); \
-  const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( \
-                                   0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-  M0 = _mm512_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
-  M1 = _mm512_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
-  M2 = _mm512_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
-  M3 = _mm512_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
-  M4 = _mm512_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
-  M5 = _mm512_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
-  M6 = _mm512_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
-  M7 = _mm512_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
-  M8 = _mm512_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
-  M9 = _mm512_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
-  MA = _mm512_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
-  MB = _mm512_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
-  MC = _mm512_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
-  MD = _mm512_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
-  ME = _mm512_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
-  MF = _mm512_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+  M0 = mm512_bswap_64( *(buf+ 0) ); \
+  M1 = mm512_bswap_64( *(buf+ 1) ); \
+  M2 = mm512_bswap_64( *(buf+ 2) ); \
+  M3 = mm512_bswap_64( *(buf+ 3) ); \
+  M4 = mm512_bswap_64( *(buf+ 4) ); \
+  M5 = mm512_bswap_64( *(buf+ 5) ); \
+  M6 = mm512_bswap_64( *(buf+ 6) ); \
+  M7 = mm512_bswap_64( *(buf+ 7) ); \
+  M8 = mm512_bswap_64( *(buf+ 8) ); \
+  M9 = mm512_bswap_64( *(buf+ 9) ); \
+  MA = mm512_bswap_64( *(buf+10) ); \
+  MB = mm512_bswap_64( *(buf+11) ); \
+  MC = mm512_bswap_64( *(buf+12) ); \
+  MD = mm512_bswap_64( *(buf+13) ); \
+  ME = mm512_bswap_64( *(buf+14) ); \
+  MF = mm512_bswap_64( *(buf+15) ); \
  ROUND_B_8WAY(0); \
  ROUND_B_8WAY(1); \
  ROUND_B_8WAY(2); \
@@ -661,7 +659,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,
  H7 = mm512_xor3( VF, V7, H7 ); \
 }

-void blake512_8way_compress( blake_8way_big_context *sc )
+void blake512_8x64_compress( blake_8x64_big_context *sc )
 { 
  __m512i M0, M1, M2, M3, M4, M5, M6, M7;
  __m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -685,25 +683,22 @@ void blake512_8way_compress( blake_8way_big_context *sc )
  VE = v512_64( CB6 ^ sc->T1 );
  VF = v512_64( CB7 ^ sc->T1 );

-  const __m512i shuf_bswap64 = mm512_bcast_m128( v128_set64( 
-                                   0x08090a0b0c0d0e0f, 0x0001020304050607 ) );
-
-  M0 = _mm512_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
-  M1 = _mm512_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
-  M2 = _mm512_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
-  M3 = _mm512_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
-  M4 = _mm512_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
-  M5 = _mm512_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
-  M6 = _mm512_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
-  M7 = _mm512_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
-  M8 = _mm512_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
-  M9 = _mm512_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
-  MA = _mm512_shuffle_epi8( sc->buf[10], shuf_bswap64 );
-  MB = _mm512_shuffle_epi8( sc->buf[11], shuf_bswap64 );
-  MC = _mm512_shuffle_epi8( sc->buf[12], shuf_bswap64 );
-  MD = _mm512_shuffle_epi8( sc->buf[13], shuf_bswap64 );
-  ME = _mm512_shuffle_epi8( sc->buf[14], shuf_bswap64 );
-  MF = _mm512_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+  M0 = mm512_bswap_64( sc->buf[ 0] );
+  M1 = mm512_bswap_64( sc->buf[ 1] );
+  M2 = mm512_bswap_64( sc->buf[ 2] );
+  M3 = mm512_bswap_64( sc->buf[ 3] );
+  M4 = mm512_bswap_64( sc->buf[ 4] );
+  M5 = mm512_bswap_64( sc->buf[ 5] );
+  M6 = mm512_bswap_64( sc->buf[ 6] );
+  M7 = mm512_bswap_64( sc->buf[ 7] );
+  M8 = mm512_bswap_64( sc->buf[ 8] );
+  M9 = mm512_bswap_64( sc->buf[ 9] );
+  MA = mm512_bswap_64( sc->buf[10] );
+  MB = mm512_bswap_64( sc->buf[11] );
+  MC = mm512_bswap_64( sc->buf[12] );
+  MD = mm512_bswap_64( sc->buf[13] );
+  ME = mm512_bswap_64( sc->buf[14] );
+  MF = mm512_bswap_64( sc->buf[15] );

  ROUND_B_8WAY(0);
  ROUND_B_8WAY(1);
@@ -733,7 +728,7 @@ void blake512_8way_compress( blake_8way_big_context *sc )
 }

 // won't be used after prehash implemented
-void blake512_8way_compress_le( blake_8x64_big_context *sc )
+void blake512_8x64_compress_le( blake_8x64_big_context *sc )
 {
  __m512i M0, M1, M2, M3, M4, M5, M6, M7;
  __m512i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1177,7 +1172,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
   {
      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 = sc->T1 + 1;
-      blake512_8way_compress( sc );
+      blake512_8x64_compress( sc );
      sc->ptr = 0;
   }

@@ -1213,7 +1208,7 @@ void blake512_8x64_full( blake_8x64_big_context *sc, void * dst,
   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;

-   blake512_8way_compress( sc );
+   blake512_8x64_compress( sc );
   
   mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }
@@ -1244,7 +1239,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
   {
      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
            sc->T1 = sc->T1 + 1;
-      blake512_8way_compress_le( sc );
+      blake512_8x64_compress_le( sc );
      sc->ptr = 0;
   }

@@ -1280,7 +1275,7 @@ void blake512_8x64_full_le( blake_8x64_big_context *sc, void * dst,
   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;

-   blake512_8way_compress_le( sc );
+   blake512_8x64_compress_le( sc );

   mm512_block_bswap_64( (__m512i*)dst, sc->H );
 }
@@ -1355,24 +1350,22 @@ blake512_8x64_close(void *cc, void *dst)
  VD = v256_64( CB5 ^ T0 ); \
  VE = v256_64( CB6 ^ T1 ); \
  VF = v256_64( CB7 ^ T1 ); \
-  const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64( \
-                             0x08090a0b0c0d0e0f, 0x0001020304050607 ) ); \
-  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
-  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
-  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
-  M3 = _mm256_shuffle_epi8( *(buf+ 3), shuf_bswap64 ); \
-  M4 = _mm256_shuffle_epi8( *(buf+ 4), shuf_bswap64 ); \
-  M5 = _mm256_shuffle_epi8( *(buf+ 5), shuf_bswap64 ); \
-  M6 = _mm256_shuffle_epi8( *(buf+ 6), shuf_bswap64 ); \
-  M7 = _mm256_shuffle_epi8( *(buf+ 7), shuf_bswap64 ); \
-  M8 = _mm256_shuffle_epi8( *(buf+ 8), shuf_bswap64 ); \
-  M9 = _mm256_shuffle_epi8( *(buf+ 9), shuf_bswap64 ); \
-  MA = _mm256_shuffle_epi8( *(buf+10), shuf_bswap64 ); \
-  MB = _mm256_shuffle_epi8( *(buf+11), shuf_bswap64 ); \
-  MC = _mm256_shuffle_epi8( *(buf+12), shuf_bswap64 ); \
-  MD = _mm256_shuffle_epi8( *(buf+13), shuf_bswap64 ); \
-  ME = _mm256_shuffle_epi8( *(buf+14), shuf_bswap64 ); \
-  MF = _mm256_shuffle_epi8( *(buf+15), shuf_bswap64 ); \
+  M0 = mm256_bswap_64( *(buf+ 0) ); \
+  M1 = mm256_bswap_64( *(buf+ 1) ); \
+  M2 = mm256_bswap_64( *(buf+ 2) ); \
+  M3 = mm256_bswap_64( *(buf+ 3) ); \
+  M4 = mm256_bswap_64( *(buf+ 4) ); \
+  M5 = mm256_bswap_64( *(buf+ 5) ); \
+  M6 = mm256_bswap_64( *(buf+ 6) ); \
+  M7 = mm256_bswap_64( *(buf+ 7) ); \
+  M8 = mm256_bswap_64( *(buf+ 8) ); \
+  M9 = mm256_bswap_64( *(buf+ 9) ); \
+  MA = mm256_bswap_64( *(buf+10) ); \
+  MB = mm256_bswap_64( *(buf+11) ); \
+  MC = mm256_bswap_64( *(buf+12) ); \
+  MD = mm256_bswap_64( *(buf+13) ); \
+  ME = mm256_bswap_64( *(buf+14) ); \
+  MF = mm256_bswap_64( *(buf+15) ); \
  ROUND_B_4WAY(0); \
  ROUND_B_4WAY(1); \
  ROUND_B_4WAY(2); \
@@ -1400,7 +1393,7 @@ blake512_8x64_close(void *cc, void *dst)
 }


-void blake512_4way_compress( blake_4x64_big_context *sc )
+void blake512_4x64_compress( blake_4x64_big_context *sc )
 {
  __m256i M0, M1, M2, M3, M4, M5, M6, M7;
  __m256i M8, M9, MA, MB, MC, MD, ME, MF;
@@ -1423,25 +1416,23 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
  VD = v256_64( CB5 ^ sc->T0 );
  VE = v256_64( CB6 ^ sc->T1 );
  VF = v256_64( CB7 ^ sc->T1 );
-  const __m256i shuf_bswap64 = mm256_bcast_m128( v128_set64(
-                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ) );

-  M0 = _mm256_shuffle_epi8( sc->buf[ 0], shuf_bswap64 );
-  M1 = _mm256_shuffle_epi8( sc->buf[ 1], shuf_bswap64 );
-  M2 = _mm256_shuffle_epi8( sc->buf[ 2], shuf_bswap64 );
-  M3 = _mm256_shuffle_epi8( sc->buf[ 3], shuf_bswap64 );
-  M4 = _mm256_shuffle_epi8( sc->buf[ 4], shuf_bswap64 );
-  M5 = _mm256_shuffle_epi8( sc->buf[ 5], shuf_bswap64 );
-  M6 = _mm256_shuffle_epi8( sc->buf[ 6], shuf_bswap64 );
-  M7 = _mm256_shuffle_epi8( sc->buf[ 7], shuf_bswap64 );
-  M8 = _mm256_shuffle_epi8( sc->buf[ 8], shuf_bswap64 );
-  M9 = _mm256_shuffle_epi8( sc->buf[ 9], shuf_bswap64 );
-  MA = _mm256_shuffle_epi8( sc->buf[10], shuf_bswap64 );
-  MB = _mm256_shuffle_epi8( sc->buf[11], shuf_bswap64 );
-  MC = _mm256_shuffle_epi8( sc->buf[12], shuf_bswap64 );
-  MD = _mm256_shuffle_epi8( sc->buf[13], shuf_bswap64 );
-  ME = _mm256_shuffle_epi8( sc->buf[14], shuf_bswap64 );
-  MF = _mm256_shuffle_epi8( sc->buf[15], shuf_bswap64 );
+  M0 = mm256_bswap_64( sc->buf[ 0] );
+  M1 = mm256_bswap_64( sc->buf[ 1] );
+  M2 = mm256_bswap_64( sc->buf[ 2] );
+  M3 = mm256_bswap_64( sc->buf[ 3] );
+  M4 = mm256_bswap_64( sc->buf[ 4] );
+  M5 = mm256_bswap_64( sc->buf[ 5] );
+  M6 = mm256_bswap_64( sc->buf[ 6] );
+  M7 = mm256_bswap_64( sc->buf[ 7] );
+  M8 = mm256_bswap_64( sc->buf[ 8] );
+  M9 = mm256_bswap_64( sc->buf[ 9] );
+  MA = mm256_bswap_64( sc->buf[10] );
+  MB = mm256_bswap_64( sc->buf[11] );
+  MC = mm256_bswap_64( sc->buf[12] );
+  MD = mm256_bswap_64( sc->buf[13] );
+  ME = mm256_bswap_64( sc->buf[14] );
+  MF = mm256_bswap_64( sc->buf[15] );

  ROUND_B_4WAY(0);
  ROUND_B_4WAY(1);
@@ -1470,7 +1461,7 @@ void blake512_4way_compress( blake_4x64_big_context *sc )
  sc->H[7] = mm256_xor3( VF, V7, sc->H[7] );
 }

-void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
+void blake512_4x64_prehash_le( blake512_4x64_context *sc, __m256i *midstate,
                               const void *data )
 {
   __m256i V0, V1, V2, V3, V4, V5, V6, V7;
@@ -1562,7 +1553,7 @@ void blake512_4x64_prehash_le( blake_4x64_big_context *sc, __m256i *midstate,
   midstate[15] = VF;
 }

-void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
+void blake512_4x64_final_le( blake512_4x64_context *sc, void *hash,
                             const __m256i nonce, const __m256i *midstate )
 {
   __m256i M0, M1, M2, M3, M4, M5, M6, M7;
@@ -1685,7 +1676,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
 }


-void blake512_4x64_init( blake_4x64_big_context *sc )
+void blake512_4x64_init( blake512_4x64_context *sc )
 {
   casti_m256i( sc->H, 0 ) = v256_64( 0x6A09E667F3BCC908 );
   casti_m256i( sc->H, 1 ) = v256_64( 0xBB67AE8584CAA73B );
@@ -1798,7 +1789,7 @@ blake64_4way_close( blake_4x64_big_context *sc, void *dst )
 }

 // init, update & close
-void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
+void blake512_4x64_full( blake512_4x64_context *sc, void * dst,
                         const void *data, size_t len )
 {

@@ -1824,7 +1815,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
   {
      if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
         sc->T1 =  sc->T1 + 1;
-      blake512_4way_compress( sc );
+      blake512_4x64_compress( sc );
      sc->ptr = 0;
   }

@@ -1859,7 +1850,7 @@ void blake512_4x64_full( blake_4x64_big_context *sc, void * dst,
   if ( ( sc->T0 = sc->T0 + 1024 ) < 1024 )
       sc->T1 = sc->T1 + 1;

-   blake512_4way_compress( sc );
+   blake512_4x64_compress( sc );

   mm256_block_bswap_64( (__m256i*)dst, sc->H );
 }
@@ -1934,29 +1925,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
  VE = v128_64( CB6 ^ sc->T1 );
  VF = v128_64( CB7 ^ sc->T1 );

-#if defined(__SSSE3__)
-
-  const v128u64_t shuf_bswap64 = v128_set64(
-                                 0x08090a0b0c0d0e0f, 0x0001020304050607 );
-  M0 = v128_shuffle8( sc->buf[ 0], shuf_bswap64 );
-  M1 = v128_shuffle8( sc->buf[ 1], shuf_bswap64 );
-  M2 = v128_shuffle8( sc->buf[ 2], shuf_bswap64 );
-  M3 = v128_shuffle8( sc->buf[ 3], shuf_bswap64 );
-  M4 = v128_shuffle8( sc->buf[ 4], shuf_bswap64 );
-  M5 = v128_shuffle8( sc->buf[ 5], shuf_bswap64 );
-  M6 = v128_shuffle8( sc->buf[ 6], shuf_bswap64 );
-  M7 = v128_shuffle8( sc->buf[ 7], shuf_bswap64 );
-  M8 = v128_shuffle8( sc->buf[ 8], shuf_bswap64 );
-  M9 = v128_shuffle8( sc->buf[ 9], shuf_bswap64 );
-  MA = v128_shuffle8( sc->buf[10], shuf_bswap64 );
-  MB = v128_shuffle8( sc->buf[11], shuf_bswap64 );
-  MC = v128_shuffle8( sc->buf[12], shuf_bswap64 );
-  MD = v128_shuffle8( sc->buf[13], shuf_bswap64 );
-  ME = v128_shuffle8( sc->buf[14], shuf_bswap64 );
-  MF = v128_shuffle8( sc->buf[15], shuf_bswap64 );
-
-#else  // SSE2 & NEON
-
  M0 = v128_bswap64( sc->buf[ 0] );
  M1 = v128_bswap64( sc->buf[ 1] );
  M2 = v128_bswap64( sc->buf[ 2] );
@@ -1974,8 +1942,6 @@ void blake512_2x64_compress( blake_2x64_big_context *sc )
  ME = v128_bswap64( sc->buf[14] );
  MF = v128_bswap64( sc->buf[15] );
  
-#endif
-
  ROUND_B_2X64(0);
  ROUND_B_2X64(1);
  ROUND_B_2X64(2);
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -54,10 +54,10 @@ int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );

   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_16x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-      blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
+      blake256_16x32_final_rounds_le( hash32, midstate_vars, block0_hash,
                                      block_buf, rounds );
      for ( int lane = 0; lane < 16; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -123,10 +123,10 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
   block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );

   // Partialy prehash second block without touching nonces in block_buf[3].
-   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+   blake256_8x32_round0_prehash_le( midstate_vars, block0_hash, block_buf );

   do {
-      blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
+      blake256_8x32_final_rounds_le( hash32, midstate_vars, block0_hash,
                                     block_buf, rounds );
      for ( int lane = 0; lane < 8; lane++ )
      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
@@ -148,16 +148,16 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
   
 #elif defined (BLAKECOIN_4WAY)

-blake256r8_4way_context blakecoin_4w_ctx;
+blake256r8_4x32_context blakecoin_4w_ctx;

 void blakecoin_4way_hash(void *state, const void *input)
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-     blake256r8_4way_context ctx;
+     blake256r8_4x32_context ctx;

     memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
-     blake256r8_4way_update( &ctx, input + (64<<2), 16 );
-     blake256r8_4way_close( &ctx, vhash );
+     blake256r8_4x32_update( &ctx, input + (64<<2), 16 );
+     blake256r8_4x32_close( &ctx, vhash );

     dintrlv_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }
@@ -178,8 +178,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
      HTarget = 0x7f;

   v128_bswap32_intrlv80_4x32( vdata, pdata );
-   blake256r8_4way_init( &blakecoin_4w_ctx );
-   blake256r8_4way_update( &blakecoin_4w_ctx, vdata, 64 );
+   blake256r8_4x32_init( &blakecoin_4w_ctx );
+   blake256r8_4x32_update( &blakecoin_4w_ctx, vdata, 64 );

   do {
      *noncev = v128_bswap32( _mm_set_epi32( n+3, n+2, n+1, n ) );
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -16,28 +16,27 @@ extern void pentablakehash_4way( void *output, const void *input )
     uint64_t hash2[8] __attribute__ ((aligned (64)));
     uint64_t hash3[8] __attribute__ ((aligned (64)));
     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-     blake512_4way_context ctx;
+     blake512_4x64_context ctx;

+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, input, 80 );
+     blake512_4x64_close( &ctx, vhash );

-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, input, 80 );
-     blake512_4way_close( &ctx, vhash );
+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, vhash, 64 );
+     blake512_4x64_close( &ctx, vhash );

-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, vhash, 64 );
-     blake512_4way_close( &ctx, vhash );
+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, vhash, 64 );
+     blake512_4x64_close( &ctx, vhash );

-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, vhash, 64 );
-     blake512_4way_close( &ctx, vhash );
+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, vhash, 64 );
+     blake512_4x64_close( &ctx, vhash );

-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, vhash, 64 );
-     blake512_4way_close( &ctx, vhash );
-
-     blake512_4way_init( &ctx );
-     blake512_4way_update( &ctx, vhash, 64 );
-     blake512_4way_close( &ctx, vhash );
+     blake512_4x64_init( &ctx );
+     blake512_4x64_update( &ctx, vhash, 64 );
+     blake512_4x64_close( &ctx, vhash );

     memcpy( output,    hash0, 32 );
     memcpy( output+32, hash1, 32 );
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -227,7 +227,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
 	v[14] = S->f[0] ^ blake2s_IV[6];
 	v[15] = S->f[1] ^ blake2s_IV[7];

-#if defined(__SSE2__)
+#if defined(__SSE2__) || defined(__ARM_NEON)

   v128_t *V = (v128_t*)v;

@@ -263,19 +263,6 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
   V[3] = v128_swap64( V[3] ); \
   V[2] = v128_shufll32( V[2] )

-   BLAKE2S_ROUND(0);
-   BLAKE2S_ROUND(1);
-   BLAKE2S_ROUND(2);
-   BLAKE2S_ROUND(3);
-   BLAKE2S_ROUND(4);
-   BLAKE2S_ROUND(5);
-   BLAKE2S_ROUND(6);
-   BLAKE2S_ROUND(7);
-   BLAKE2S_ROUND(8);
-   BLAKE2S_ROUND(9);
-   
-#undef BLAKE2S_ROUND
-
 #else

 #define G(r,i,a,b,c,d) \
@@ -290,7 +277,7 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
 		b = SPH_ROTR32(b ^ c, 7); \
 	} while(0)

-#define ROUND(r)  \
+#define BLAKE2S_ROUND(r)  \
 	do { \
 		G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
 		G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
@@ -302,24 +289,25 @@ int blake2s_compress( blake2s_state *S, const uint8_t block[64] )
 		G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
 	} while(0)

-   ROUND( 0 );
-	ROUND( 1 );
-	ROUND( 2 );
-	ROUND( 3 );
-	ROUND( 4 );
-	ROUND( 5 );
-	ROUND( 6 );
-	ROUND( 7 );
-	ROUND( 8 );
-	ROUND( 9 );
-
 #endif

+   BLAKE2S_ROUND(0);
+   BLAKE2S_ROUND(1);
+   BLAKE2S_ROUND(2);
+   BLAKE2S_ROUND(3);
+   BLAKE2S_ROUND(4);
+   BLAKE2S_ROUND(5);
+   BLAKE2S_ROUND(6);
+   BLAKE2S_ROUND(7);
+   BLAKE2S_ROUND(8);
+   BLAKE2S_ROUND(9);
+   
+
 	for( size_t i = 0; i < 8; ++i )
 		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];

 #undef G
-#undef ROUND
+#undef BLAKE2S_ROUND
 	return 0;
 }