v3.23.1

2025-09-17 23:44:27 +00:00 · 2023-09-13 11:48:52 -04:00
parent 4378d2f841
commit d6b5750362
28 changed files with 1626 additions and 1327 deletions
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -1,60 +1,15 @@
-/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
-/**
- * BLAKE interface. BLAKE is a family of functions which differ by their
- * output size; this implementation defines BLAKE for output sizes 224,
- * 256, 384 and 512 bits. This implementation conforms to the "third
- * round" specification.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- * 
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_blake.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef __BLAKE_HASH_4WAY__
-#define __BLAKE_HASH_4WAY__ 1
-
-#ifdef __cplusplus
-extern "C"{
-#endif
+#ifndef BLAKE_HASH_4WAY__
+#define BLAKE_HASH_4WAY__ 1

 #include <stddef.h>
-#include "algo/sha/sph_types.h"
 #include "simd-utils.h"

-#define SPH_SIZE_blake256   256
-
-#define SPH_SIZE_blake512   512
-
 /////////////////////////
 //
 //  Blake-256 1 way SSE2

 void  blake256_transform_le( uint32_t *H, const uint32_t *buf,
-                             const uint32_t T0, const uint32_t T1 );
+                             const uint32_t T0, const uint32_t T1, int rounds );

 /////////////////////////
 //
@@ -75,13 +30,13 @@ typedef struct {
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_4way_small_context __attribute__ ((aligned (64)));

-// Default, 14 rounds, blake, decred
+// Default, 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
 void blake256_4way_init(void *ctx);
 void blake256_4way_update(void *ctx, const void *data, size_t len);
 void blake256_4way_close(void *ctx, void *dst);

-// 14 rounds, blake, decred
+// 14 rounds
 typedef blake_4way_small_context blake256r14_4way_context;
 void blake256r14_4way_init(void *cc);
 void blake256r14_4way_update(void *cc, const void *data, size_t len);
@@ -103,7 +58,7 @@ typedef struct {
   __m256i buf[16] __attribute__ ((aligned (64)));
   __m256i H[8];
   size_t ptr;
-   sph_u32 T0, T1;
+   uint32_t T0, T1;
   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_8way_small_context;

@@ -117,7 +72,7 @@ void blake256_8way_close_le(void *cc, void *dst);
 void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
                                      void *data );
 void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
-                                     const void *midhash, const void *data );
+                    const void *midhash, const void *data, const int rounds );

 // 14 rounds, blake, decred
 typedef blake_8way_small_context blake256r14_8way_context;
@@ -138,7 +93,7 @@ typedef struct {
   __m256i H[8];
   __m256i S[4];   
   size_t ptr;
-   sph_u64 T0, T1;
+   uint64_t T0, T1;
 } blake_4way_big_context __attribute__ ((aligned (128)));

 typedef blake_4way_big_context blake512_4way_context;
@@ -180,7 +135,7 @@ void blake256_16way_close_le(void *cc, void *dst);
 void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
                                       void *data );
 void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
-                                     const void *midhash, const void *data );
+                     const void *midhash, const void *data, const int rounds );


 // 14 rounds, blake, decred
@@ -204,7 +159,7 @@ typedef struct {
   __m512i H[8];
   __m512i S[4];
   size_t ptr;
-   sph_u64 T0, T1;
+   uint64_t T0, T1;
 } blake_8way_big_context __attribute__ ((aligned (128)));

 typedef blake_8way_big_context blake512_8way_context;
@@ -224,8 +179,4 @@ void blake512_8way_final_le( blake_8way_big_context *sc, void *hash,
 #endif  // AVX512
 #endif  // AVX2

-#ifdef __cplusplus
-}
-#endif
-
 #endif  // BLAKE_HASH_4WAY_H__
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -40,26 +40,6 @@

 #include "blake-hash-4way.h"

-#ifdef __cplusplus
-extern "C"{
-#endif
-
-#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
-#define SPH_SMALL_FOOTPRINT_BLAKE   1
-#endif
-
-#if SPH_SMALL_FOOTPRINT_BLAKE
-#define SPH_COMPACT_BLAKE_32   1
-#endif
-
-#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
-#define SPH_COMPACT_BLAKE_64   1
-#endif
-
-#ifdef _MSC_VER
-#pragma warning (disable: 4146)
-#endif
-
 // Blake-256

 static const uint32_t IV256[8] =
@@ -68,7 +48,7 @@ static const uint32_t IV256[8] =
 	0x510E527F, 0x9B05688C,	0x1F83D9AB, 0x5BE0CD19
 };

-#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
+#if 0

 // Blake-256 4 & 8 way, Blake-512 4 way

@@ -273,44 +253,28 @@ static const unsigned sigma[16][16] = {
 #define CSx_(n)     CSx__(n)
 #define CSx__(n)    CS ## n

-#define CS0   SPH_C32(0x243F6A88)
-#define CS1   SPH_C32(0x85A308D3)
-#define CS2   SPH_C32(0x13198A2E)
-#define CS3   SPH_C32(0x03707344)
-#define CS4   SPH_C32(0xA4093822)
-#define CS5   SPH_C32(0x299F31D0)
-#define CS6   SPH_C32(0x082EFA98)
-#define CS7   SPH_C32(0xEC4E6C89)
-#define CS8   SPH_C32(0x452821E6)
-#define CS9   SPH_C32(0x38D01377)
-#define CSA   SPH_C32(0xBE5466CF)
-#define CSB   SPH_C32(0x34E90C6C)
-#define CSC   SPH_C32(0xC0AC29B7)
-#define CSD   SPH_C32(0xC97C50DD)
-#define CSE   SPH_C32(0x3F84D5B5)
-#define CSF   SPH_C32(0xB5470917)
-
-#if SPH_COMPACT_BLAKE_32
-
-static const sph_u32 CS[16] = {
-	SPH_C32(0x243F6A88), SPH_C32(0x85A308D3),
-	SPH_C32(0x13198A2E), SPH_C32(0x03707344),
-	SPH_C32(0xA4093822), SPH_C32(0x299F31D0),
-	SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89),
-	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
-	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
-	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
-	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917)
-};
-
-#endif
+#define CS0   0x243F6A88
+#define CS1   0x85A308D3
+#define CS2   0x13198A2E
+#define CS3   0x03707344
+#define CS4   0xA4093822
+#define CS5   0x299F31D0
+#define CS6   0x082EFA98
+#define CS7   0xEC4E6C89
+#define CS8   0x452821E6
+#define CS9   0x38D01377
+#define CSA   0xBE5466CF
+#define CSB   0x34E90C6C
+#define CSC   0xC0AC29B7
+#define CSD   0xC97C50DD
+#define CSE   0x3F84D5B5
+#define CSF   0xB5470917

 /////////////////////////////////////////
 //
 // Blake-256 1 way SIMD
 // Only used for prehash, otherwise 4way is used with SSE2.

-// optimize shuffles to reduce latency caused by dependencies on V1.
 #define BLAKE256_ROUND( r ) \
 { \
   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
@@ -353,52 +317,9 @@ static const sph_u32 CS[16] = {
   V2 = mm128_shufll_32( V2 ); \
 }

-/*
-#define BLAKE256_ROUND( r ) \
-{ \
-   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
-                           _mm_set_epi32( CSx( r, 7 ) ^ Mx( r, 6 ), \
-                                          CSx( r, 5 ) ^ Mx( r, 4 ), \
-                                          CSx( r, 3 ) ^ Mx( r, 2 ), \
-                                          CSx( r, 1 ) ^ Mx( r, 0 ) ) ) ); \
-   V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
-   V2 = _mm_add_epi32( V2, V3 ); \
-   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
-   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
-                           _mm_set_epi32( CSx( r, 6 ) ^ Mx( r, 7 ), \
-                                          CSx( r, 4 ) ^ Mx( r, 5 ), \
-                                          CSx( r, 2 ) ^ Mx( r, 3 ), \
-                                          CSx( r, 0 ) ^ Mx( r, 1 ) ) ) ); \
-   V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
-   V2 = _mm_add_epi32( V2, V3 ); \
-   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
-   V3 = mm128_shufll_32( V3 ); \
-   V2 = mm128_swap_64( V2 ); \
-   V1 = mm128_shuflr_32( V1 ); \
-   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
-                           _mm_set_epi32( CSx( r, F ) ^ Mx( r, E ), \
-                                          CSx( r, D ) ^ Mx( r, C ), \
-                                          CSx( r, B ) ^ Mx( r, A ), \
-                                          CSx( r, 9 ) ^ Mx( r, 8 ) ) ) ); \
-   V3 = mm128_swap32_16( _mm_xor_si128( V3, V0 ) ); \
-   V2 = _mm_add_epi32( V2, V3 ); \
-   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 12 ); \
-   V0 = _mm_add_epi32( V0, _mm_add_epi32( V1, \
-                           _mm_set_epi32( CSx( r, E ) ^ Mx( r, F ), \
-                                          CSx( r, C ) ^ Mx( r, D ), \
-                                          CSx( r, A ) ^ Mx( r, B ), \
-                                          CSx( r, 8 ) ^ Mx( r, 9 ) ) ) ); \
-   V3 = mm128_shuflr32_8( _mm_xor_si128( V3, V0 ) ); \
-   V2 = _mm_add_epi32( V2, V3 ); \
-   V1 = mm128_ror_32( _mm_xor_si128( V1, V2 ), 7 ); \
-   V3 = mm128_shuflr_32( V3 ); \
-   V2 = mm128_swap_64( V2 ); \
-   V1 = mm128_shufll_32( V1 ); \
-}
-*/
-
+// Default is 14 rounds, blakecoin & vanilla are 8.
 void blake256_transform_le( uint32_t *H, const uint32_t *buf,
-                            const uint32_t T0, const uint32_t T1 )
+                            const uint32_t T0, const uint32_t T1, int rounds )
 {
   __m128i V0, V1, V2, V3;
   uint32_t M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
@@ -431,12 +352,15 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
   BLAKE256_ROUND( 5 );
   BLAKE256_ROUND( 6 );
   BLAKE256_ROUND( 7 );
-   BLAKE256_ROUND( 8 );
-   BLAKE256_ROUND( 9 );
-   BLAKE256_ROUND( 0 );
-   BLAKE256_ROUND( 1 );
-   BLAKE256_ROUND( 2 );
-   BLAKE256_ROUND( 3 );
+   if ( rounds > 8 )     // 14
+   {
+      BLAKE256_ROUND( 8 );
+      BLAKE256_ROUND( 9 );
+      BLAKE256_ROUND( 0 );
+      BLAKE256_ROUND( 1 );
+      BLAKE256_ROUND( 2 );
+      BLAKE256_ROUND( 3 );
+   }
   casti_m128i( H, 0 ) = mm128_xor3( casti_m128i( H, 0 ), V0, V2 );
   casti_m128i( H, 1 ) = mm128_xor3( casti_m128i( H, 1 ), V1, V3 );
 }
@@ -459,34 +383,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
   b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
 }

-#if SPH_COMPACT_BLAKE_32
-
-// Not used
-#if 0
-
-#define ROUND_S_4WAY(r)   do { \
-	GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
-		CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
-	GS_4WAY(M[sigma[r][0x2]], M[sigma[r][0x3]], \
-		CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \
-	GS_4WAY(M[sigma[r][0x4]], M[sigma[r][0x5]], \
-		CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \
-	GS_4WAY(M[sigma[r][0x6]], M[sigma[r][0x7]], \
-		CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \
-	GS_4WAY(M[sigma[r][0x8]], M[sigma[r][0x9]], \
-		CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \
-	GS_4WAY(M[sigma[r][0xA]], M[sigma[r][0xB]], \
-		CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \
-	GS_4WAY(M[sigma[r][0xC]], M[sigma[r][0xD]], \
-		CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \
-	GS_4WAY(M[sigma[r][0xE]], M[sigma[r][0xF]], \
-		CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \
-} while (0)
-
-#endif
-
-#else
-
 #define ROUND_S_4WAY(r) \
 { \
 	GS_4WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
@@ -499,8 +395,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 	GS_4WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
 }

-#endif
-
 #define DECL_STATE32_4WAY \
 	__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
        uint32_t T0, T1;
@@ -531,56 +425,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
 		(state)->T1 = T1; \
 	} while (0)

-#if SPH_COMPACT_BLAKE_32
-// not used
-#if 0
-#define COMPRESS32_4WAY( rounds )   do { \
-	__m128i M[16]; \
-	__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
-	__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
-	unsigned r; \
-	V0 = H0; \
-	V1 = H1; \
-	V2 = H2; \
-	V3 = H3; \
-	V4 = H4; \
-	V5 = H5; \
-	V6 = H6; \
-	V7 = H7; \
-   V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
-   V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
-   VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
-   VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
-   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
-   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
-   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
-   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
-   mm128_block_bswap_32( M, buf ); \
-   mm128_block_bswap_32( M+8, buf+8 ); \
-	for (r = 0; r < rounds; r ++) \
-		ROUND_S_4WAY(r); \
-        H0 = _mm_xor_si128( _mm_xor_si128( \
-                                   _mm_xor_si128( S0, V0 ), V8 ), H0 ); \
-        H1 = _mm_xor_si128( _mm_xor_si128( \
-                                   _mm_xor_si128( S1, V1 ), V9 ), H1 ); \
-        H2 = _mm_xor_si128( _mm_xor_si128( \
-                                   _mm_xor_si128( S2, V2 ), VA ), H2 ); \
-        H3 = _mm_xor_si128( _mm_xor_si128( \
-                                   _mm_xor_si128( S3, V3 ), VB ), H3 ); \
-        H4 = _mm_xor_si128( _mm_xor_si128( \
-                                   _mm_xor_si128( S0, V4 ), VC ), H4 ); \
-        H5 = _mm_xor_si128( _mm_xor_si128( \
-                                   _mm_xor_si128( S1, V5 ), VD ), H5 ); \
-        H6 = _mm_xor_si128( _mm_xor_si128( \
-                                   _mm_xor_si128( S2, V6 ), VE ), H6 ); \
-        H7 = _mm_xor_si128( _mm_xor_si128( \
-                                   _mm_xor_si128( S3, V7 ), VF ), H7 ); \
-	} while (0)
-#endif
-
-#else
-
-// current impl

 #if defined(__SSSE3__)

@@ -680,8 +524,6 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,
   H7 = _mm_xor_si128( _mm_xor_si128( VF, V7 ), H7 ); \
 }

-#endif
-
 #if defined (__AVX2__)

 /////////////////////////////////
@@ -968,7 +810,7 @@ void blake256_transform_le( uint32_t *H, const uint32_t *buf,

 #define DECL_STATE32_8WAY \
   __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
-   sph_u32 T0, T1;
+   uint32_t T0, T1;

 #define READ_STATE32_8WAY(state) \
 do { \
@@ -1046,7 +888,7 @@ do { \
   ROUND_S_8WAY(5); \
   ROUND_S_8WAY(6); \
   ROUND_S_8WAY(7); \
-   if (rounds == 14) \
+   if (rounds > 8) \
   { \
      ROUND_S_8WAY(8); \
      ROUND_S_8WAY(9); \
@@ -1111,7 +953,7 @@ do { \
   ROUND_S_8WAY(5); \
   ROUND_S_8WAY(6); \
   ROUND_S_8WAY(7); \
-   if (rounds == 14) \
+   if (rounds > 8) \
   { \
      ROUND_S_8WAY(8); \
      ROUND_S_8WAY(9); \
@@ -1156,7 +998,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,

 // M[ 0:3 ] contain new message data including unique nonces in M[ 3].
 // M[ 5:12, 14 ] are always zero and not needed or used.
-// M[ 4], M[ 13], M[15] are constant and are initialized here.
+// M[ 4], M[13], M[15] are constant and are initialized here.
 // M[ 5] is a special case, used as a cache for (M[13] ^ CSC).

   M[ 4] = _mm256_set1_epi32( 0x80000000 );
@@ -1221,7 +1063,7 @@ void blake256_8way_round0_prehash_le( void *midstate, const void *midhash,
 }

 void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
-                                     const void *midhash, const void *data )
+                     const void *midhash, const void *data, const int rounds )
 {
   __m256i *H = (__m256i*)final_hash;
   const __m256i *h = (const __m256i*)midhash;
@@ -1315,12 +1157,15 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,
   ROUND256_8WAY_5;
   ROUND256_8WAY_6;
   ROUND256_8WAY_7;
-   ROUND256_8WAY_8;
-   ROUND256_8WAY_9;
-   ROUND256_8WAY_0;
-   ROUND256_8WAY_1;
-   ROUND256_8WAY_2;
-   ROUND256_8WAY_3;
+   if ( rounds > 8 )
+   {
+      ROUND256_8WAY_8;
+      ROUND256_8WAY_9;
+      ROUND256_8WAY_0;
+      ROUND256_8WAY_1;
+      ROUND256_8WAY_2;
+      ROUND256_8WAY_3;
+   }

   const __m256i shuf_bswap32 =
                  mm256_set2_64( 0x0c0d0e0f08090a0b, 0x0405060700010203 );
@@ -1623,7 +1468,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,

 #define DECL_STATE32_16WAY \
   __m512i H0, H1, H2, H3, H4, H5, H6, H7; \
-   sph_u32 T0, T1;
+   uint32_t T0, T1;

 #define READ_STATE32_16WAY(state) \
 do { \
@@ -1882,8 +1727,9 @@ void blake256_16way_round0_prehash_le( void *midstate, const void *midhash,
                         _mm512_xor_si512( _mm512_set1_epi32( CSE ), M[15] ) );
 }

+// Default is 14 rounds, blakecoin & vanilla are 8.
 void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
-                                     const void *midhash, const void *data )
+                     const void *midhash, const void *data, const int rounds )
 {
   __m512i *H = (__m512i*)final_hash;
   const __m512i *h = (const __m512i*)midhash;
@@ -1988,12 +1834,15 @@ void blake256_16way_final_rounds_le( void *final_hash, const void *midstate,
   ROUND256_16WAY_5;
   ROUND256_16WAY_6;
   ROUND256_16WAY_7;
-   ROUND256_16WAY_8;
-   ROUND256_16WAY_9;
-   ROUND256_16WAY_0;
-   ROUND256_16WAY_1;
-   ROUND256_16WAY_2;
-   ROUND256_16WAY_3;
+   if ( rounds > 8 )
+   {
+      ROUND256_16WAY_8;
+      ROUND256_16WAY_9;
+      ROUND256_16WAY_0;
+      ROUND256_16WAY_1;
+      ROUND256_16WAY_2;
+      ROUND256_16WAY_3;
+   }

   // Byte swap final hash
   const __m512i shuf_bswap32 =
@@ -2057,7 +1906,7 @@ blake32_4way( blake_4way_small_context *ctx, const void *data,
      size_t clen = ( sizeof ctx->buf ) - bptr;

      if ( clen > blen )
-	 clen = blen;
+         clen = blen;
      memcpy( buf + vptr, data, clen );
      bptr += clen;
      data = (const unsigned char *)data + clen;
@@ -2130,11 +1979,11 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,

 // Blake-256 8 way

-static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+static const uint32_t salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

 static void
-blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
-                   const sph_u32 *salt, int rounds )
+blake32_8way_init( blake_8way_small_context *sc, const uint32_t *iv,
+                   const uint32_t *salt, int rounds )
 {
   casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
   casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
@@ -2181,8 +2030,8 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
      len -= clen;
      if ( ptr == buf_size )
      {
-          if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
-                T1 = SPH_T32(T1 + 1);
+          if ( ( T0 = T0 + 512 ) < 512 )
+                T1 = T1 + 1;
          COMPRESS32_8WAY( sc->rounds );
          ptr = 0;
      }
@@ -2198,7 +2047,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
   __m256i buf[16];
   size_t ptr;
   unsigned bit_len;
-   sph_u32 th, tl;
+   uint32_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -2208,13 +2057,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

   if ( ptr == 0 )
   {
-        sc->T0 = SPH_C32(0xFFFFFE00UL);
-        sc->T1 = SPH_C32(0xFFFFFFFFUL);
+        sc->T0 = 0xFFFFFE00UL;
+        sc->T1 = 0xFFFFFFFFUL;
   }
   else if ( sc->T0 == 0 )
   {
-        sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
-        sc->T1 = SPH_T32(sc->T1 - 1);
+        sc->T0 = 0xFFFFFE00UL + bit_len;
+        sc->T1 = sc->T1 - 1;
   }
   else
        sc->T0 -= 512 - bit_len;
@@ -2233,8 +2082,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
   {
       memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
-       sc->T0 = SPH_C32(0xFFFFFE00UL);
-       sc->T1 = SPH_C32(0xFFFFFFFFUL);
+       sc->T0 = 0xFFFFFE00UL;
+       sc->T1 = 0xFFFFFFFFUL;
       memset_zero_256( buf, 56>>2 );
       if ( out_size_w32 == 8 )
           buf[52>>2] = _mm256_set1_epi64x( 0x0100000001000000ULL );
@@ -2277,8 +2126,8 @@ blake32_8way_le( blake_8way_small_context *sc, const void *data, size_t len )
      len -= clen;
      if ( ptr == buf_size )
      {
-          if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
-                T1 = SPH_T32(T1 + 1);
+          if ( ( T0 = T0 + 512 ) < 512 )
+                T1 = T1 + 1;
          COMPRESS32_8WAY_LE( sc->rounds );
          ptr = 0;
      }
@@ -2294,7 +2143,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
   __m256i buf[16];
   size_t ptr;
   unsigned bit_len;
-   sph_u32 th, tl;
+   uint32_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -2304,13 +2153,13 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,

   if ( ptr == 0 )
   {
-        sc->T0 = SPH_C32(0xFFFFFE00UL);
-        sc->T1 = SPH_C32(0xFFFFFFFFUL);
+        sc->T0 = 0xFFFFFE00UL;
+        sc->T1 = 0xFFFFFFFFUL;
   }
   else if ( sc->T0 == 0 )
   {
-        sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
-        sc->T1 = SPH_T32(sc->T1 - 1);
+        sc->T0 = 0xFFFFFE00UL + bit_len;
+        sc->T1 = sc->T1 - 1;
   }
   else
        sc->T0 -= 512 - bit_len;
@@ -2328,8 +2177,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
   {
       memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
       blake32_8way_le( sc, buf + (ptr>>2), 64 - ptr );
-       sc->T0 = SPH_C32(0xFFFFFE00UL);
-       sc->T1 = SPH_C32(0xFFFFFFFFUL);
+       sc->T0 = 0xFFFFFE00UL;
+       sc->T1 = 0xFFFFFFFFUL;
       memset_zero_256( buf, 56>>2 );
       if ( out_size_w32 == 8 )
           buf[52>>2] = m256_one_32;
@@ -2348,8 +2197,8 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
 //Blake-256 16 way AVX512

 static void
-blake32_16way_init( blake_16way_small_context *sc, const sph_u32 *iv,
-                   const sph_u32 *salt, int rounds )
+blake32_16way_init( blake_16way_small_context *sc, const uint32_t *iv,
+                   const uint32_t *salt, int rounds )
 {
   casti_m512i( sc->H, 0 ) = _mm512_set1_epi64( 0x6A09E6676A09E667 );
   casti_m512i( sc->H, 1 ) = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
@@ -2411,7 +2260,7 @@ blake32_16way_close( blake_16way_small_context *sc, unsigned ub, unsigned n,
   __m512i buf[16];
   size_t ptr;
   unsigned bit_len;
-   sph_u32 th, tl;
+   uint32_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -2508,7 +2357,7 @@ blake32_16way_close_le( blake_16way_small_context *sc, unsigned ub, unsigned n,
   __m512i buf[16];
   size_t ptr;
   unsigned bit_len;
-   sph_u32 th, tl;
+   uint32_t th, tl;

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
@@ -2618,8 +2467,6 @@ blake256r8_16way_close(void *cc, void *dst)

 #endif // AVX512

-
-
 // Blake-256 4 way

 // default 14 rounds, backward copatibility
@@ -2754,9 +2601,3 @@ blake256r8_8way_close(void *cc, void *dst)
 }

 #endif
-
-#ifdef __cplusplus
-}
-#endif
-
-//#endif
--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -1,62 +1,22 @@
-/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */
-/*
- * BLAKE implementation.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
 #if defined (__AVX2__)

 #include <stddef.h>
 #include <string.h>
 #include <limits.h>
-
 #include "blake-hash-4way.h"

-#ifdef __cplusplus
-extern "C"{
-#endif
-
-#ifdef _MSC_VER
-#pragma warning (disable: 4146)
-#endif
-
 // Blake-512 common
   
 /*
-static const sph_u64 IV512[8] = {
-	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
-	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
-	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
-	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+static const uint64_t IV512[8] =
+{
+  0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+  0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+  0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+  0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 };

-static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };
+static const uint64_t salt_zero_big[4] = { 0, 0, 0, 0 };

 static const unsigned sigma[16][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
@@ -77,15 +37,15 @@ static const unsigned sigma[16][16] = {
 	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
 };

-static const sph_u64 CB[16] = {
-   SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
-   SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
-   SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C),
-   SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917),
-   SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC),
-   SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96),
-   SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7),
-   SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69)
+static const uint64_t CB[16] = {
+   0x243F6A8885A308D3, 0x13198A2E03707344,
+   0xA4093822299F31D0, 0x082EFA98EC4E6C89,
+   0x452821E638D01377, 0xBE5466CF34E90C6C,
+   0xC0AC29B7C97C50DD, 0x3F84D5B5B5470917,
+   0x9216D5D98979FB1B, 0xD1310BA698DFB5AC,
+   0x2FFD72DBD01ADFB7, 0xB8E1AFED6A267E96,
+   0xBA7C9045F12C7F99, 0x24A19947B3916CF7,
+   0x0801F2E2858EFC16, 0x636920D871574E69

 */

@@ -1486,7 +1446,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
 	   if ( ptr == buf_size )
      {
 		   if ( (T0 = T0 + 1024 ) < 1024 )
-			   T1 = SPH_T64(T1 + 1);
+			   T1 = T1 + 1;
 	   	COMPRESS64_4WAY;
 		   ptr = 0;
 	   }
@@ -1538,8 +1498,8 @@ blake64_4way_close( blake_4way_big_context *sc, void *dst )
       memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
-       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
-       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
+       sc->T0 = 0xFFFFFFFFFFFFFC00ULL;
+       sc->T1 = 0xFFFFFFFFFFFFFFFFULL;
       memset_zero_256( buf, 112>>3 ); 
       buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
       buf[112>>3] = _mm256_set1_epi64x( bswap_64( th ) );
@@ -1629,8 +1589,4 @@ blake512_4way_close(void *cc, void *dst)
   blake64_4way_close( cc, dst );
 }

-#ifdef __cplusplus
-}
-#endif
-
 #endif
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -4,7 +4,149 @@
 #include <stdint.h>
 #include <memory.h>

-#if defined (BLAKECOIN_4WAY)
+#define rounds 8
+
+#if defined (BLAKECOIN_16WAY)
+
+int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash32[8*16] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*16] __attribute__ ((aligned (64)));
+   __m512i block0_hash[8] __attribute__ ((aligned (64)));
+   __m512i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 =  (uint32_t*)&( ((__m512i*)hash32)[7] );
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t targ32_d7 = ptarget[7];
+   uint32_t phash[8] __attribute__ ((aligned (64))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = (const uint32_t) n;
+   const uint32_t last_nonce = max_nonce - 16;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m512i sixteen = _mm512_set1_epi32( 16 );
+
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0, rounds );
+
+   block0_hash[0] = _mm512_set1_epi32( phash[0] );
+   block0_hash[1] = _mm512_set1_epi32( phash[1] );
+   block0_hash[2] = _mm512_set1_epi32( phash[2] );
+   block0_hash[3] = _mm512_set1_epi32( phash[3] );
+   block0_hash[4] = _mm512_set1_epi32( phash[4] );
+   block0_hash[5] = _mm512_set1_epi32( phash[5] );
+   block0_hash[6] = _mm512_set1_epi32( phash[6] );
+   block0_hash[7] = _mm512_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[0] = _mm512_set1_epi32( pdata[16] );
+   block_buf[1] = _mm512_set1_epi32( pdata[17] );
+   block_buf[2] = _mm512_set1_epi32( pdata[18] );
+   block_buf[3] =
+             _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+
+   // Partially prehash second block without touching nonces in block_buf[3].
+   blake256_16way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
+   do {
+      blake256_16way_final_rounds_le( hash32, midstate_vars, block0_hash,
+                                      block_buf, rounds );
+      for ( int lane = 0; lane < 16; lane++ )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      {
+         extr_lane_16x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      block_buf[3] = _mm512_add_epi32( block_buf[3], sixteen );
+      n += 16;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined (BLAKECOIN_8WAY)
+
+int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash32[8*8] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*8] __attribute__ ((aligned (32)));
+   __m256i block0_hash[8] __attribute__ ((aligned (32)));
+   __m256i block_buf[16] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 =  (uint32_t*)&( ((__m256i*)hash32)[7] );
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   const uint32_t targ32_d7 = ptarget[7];
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = (const uint32_t) n;
+   const uint32_t last_nonce = max_nonce - 8;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m256i eight = _mm256_set1_epi32( 8 );
+
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0, rounds );
+
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[2] = _mm256_set1_epi32( pdata[18] );
+   block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
+
+   // Partially prehash second block without touching nonces in block_buf[3].
+   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
+   do {
+      blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash,
+                                     block_buf, rounds );
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      {
+         extr_lane_8x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
+      n += 8;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+   
+#elif defined (BLAKECOIN_4WAY)

 blake256r8_4way_context blakecoin_4w_ctx;

@@ -61,7 +203,8 @@ int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,

 #endif

-#if defined(BLAKECOIN_8WAY)
+#if 0
+//#if defined(BLAKECOIN_8WAY)

 blake256r8_8way_context blakecoin_8w_ctx;

@@ -78,11 +221,84 @@ void blakecoin_8way_hash( void *state, const void *input )
                   state+160, state+192, state+224, vhash, 256 );
 }

+/*
+int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t hash32[8*8] __attribute__ ((aligned (64)));
+   uint32_t midstate_vars[16*8] __attribute__ ((aligned (64)));
+   __m256i block0_hash[8] __attribute__ ((aligned (64)));
+   __m256i block_buf[16] __attribute__ ((aligned (64)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
+   uint32_t phash[8] __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = (uint32_t*)work->target;
+   const uint32_t targ32_d7 = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m256i eight = _mm256_set1_epi32( 8 );
+
+   // Prehash first block
+   blake256_transform_le( phash, pdata, 512, 0, 8 );
+
+   block0_hash[0] = _mm256_set1_epi32( phash[0] );
+   block0_hash[1] = _mm256_set1_epi32( phash[1] );
+   block0_hash[2] = _mm256_set1_epi32( phash[2] );
+   block0_hash[3] = _mm256_set1_epi32( phash[3] );
+   block0_hash[4] = _mm256_set1_epi32( phash[4] );
+   block0_hash[5] = _mm256_set1_epi32( phash[5] );
+   block0_hash[6] = _mm256_set1_epi32( phash[6] );
+   block0_hash[7] = _mm256_set1_epi32( phash[7] );
+
+   // Build vectored second block, interleave last 16 bytes of data using
+   // unique nonces.
+   block_buf[0] = _mm256_set1_epi32( pdata[16] );
+   block_buf[1] = _mm256_set1_epi32( pdata[17] );
+   block_buf[2] = _mm256_set1_epi32( pdata[18] );
+   block_buf[3] = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
+
+   // Partially prehash second block without touching nonces
+   blake256_8way_round0_prehash_le( midstate_vars, block0_hash, block_buf );
+
+   do {
+      blake256_8way_final_rounds_le( hash32, midstate_vars, block0_hash, 
+                                     block_buf );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( hash32_d7[ lane ] <= targ32_d7 )
+      {
+         extr_lane_8x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
+      }
+      block_buf[3] = _mm256_add_epi32( block_buf[3], eight );
+      n += 8;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+*/
+
 int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t hash32[8*8] __attribute__ ((aligned (32)));
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   blake256r8_8way_context ctx __attribute__ ((aligned (32)));
+   uint32_t *hash32_d7 =  (uint32_t*)&( ((__m256i*)hash32)[7] );
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
@@ -101,15 +317,22 @@ int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
                                                  n+3, n+2, n+1, n ) );
      pdata[19] = n;
-      blakecoin_8way_hash( hash, vdata );

-      for ( int i = 0; i < 8; i++ )
-      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget )
-          && !opt_benchmark )
+      memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
+      blake256r8_8way_update( &ctx, (const void*)vdata + (64<<3), 16 );
+      blake256r8_8way_close( &ctx, hash32 );
+
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( hash32_d7[ lane ] <= HTarget )
      {
-          pdata[19] = n+i;
-          submit_solution( work, hash+(i<<3), mythr );
+         extr_lane_8x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !opt_benchmark ) )
+         {
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
+         }
      }
+     
      n += 8;
   } while ( (n < max_nonce) && !work_restart[thr_id].restart );

--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -4,10 +4,10 @@
 // vanilla uses default gen merkle root, otherwise identical to blakecoin
 bool register_vanilla_algo( algo_gate_t* gate )
 {
-#if defined(BLAKECOIN_8WAY)
+#if defined(BLAKECOIN_16WAY)
+  gate->scanhash  = (void*)&scanhash_blakecoin_16way;
+#elif defined(BLAKECOIN_8WAY)
  gate->scanhash  = (void*)&scanhash_blakecoin_8way;
-  gate->hash      = (void*)&blakecoin_8way_hash;
-
 #elif defined(BLAKECOIN_4WAY)
  gate->scanhash  = (void*)&scanhash_blakecoin_4way;
  gate->hash      = (void*)&blakecoin_4way_hash;
@@ -15,14 +15,14 @@ bool register_vanilla_algo( algo_gate_t* gate )
  gate->scanhash = (void*)&scanhash_blakecoin;
  gate->hash     = (void*)&blakecoinhash;
 #endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
  return true;
 }

 bool register_blakecoin_algo( algo_gate_t* gate )
 {
  register_vanilla_algo( gate );
-  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  gate->gen_merkle_root = (void*)&sha256_gen_merkle_root;
  return true;
 }

--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -1,30 +1,36 @@
-#ifndef __BLAKECOIN_GATE_H__
-#define __BLAKECOIN_GATE_H__ 1
+#ifndef BLAKECOIN_GATE_H__
+#define BLAKECOIN_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__SSE4_2__)
+#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+  #define BLAKECOIN_16WAY
+#elif defined(__AVX2__)
+  #define BLAKECOIN_8WAY
+#elif defined(__SSE2__)  // always true
  #define BLAKECOIN_4WAY
 #endif
-#if defined(__AVX2__)
-  #define BLAKECOIN_8WAY
-#endif

-#if defined (BLAKECOIN_8WAY)
-void blakecoin_8way_hash(void *state, const void *input);
+#if defined (BLAKECOIN_16WAY)
+int scanhash_blakecoin_16way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+
+#elif defined (BLAKECOIN_8WAY)
+//void blakecoin_8way_hash(void *state, const void *input);
 int scanhash_blakecoin_8way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-#endif

-#if defined (BLAKECOIN_4WAY)
+#elif defined (BLAKECOIN_4WAY)
 void blakecoin_4way_hash(void *state, const void *input);
 int scanhash_blakecoin_4way( struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done, struct thr_info *mythr );
-#endif
+#else  // never used

 void blakecoinhash( void *state, const void *input );
 int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr );

 #endif
+
+#endif
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,6 +1,6 @@
 #include "blakecoin-gate.h"

-#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
+#if !defined(BLAKECOIN_16WAY) && !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)

 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"
@@ -12,7 +12,6 @@ void blakecoin_close(void *cc, void *dst);
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>
-#include <openssl/sha.h>

 // context management is staged for efficiency.
 // 1. global initial ctx cached on startup
@@ -35,8 +34,8 @@ void blakecoinhash( void *state, const void *input )
 	uint8_t hash[64] __attribute__ ((aligned (32)));
 	uint8_t *ending = (uint8_t*) input + 64;

-        // copy cached midstate
-        memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
+   // copy cached midstate
+   memcpy( &ctx, &blake_mid_ctx, sizeof ctx );
 	blakecoin( &ctx, ending, 16 );
 	blakecoin_close( &ctx, hash );
 	memcpy( state, hash, 32 );
@@ -45,8 +44,8 @@ void blakecoinhash( void *state, const void *input )
 int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	uint32_t HTarget = ptarget[7];
   int thr_id = mythr->id;  // thr_id arg is deprecated
@@ -60,10 +59,10 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
 		HTarget = 0x7f;

 	// we need big endian data...
-        for (int kk=0; kk < 19; kk++) 
-                be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
+   for (int kk=0; kk < 19; kk++) 
+      be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);

-        blake_midstate_init( endiandata );
+   blake_midstate_init( endiandata );

 #ifdef DEBUG_ALGO
 	applog(LOG_DEBUG,"[%d] Target=%08x %08x", thr_id, ptarget[6], ptarget[7]);
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -48,7 +48,7 @@ static void allium_16way_hash( void *state, const void *midstate_vars,
   uint32_t hash15[8] __attribute__ ((aligned (32)));
   allium_16way_ctx_holder ctx __attribute__ ((aligned (64)));

-   blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
+   blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

   dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                  hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -217,7 +217,7 @@ int scanhash_allium_16way( struct work *work, uint32_t max_nonce,
   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

   // Prehash first block.
-   blake256_transform_le( phash, pdata, 512, 0 );
+   blake256_transform_le( phash, pdata, 512, 0, 14 );

   // Interleave hash for second block prehash.
   block0_hash[0] = _mm512_set1_epi32( phash[0] );
@@ -286,7 +286,7 @@ static void allium_8way_hash( void *hash, const void *midstate_vars,
   uint64_t *hash7 = (uint64_t*)hash+28;
   allium_8way_ctx_holder ctx __attribute__ ((aligned (64))); 

-   blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block );
+   blake256_8way_final_rounds_le( vhashA, midstate_vars, midhash, block, 14 );

   dintrlv_8x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhashA, 256 );
@@ -401,7 +401,7 @@ int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
   const __m256i eight = _mm256_set1_epi32( 8 );

   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
+   blake256_transform_le( phash, pdata, 512, 0, 14 );

   block0_hash[0] = _mm256_set1_epi32( phash[0] );
   block0_hash[1] = _mm256_set1_epi32( phash[1] );
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -35,7 +35,7 @@ static void lyra2z_16way_hash( void *state, const void *midstate_vars,
    uint32_t hash14[8] __attribute__ ((aligned (32)));
    uint32_t hash15[8] __attribute__ ((aligned (32)));

-    blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block );
+    blake256_16way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

    dintrlv_16x32( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
              hash8, hash9, hash10, hash11 ,hash12, hash13, hash14, hash15,
@@ -108,7 +108,7 @@ int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
   if ( bench ) ( (uint32_t*)ptarget )[7] = 0x0000ff;

   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
+   blake256_transform_le( phash, pdata, 512, 0, 14 );

   block0_hash[0] = _mm512_set1_epi32( phash[0] );
   block0_hash[1] = _mm512_set1_epi32( phash[1] );
@@ -170,7 +170,7 @@ static void lyra2z_8way_hash( void *state, const void *midstate_vars,
     uint32_t hash7[8] __attribute__ ((aligned (32)));
     uint32_t vhash[8*8] __attribute__ ((aligned (64)));

-     blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block );
+     blake256_8way_final_rounds_le( vhash, midstate_vars, midhash, block, 14 );

     dintrlv_8x32( hash0, hash1, hash2, hash3,
                   hash4, hash5, hash6, hash7, vhash, 256 );
@@ -216,7 +216,7 @@ int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
   const __m256i eight = _mm256_set1_epi32( 8 );

   // Prehash first block
-   blake256_transform_le( phash, pdata, 512, 0 );
+   blake256_transform_le( phash, pdata, 512, 0, 14 );

   block0_hash[0] = _mm256_set1_epi32( phash[0] );
   block0_hash[1] = _mm256_set1_epi32( phash[1] );
--- a/algo/sha/sha-hash-4way.h
+++ b/algo/sha/sha-hash-4way.h
@@ -67,7 +67,7 @@ void sha256_4way_prehash_3rounds( __m128i *state_mid, __m128i *X,
 void sha256_4way_final_rounds( __m128i *state_out, const __m128i *data,
        const __m128i *state_in, const __m128i *state_mid, const __m128i *X );
 int sha256_4way_transform_le_short( __m128i *state_out, const __m128i *data,
-                                     const __m128i *state_in );
+                                   const __m128i *state_in, const uint32_t *target );

 #endif  // SSE2

@@ -95,7 +95,7 @@ void sha256_8way_prehash_3rounds( __m256i *state_mid, __m256i *X,
 void sha256_8way_final_rounds( __m256i *state_out, const __m256i *data,
        const __m256i *state_in, const __m256i *state_mid, const __m256i *X );
 int sha256_8way_transform_le_short( __m256i *state_out, const __m256i *data,
-                                     const __m256i *state_in );
+                             const __m256i *state_in, const uint32_t *target );

 #endif  // AVX2

@@ -123,7 +123,7 @@ void sha256_16way_final_rounds( __m512i *state_out, const __m512i *data,
        const __m512i *state_in, const __m512i *state_mid, const __m512i *X );

 int sha256_16way_transform_le_short( __m512i *state_out, const __m512i *data,
-                                     const __m512i *state_in );
+                            const __m512i *state_in, const uint32_t *target );

 #endif // AVX512

--- a/algo/sha/sha2.c
+++ b/algo/sha/sha2.c
@@ -658,43 +658,14 @@ int scanhash_sha256d_pooler( struct work *work,	uint32_t max_nonce,
 	return 0;
 }

-/*
-int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
-                      uint64_t *hashes_done, struct thr_info *mythr )
-{
-   uint32_t _ALIGN(128) hash[8];
-   uint32_t _ALIGN(64) data[20];
-   uint32_t *pdata = work->data;
-   const uint32_t *ptarget = work->target;
-   uint32_t n = pdata[19] - 1;
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
-   int thr_id = mythr->id;
-
-   memcpy( data, pdata, 80 );
-
-   do {
-      data[19] = ++n;
-      sha256d( (unsigned char*)hash, (const unsigned char*)data, 80 );
-      if ( unlikely( swab32( hash[7] ) <= Htarg ) )
-      {
-         pdata[19] = n;
-         sha256d_80_swap(hash, pdata);
-         if ( fulltest( hash, ptarget ) && !opt_benchmark )
-            submit_solution( work, hash, mythr );
-      }
-   } while ( likely( n < max_nonce && !work_restart[thr_id].restart ) );
-   *hashes_done = n - first_nonce + 1;
-   pdata[19] = n;
-   return 0;
-}
-*/
-
 bool register_sha256d_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
 #if defined(SHA256D_16WAY)
   gate->scanhash = (void*)&scanhash_sha256d_16way;
+#elif defined(SHA256D_SHA)
+   gate->optimizations = SHA_OPT;
+   gate->scanhash = (void*)&scanhash_sha256d_sha;
 //#elif defined(SHA256D_8WAY)
 //   gate->scanhash = (void*)&scanhash_sha256d_8way;
 #else
--- a/algo/sha/sha256-hash-4way.c
+++ b/algo/sha/sha256-hash-4way.c
--- a/algo/sha/sha256-hash.c
+++ b/algo/sha/sha256-hash.c
@@ -50,65 +50,6 @@ void sha256_update( sha256_context *ctx, const void *data, size_t len )
   memcpy( ctx->buf, src, len );
 }

-#if 0
-void sha256_final( sha256_context *ctx, uint32_t *hash )
-{
-   size_t r;
-
-
-   /* Figure out how many bytes we have buffered. */
-   r = ctx->count & 0x3f;
-//   r = ( ctx->count >> 3 ) & 0x3f;
-
-//printf("final: count= %d, r= %d\n", ctx->count, r );
-   
-   /* Pad to 56 mod 64, transforming if we finish a block en route. */
-   if ( r < 56 )
-   {
-      /* Pad to 56 mod 64. */
-      memcpy( &ctx->buf[r], SHA256_PAD, 56 - r );
-   }
-   else
-   {
-      /* Finish the current block and mix. */
-      memcpy( &ctx->buf[r], SHA256_PAD, 64 - r );
-      sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
-
-//      SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-
-      /* The start of the final block is all zeroes. */
-      memset( &ctx->buf[0], 0, 56 );
-   }
-
-   /* Add the terminating bit-count. */
-   ctx->buf[56] = bswap_64( ctx->count << 3 );
-//   ctx->buf[56] = bswap_64( ctx->count );
-//   be64enc( &ctx->buf[56], ctx->count );
-
-   /* Mix in the final block. */
-   sha256_transform_be( ctx->state, (uint32_t*)ctx->buf, ctx->state );
-
-//   SHA256_Transform(ctx->state, ctx->buf, &tmp32[0], &tmp32[64]);
-
-   for ( int i = 0; i < 8; i++ )  hash[i] = bswap_32( ctx->state[i] );
-   
-//   for ( int i = 0; i < 8; i++ )  be32enc( hash + 4*i, ctx->state + i );
-
-/*
-//   be32enc_vect(digest, ctx->state, 4);
-//   be32enc_vect(uint8_t * dst, const uint32_t * src, size_t len)
-   // Encode vector, two words at a time. 
-   do {
-      be32enc(&dst[0], src[0]);
-      be32enc(&dst[4], src[1]);
-      src += 2;
-      dst += 8;
-   } while (--len);
-*/
-
-}
-#endif
-
 void sha256_final( sha256_context *ctx, void *hash )
 {
   int ptr = ctx->count & 0x3f;
--- a/algo/sha/sha256d-4way.c
+++ b/algo/sha/sha256d-4way.c
@@ -3,10 +3,194 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
+#include "sha256-hash.h"
 #include "sha-hash-4way.h"

+static const uint32_t sha256_iv[8] __attribute__ ((aligned (32))) =
+{  // SHA-256 initial hash state: input state for the first data block and for the second hash pass
+   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+   0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+};
+
+#if defined(SHA256D_SHA)
+
+int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
+{  // Double SHA-256 nonce scan of the 80 byte data, two nonces per pass. Always returns 0.
+   uint32_t block0[16]   __attribute__ ((aligned (64)));  // message block for nonce n
+   uint32_t block1[16]   __attribute__ ((aligned (64)));  // message block for nonce n+1
+   uint32_t hash0[8]     __attribute__ ((aligned (32)));  // hash for nonce n
+   uint32_t hash1[8]     __attribute__ ((aligned (32)));  // hash for nonce n+1
+   uint32_t mstate[8]  __attribute__ ((aligned (32)));  // state after the first 64 bytes
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 2;  // each pass consumes two nonces
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m128i shuf_bswap32 =  // shuffle mask that reverses the bytes of each 32 bit word
+           _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );
+
+   // hash first 64 bytes of data once, the nonce (word 19) is not part of this block
+   sha256_opt_transform_le( mstate, pdata, sha256_iv );
+
+   do
+   {
+      // 1. second block: final 16 bytes of data (words 16-19), with padding
+      memcpy( block0, pdata + 16, 16 );
+      memcpy( block1, pdata + 16, 16 );
+      block0[ 3] = n;        // word 3 of this block is data word 19, the nonce
+      block1[ 3] = n+1;
+      block0[ 4] = block1[ 4] = 0x80000000;  // padding marker right after the data
+      memset( block0 + 5, 0, 40 );  // zero words 5..14
+      memset( block1 + 5, 0, 40 );
+      block0[15] = block1[15] = 80*8; // bit count of the 80 bytes of data
+      sha256_ni2way_transform_le( hash0, hash1, block0, block1,
+                                  mstate, mstate );
+
+      // 2. second sha256: single padded block holding the 32 byte hash from 1.
+      memcpy( block0, hash0, 32 );
+      memcpy( block1, hash1, 32 );
+      block0[ 8] = block1[ 8] = 0x80000000;  // padding marker right after the hash
+      memset( block0 + 9, 0, 24 );  // zero words 9..14
+      memset( block1 + 9, 0, 24 );
+      block0[15] = block1[15] = 32*8; // bit count of the 32 byte hash
+      sha256_ni2way_transform_le( hash0, hash1, block0, block1,
+                                  sha256_iv, sha256_iv );
+
+      if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )  // cheap screen on word 7 only
+      {  // byte swap the hash in place, then run the full target test
+          casti_m128i( hash0, 0 ) =
+               _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
+          casti_m128i( hash0, 1 ) =
+               _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
+          if ( likely( valid_hash( hash0, ptarget ) && !bench ) )  // never submit in benchmark mode
+          {
+             pdata[19] = n;  // store the winning nonce in the work data before submitting
+             submit_solution( work, hash0, mythr );
+          }
+      }
+
+      if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )  // same screen for nonce n+1
+      {
+         casti_m128i( hash1, 0 ) =
+               _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
+         casti_m128i( hash1, 1 ) =
+               _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
+         if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
+         {
+            pdata[19] = n+1;
+            submit_solution( work, hash1, mythr );
+         }
+      }
+      n += 2;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );  // stop at range end or on restart flag
+
+   pdata[19] = n;  // first nonce that was not tested
+   *hashes_done = n - first_nonce;  // number of nonces tested by this call
+   return 0;
+}
+
+#endif
+
 #if defined(SHA256D_16WAY)

+int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{  // Double SHA-256 nonce scan, 16 nonces per pass in 16x32 bit lanes. Always returns 0.
+   __m512i  hash32[8]    __attribute__ ((aligned (128)));  // final hashes, 16 lanes
+   __m512i  block[16]    __attribute__ ((aligned (64)));   // message block for the 2nd sha256
+   __m512i  buf[16]      __attribute__ ((aligned (64)));   // 2nd message block of the 1st sha256
+   __m512i  mstate1[8]   __attribute__ ((aligned (64)));   // state after first 64 bytes, all lanes
+   __m512i  mstate2[8]   __attribute__ ((aligned (64)));   // output of the 3 round prehash
+   __m512i  istate[8]    __attribute__ ((aligned (64)));   // sha256_iv broadcast to all lanes
+   __m512i  mexp_pre[8]  __attribute__ ((aligned (64)));   // output of the prehash, reused every pass
+   uint32_t phash[8]     __attribute__ ((aligned (32)));   // first block state, later one lane's hash
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);  // the 16 lane values of hash word 7
+   const uint32_t targ32_d7 = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 16;  // each pass consumes 16 nonces
+   const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );  // padding marker word
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const __m512i sixteen = _mm512_set1_epi32( 16 );  // per pass nonce increment
+   const bool bench = opt_benchmark;
+   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(  // reverses bytes of each 32 bit word
+                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );
+
+   // prehash first block directly from pdata, the nonce (word 19) is not part of it
+   sha256_transform_le( phash, pdata, sha256_iv );
+
+   // vectorize block 0 hash for second block
+   mstate1[0] = _mm512_set1_epi32( phash[0] );
+   mstate1[1] = _mm512_set1_epi32( phash[1] );
+   mstate1[2] = _mm512_set1_epi32( phash[2] );
+   mstate1[3] = _mm512_set1_epi32( phash[3] );
+   mstate1[4] = _mm512_set1_epi32( phash[4] );
+   mstate1[5] = _mm512_set1_epi32( phash[5] );
+   mstate1[6] = _mm512_set1_epi32( phash[6] );
+   mstate1[7] = _mm512_set1_epi32( phash[7] );
+
+   // second message block data, with nonce & padding
+   buf[0] = _mm512_set1_epi32( pdata[16] );
+   buf[1] = _mm512_set1_epi32( pdata[17] );
+   buf[2] = _mm512_set1_epi32( pdata[18] );
+   buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,  // lane i holds nonce n+i
+                              n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+   buf[4] = last_byte;
+   memset_zero_512( buf+5, 10 );  // presumably zeroes buf[5..14] - TODO confirm count is in vectors
+   buf[15] = _mm512_set1_epi32( 80*8 ); // bit count of the 80 bytes of data
+
+   // partially pre-expand & prehash second message block, avoiding the nonces
+   sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
+
+   // vectorize IV for 2nd & 3rd sha256
+   istate[0] = _mm512_set1_epi32( sha256_iv[0] );
+   istate[1] = _mm512_set1_epi32( sha256_iv[1] );
+   istate[2] = _mm512_set1_epi32( sha256_iv[2] );
+   istate[3] = _mm512_set1_epi32( sha256_iv[3] );
+   istate[4] = _mm512_set1_epi32( sha256_iv[4] );
+   istate[5] = _mm512_set1_epi32( sha256_iv[5] );
+   istate[6] = _mm512_set1_epi32( sha256_iv[6] );
+   istate[7] = _mm512_set1_epi32( sha256_iv[7] );
+
+   // initialize padding for 2nd sha256, block[0..7] receive the 1st hash every pass
+   block[ 8] = last_byte;
+   memset_zero_512( block + 9, 6 );  // presumably zeroes block[9..14] - TODO confirm
+   block[15] = _mm512_set1_epi32( 32*8 ); // bit count of the 32 byte hash
+
+   do
+   {
+      sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );  // finish 1st sha256 into block
+
+      if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )  // NOTE(review): nonzero presumably flags a possible candidate lane
+      {
+         for ( int lane = 0; lane < 16; lane++ )
+         if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )  // cheap per lane screen on word 7
+         {
+            extr_lane_16x32( phash, hash32, lane, 256 );  // phash is reused for this lane's hash
+            casti_m256i( phash, 0 ) =   // byte swap the extracted hash before the full test
+                _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
+            if ( likely( valid_hash( phash, ptarget ) && !bench ) )  // never submit in benchmark mode
+            {
+               pdata[19] = n + lane;  // nonce of the winning lane
+               submit_solution( work, phash, mythr );
+            }
+         }
+      }
+      buf[3] = _mm512_add_epi32( buf[3], sixteen );  // advance every lane's nonce by 16
+      n += 16;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );  // stop at range end or on restart flag
+   pdata[19] = n;  // first nonce that was not tested
+   *hashes_done = n - first_nonce;  // number of nonces tested by this call
+   return 0;
+}
+
+
+/*
 int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
@@ -67,20 +251,18 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
                                 mexp_pre );

      // 2. 32 byte hash from 1.
-      if ( sha256_16way_transform_le_short( hash32, block, initstate ) )
-      {
-         // byte swap final hash for testing
-         mm512_block_bswap_32( hash32, hash32 );
+      sha256_16way_transform_le( hash32, block, initstate );
+      // byte swap final hash for testing
+      mm512_block_bswap_32( hash32, hash32 );

-         for ( int lane = 0; lane < 16; lane++ )
-         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      for ( int lane = 0; lane < 16; lane++ )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      {
+         extr_lane_16x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
-            extr_lane_16x32( lane_hash, hash32, lane, 256 );
-            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
-            {
-               pdata[19] = n + lane;
-               submit_solution( work, lane_hash, mythr );
-            }
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = _mm512_add_epi32( *noncev, sixteen );
@@ -90,6 +272,7 @@ int scanhash_sha256d_16way( struct work *work, const uint32_t max_nonce,
   *hashes_done = n - first_nonce;
   return 0;
 }
+*/

 #endif

@@ -104,7 +287,7 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
   __m256i  initstate[8] __attribute__ ((aligned (32)));
   __m256i  midstate1[8] __attribute__ ((aligned (32)));
   __m256i  midstate2[8] __attribute__ ((aligned (32)));
-   __m256i  mexp_pre[16] __attribute__ ((aligned (32)));
+   __m256i  mexp_pre[8]  __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
@@ -154,21 +337,18 @@ int scanhash_sha256d_8way( struct work *work, const uint32_t max_nonce,
                                mexp_pre );

      // 2. 32 byte hash from 1.
-      if ( unlikely(
-               sha256_8way_transform_le_short( hash32, block, initstate ) ) )
-      {
-         // byte swap final hash for testing
-         mm256_block_bswap_32( hash32, hash32 );
+      sha256_8way_transform_le( hash32, block, initstate );
+      // byte swap final hash for testing
+      mm256_block_bswap_32( hash32, hash32 );

-         for ( int lane = 0; lane < 8; lane++ )
-         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      for ( int lane = 0; lane < 8; lane++ )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      {
+         extr_lane_8x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
-            extr_lane_8x32( lane_hash, hash32, lane, 256 );
-            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
-            {
-               pdata[19] = n + lane;
-               submit_solution( work, lane_hash, mythr );
-            }
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
         }
       }
       *noncev = _mm256_add_epi32( *noncev, eight );
@@ -191,8 +371,6 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
   __m128i  hash32[8]     __attribute__ ((aligned (32)));
   __m128i  initstate[8]  __attribute__ ((aligned (32)));
   __m128i  midstate1[8]   __attribute__ ((aligned (32)));
-   __m128i  midstate2[8]  __attribute__ ((aligned (32)));
-   __m128i  mexp_pre[16]  __attribute__ ((aligned (32)));
   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
@@ -232,31 +410,25 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,

   // hash first 64 bytes of data
   sha256_4way_transform_le( midstate1, vdata, initstate );
-   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );

   do
   {
      // 1. final 16 bytes of data, with padding
-      sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
-                                mexp_pre );
+      sha256_4way_transform_le( block, vdata+16, initstate );

      // 2. 32 byte hash from 1.
-      if ( unlikely(
-              sha256_4way_transform_le_short( hash32, block, initstate ) ) )
-      {
-         // byte swap final hash for testing
-         mm128_block_bswap_32( hash32, hash32 );
+      sha256_4way_transform_le( hash32, block, initstate );
+      // byte swap final hash for testing
+      mm128_block_bswap_32( hash32, hash32 );

-         for ( int lane = 0; lane < 4; lane++ )
-         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
+      {
+         extr_lane_4x32( lane_hash, hash32, lane, 256 );
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
-            extr_lane_4x32( lane_hash, hash32, lane, 256 );
-            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
-            {
-               pdata[19] = n + lane;
-               submit_solution( work, lane_hash, mythr );
-            }
+            pdata[19] = n + lane;
+            submit_solution( work, lane_hash, mythr );
         }
      }
      *noncev = _mm_add_epi32( *noncev, four );
@@ -268,21 +440,3 @@ int scanhash_sha256d_4way( struct work *work, const uint32_t max_nonce,
 }

 #endif
-
-/*
-bool register_sha256d_algo( algo_gate_t* gate )
-{
-   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
-#if defined(SHA256D_16WAY)
-   gate->scanhash = (void*)&scanhash_sha256d_16way;
-#elif defined(SHA256D_8WAY)
-   gate->scanhash = (void*)&scanhash_sha256d_8way;
-#elif defined(SHA256D_4WAY)
-   gate->scanhash = (void*)&scanhash_sha256d_4way;
-#endif
-   
-//   gate->hash     = (void*)&sha256d;
-   return true;
-};
-*/
-
--- a/algo/sha/sha256d-4way.h
+++ b/algo/sha/sha256d-4way.h
@@ -6,6 +6,8 @@

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SHA256D_16WAY 1
+#elif defined(__SHA__)
+  #define SHA256D_SHA 1
 #elif defined(__AVX2__)
  #define SHA256D_8WAY 1
 #else
@@ -32,15 +34,12 @@ int scanhash_sha256d_4way( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr );
 #endif

+#if defined(SHA256D_SHA)

-/*
-#if defined(__SHA__)
-
-int scanhash_sha256d( struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done, struct thr_info *mythr );
-
-#endif
-*/
+int scanhash_sha256d_sha( struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif

 #endif

--- a/algo/sha/sha256dt.c
+++ b/algo/sha/sha256dt.c
@@ -3,99 +3,201 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
+#include "sha256-hash.h"
 #include "sha-hash-4way.h"

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
  #define SHA256DT_16WAY 1
+#elif defined(__SHA__)
+  #define SHA256DT_SHA 1
 #elif defined(__AVX2__)
  #define SHA256DT_8WAY 1
 #else
  #define SHA256DT_4WAY 1
 #endif

+static const uint32_t sha256dt_iv[8]  __attribute__ ((aligned (32))) =
+   {
+      0xdfa9bf2c, 0xb72074d4, 0x6bb01122, 0xd338e869,
+      0xaa3ff126, 0x475bbf30, 0x8fd52e5b, 0x9f75c9ad
+   };
+
 #if defined(SHA256DT_16WAY)

 int scanhash_sha256dt_16way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m512i  vdata[32]    __attribute__ ((aligned (128)));
+   __m512i  hash32[8]    __attribute__ ((aligned (128)));
   __m512i  block[16]    __attribute__ ((aligned (64)));
-   __m512i  hash32[8]    __attribute__ ((aligned (64)));
-   __m512i  initstate[8] __attribute__ ((aligned (64)));
-   __m512i  midstate1[8] __attribute__ ((aligned (64)));
-   __m512i  midstate2[8] __attribute__ ((aligned (64)));
-   __m512i  mexp_pre[16] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
+   __m512i  buf[16]      __attribute__ ((aligned (64)));
+   __m512i  mstate1[8]   __attribute__ ((aligned (64)));
+   __m512i  mstate2[8]   __attribute__ ((aligned (64)));
+   __m512i  istate[8]    __attribute__ ((aligned (64)));
+   __m512i  mexp_pre[8]  __attribute__ ((aligned (64)));
+   uint32_t phash[8]     __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
-   const uint32_t targ32_d7 = ptarget[7];
+//   uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
+//   const uint32_t targ32_d7 = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 16;
-   uint32_t n = first_nonce;
-   __m512i *noncev = vdata + 19; 
-   const int thr_id = mythr->id;
-   const bool bench = opt_benchmark;
   const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
   const __m512i sixteen = _mm512_set1_epi32( 16 );
+   const bool bench = opt_benchmark;
+   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
+                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

-   for ( int i = 0; i < 19; i++ )
-      vdata[i] = _mm512_set1_epi32( pdata[i] );
+   // prehash first block directly from pdata
+   sha256_transform_le( phash, pdata, sha256dt_iv );

-   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
-                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
+   // vectorize block 0 hash for second block
+   mstate1[0] = _mm512_set1_epi32( phash[0] );
+   mstate1[1] = _mm512_set1_epi32( phash[1] );
+   mstate1[2] = _mm512_set1_epi32( phash[2] );
+   mstate1[3] = _mm512_set1_epi32( phash[3] );
+   mstate1[4] = _mm512_set1_epi32( phash[4] );
+   mstate1[5] = _mm512_set1_epi32( phash[5] );
+   mstate1[6] = _mm512_set1_epi32( phash[6] );
+   mstate1[7] = _mm512_set1_epi32( phash[7] );

-   vdata[16+4] = last_byte;
-   memset_zero_512( vdata+16 + 5, 10 );
-   vdata[16+15] = _mm512_set1_epi32( 0x480 ); 
-   
+   // second message block data, with nonce & padding
+   buf[0] = _mm512_set1_epi32( pdata[16] );
+   buf[1] = _mm512_set1_epi32( pdata[17] );
+   buf[2] = _mm512_set1_epi32( pdata[18] );
+   buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                              n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+   buf[4] = last_byte;
+   memset_zero_512( buf+5, 10 );
+   buf[15] = _mm512_set1_epi32( 0x480 ); // sha256dt length field: 0x480, not the standard 80*8 (0x280) bit count
+
+   // partially pre-expand & prehash second message block, avoiding the nonces
+   sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
+
+   // vectorize IV for 2nd sha256
+   istate[0] = _mm512_set1_epi32( sha256dt_iv[0] );
+   istate[1] = _mm512_set1_epi32( sha256dt_iv[1] );
+   istate[2] = _mm512_set1_epi32( sha256dt_iv[2] );
+   istate[3] = _mm512_set1_epi32( sha256dt_iv[3] );
+   istate[4] = _mm512_set1_epi32( sha256dt_iv[4] );
+   istate[5] = _mm512_set1_epi32( sha256dt_iv[5] );
+   istate[6] = _mm512_set1_epi32( sha256dt_iv[6] );
+   istate[7] = _mm512_set1_epi32( sha256dt_iv[7] );
+
+   // initialize padding for 2nd sha256
   block[ 8] = last_byte;
-   memset_zero_512( block + 9, 6 );
-   block[15] = _mm512_set1_epi32( 0x300 ); 
-   
-   initstate[0] = _mm512_set1_epi64( 0xdfa9bf2cdfa9bf2c );
-   initstate[1] = _mm512_set1_epi64( 0xb72074d4b72074d4 );
-   initstate[2] = _mm512_set1_epi64( 0x6bb011226bb01122 );
-   initstate[3] = _mm512_set1_epi64( 0xd338e869d338e869 );
-   initstate[4] = _mm512_set1_epi64( 0xaa3ff126aa3ff126 );
-   initstate[5] = _mm512_set1_epi64( 0x475bbf30475bbf30 );
-   initstate[6] = _mm512_set1_epi64( 0x8fd52e5b8fd52e5b );
-   initstate[7] = _mm512_set1_epi64( 0x9f75c9ad9f75c9ad );
-
-   sha256_16way_transform_le( midstate1, vdata, initstate );
-   
-   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );
+   memset_zero_512( block+9, 6 );
+   block[15] = _mm512_set1_epi32( 0x300 ); // sha256dt length field: 0x300, not the standard 32*8 (0x100) bit count

   do
   {
-      sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
-                                 mexp_pre );
-      sha256_16way_transform_le( hash32, block, initstate );
-      mm512_block_bswap_32( hash32, hash32 );    
-
-      for ( int lane = 0; lane < 16; lane++ )
-      if ( hash32_d7[ lane ] <= targ32_d7 )
+      // finish second block with nonces
+      sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );
+      if ( unlikely( sha256_16way_transform_le_short(
+                                  hash32, block, istate, ptarget ) ) )
      {
-         extr_lane_16x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         for ( int lane = 0; lane < 16; lane++ )
+//         if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
         {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_16x32( phash, hash32, lane, 256 );
+            casti_m256i( phash, 0 ) =
+                   _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf ); 
+            if ( likely( valid_hash( phash, ptarget ) && !bench ) )
+            {
+              pdata[19] = n + lane;
+              submit_solution( work, phash, mythr );
+            }
         }
      }
-      *noncev = _mm512_add_epi32( *noncev, sixteen );
+      buf[3] = _mm512_add_epi32( buf[3], sixteen );
      n += 16;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
 }
+   
+#elif defined(SHA256DT_SHA)

+int scanhash_sha256dt_sha( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t block0[16]   __attribute__ ((aligned (64)));
+   uint32_t block1[16]   __attribute__ ((aligned (64)));
+   uint32_t hash0[8]     __attribute__ ((aligned (32)));
+   uint32_t hash1[8]     __attribute__ ((aligned (32)));
+   uint32_t mstate[8]  __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 2;
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   const __m128i shuf_bswap32 =
+           _mm_set_epi64x( 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL );

-#endif
+   // hash first 64 bytes of data
+   sha256_opt_transform_le( mstate, pdata, sha256dt_iv );

-#if defined(SHA256DT_8WAY)
+   do
+   {
+      // 1. final 16 bytes of data, with padding
+      memcpy( block0, pdata + 16, 16 );
+      memcpy( block1, pdata + 16, 16 );
+      block0[ 3] = n;
+      block1[ 3] = n+1;
+      block0[ 4] = block1[ 4] = 0x80000000;
+      memset( block0 + 5, 0, 40 );
+      memset( block1 + 5, 0, 40 );
+      block0[15] = block1[15] = 0x480; // sha256dt length field: 0x480, not the standard 80*8 (0x280) bit count
+      sha256_ni2way_transform_le( hash0, hash1, block0, block1,
+                                  mstate, mstate );
+
+      // 2. 32 byte hash from 1.
+      memcpy( block0, hash0, 32 );
+      memcpy( block1, hash1, 32 );
+      block0[ 8] = block1[ 8] = 0x80000000;
+      memset( block0 + 9, 0, 24 );
+      memset( block1 + 9, 0, 24 );
+      block0[15] = block1[15] = 0x300; // sha256dt length field: 0x300, not the standard 32*8 (0x100) bit count
+      sha256_ni2way_transform_le( hash0, hash1, block0, block1,
+                                  sha256dt_iv, sha256dt_iv );
+
+      if ( unlikely( bswap_32( hash0[7] ) <= ptarget[7] ) )
+      {
+          casti_m128i( hash0, 0 ) =
+               _mm_shuffle_epi8( casti_m128i( hash0, 0 ), shuf_bswap32 );
+          casti_m128i( hash0, 1 ) =
+               _mm_shuffle_epi8( casti_m128i( hash0, 1 ), shuf_bswap32 );
+          if ( likely( valid_hash( hash0, ptarget ) && !bench ) )
+          {
+             pdata[19] = n;
+             submit_solution( work, hash0, mythr );
+          }
+      }
+      if ( unlikely( bswap_32( hash1[7] ) <= ptarget[7] ) )
+      {
+         casti_m128i( hash1, 0 ) =
+               _mm_shuffle_epi8( casti_m128i( hash1, 0 ), shuf_bswap32 );
+         casti_m128i( hash1, 1 ) =
+               _mm_shuffle_epi8( casti_m128i( hash1, 1 ), shuf_bswap32 );
+         if ( likely( valid_hash( hash1, ptarget ) && !bench ) )
+         {
+            pdata[19] = n+1;
+            submit_solution( work, hash1, mythr );
+         }
+      }
+      n += 2;
+   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
+
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
+   return 0;
+}
+
+#elif defined(SHA256DT_8WAY)

 int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
@@ -103,15 +205,13 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
   __m256i  vdata[32]    __attribute__ ((aligned (64)));
   __m256i  block[16]    __attribute__ ((aligned (32)));
   __m256i  hash32[8]    __attribute__ ((aligned (32)));
-   __m256i  initstate[8] __attribute__ ((aligned (32)));
-   __m256i  midstate1[8] __attribute__ ((aligned (32)));
-   __m256i  midstate2[8] __attribute__ ((aligned (32)));
-   __m256i  mexp_pre[16] __attribute__ ((aligned (32)));
+   __m256i  istate[8]    __attribute__ ((aligned (32)));
+   __m256i  mstate1[8]   __attribute__ ((aligned (32)));
+   __m256i  mstate2[8]   __attribute__ ((aligned (32)));
+   __m256i  mexp_pre[8]  __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
-   const uint32_t targ32_d7 = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
@@ -120,6 +220,8 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
   const __m256i eight = _mm256_set1_epi32( 8 );
+   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
+                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = _mm256_set1_epi32( pdata[i] );
@@ -135,35 +237,38 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
   block[15] = _mm256_set1_epi32( 0x300 ); 
   
   // initialize state
-   initstate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
-   initstate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
-   initstate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
-   initstate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
-   initstate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
-   initstate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
-   initstate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
-   initstate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );
+   istate[0] = _mm256_set1_epi64x( 0xdfa9bf2cdfa9bf2c );
+   istate[1] = _mm256_set1_epi64x( 0xb72074d4b72074d4 );
+   istate[2] = _mm256_set1_epi64x( 0x6bb011226bb01122 );
+   istate[3] = _mm256_set1_epi64x( 0xd338e869d338e869 );
+   istate[4] = _mm256_set1_epi64x( 0xaa3ff126aa3ff126 );
+   istate[5] = _mm256_set1_epi64x( 0x475bbf30475bbf30 );
+   istate[6] = _mm256_set1_epi64x( 0x8fd52e5b8fd52e5b );
+   istate[7] = _mm256_set1_epi64x( 0x9f75c9ad9f75c9ad );

-   sha256_8way_transform_le( midstate1, vdata, initstate );
+   sha256_8way_transform_le( mstate1, vdata, istate );

   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
+   sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
   
   do
   {
-      sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
+      sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
                                mexp_pre );
-      sha256_8way_transform_le( hash32, block, initstate );
-      mm256_block_bswap_32( hash32, hash32 );

-      for ( int lane = 0; lane < 8; lane++ )
-      if ( hash32_d7[ lane ] <= targ32_d7 )
+      if ( unlikely( sha256_8way_transform_le_short(
+                            hash32, block, istate, ptarget ) ) )
      {
-         extr_lane_8x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+         for ( int lane = 0; lane < 8; lane++ )
         {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_8x32( lane_hash, hash32, lane, 256 );
+            casti_m256i( lane_hash, 0 ) =
+               _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
         }
      }
      *noncev = _mm256_add_epi32( *noncev, eight );
@@ -174,10 +279,7 @@ int scanhash_sha256dt_8way( struct work *work, const uint32_t max_nonce,
   return 0;
 }

-#endif
-
-
-#if defined(SHA256DT_4WAY)
+#elif defined(SHA256DT_4WAY)

 int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
@@ -230,21 +332,25 @@ int scanhash_sha256dt_4way( struct work *work, const uint32_t max_nonce,
   do
   {
      sha256_4way_transform_le( block,  vdata+16, midstate  );
-      sha256_4way_transform_le( hash32, block,    initstate );
-      mm128_block_bswap_32( hash32, hash32 );
+      sha256_4way_transform_le( hash32, block, initstate );

-      for ( int lane = 0; lane < 4; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
-      {
-         extr_lane_4x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+//      if ( sha256_4way_transform_le_short( hash32, block, initstate, ptarget ) )
+//      {
+         mm128_block_bswap_32( hash32, hash32 );
+
+         for ( int lane = 0; lane < 4; lane++ )
+         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
         {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_4x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
         }
-       }
-       *noncev = _mm_add_epi32( *noncev, four );
-       n += 4;
+//      }
+      *noncev = _mm_add_epi32( *noncev, four );
+      n += 4;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
@@ -257,11 +363,14 @@ bool register_sha256dt_algo( algo_gate_t* gate )
 {
    gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
 #if defined(SHA256DT_16WAY)
-    gate->scanhash   = (void*)&scanhash_sha256dt_16way;
+    gate->scanhash = (void*)&scanhash_sha256dt_16way;
+#elif defined(SHA256DT_SHA)
+    gate->optimizations = SHA_OPT;
+    gate->scanhash = (void*)&scanhash_sha256dt_sha;    
 #elif defined(SHA256DT_8WAY)
-    gate->scanhash   = (void*)&scanhash_sha256dt_8way;
+    gate->scanhash = (void*)&scanhash_sha256dt_8way;
 #else
-    gate->scanhash   = (void*)&scanhash_sha256dt_4way;
+    gate->scanhash = (void*)&scanhash_sha256dt_4way;
 #endif
    return true;
 }
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -3,6 +3,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
+#include "sha256-hash.h"
 #include "sha-hash-4way.h"

 #if defined(SHA256T_16WAY)
@@ -10,83 +11,96 @@
 int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
-   __m512i  vdata[32]    __attribute__ ((aligned (128)));
+   __m512i  hash32[8]    __attribute__ ((aligned (128)));
   __m512i  block[16]    __attribute__ ((aligned (64)));
-   __m512i  hash32[8]    __attribute__ ((aligned (64)));
-   __m512i  initstate[8] __attribute__ ((aligned (64)));
-   __m512i  midstate1[8] __attribute__ ((aligned (64)));
-   __m512i  midstate2[8] __attribute__ ((aligned (64)));
-   __m512i  mexp_pre[16] __attribute__ ((aligned (64)));
-   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
-   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
+   __m512i  buf[16]      __attribute__ ((aligned (64)));
+   __m512i  mstate1[8]   __attribute__ ((aligned (64)));
+   __m512i  mstate2[8]   __attribute__ ((aligned (64)));
+   __m512i  istate[8]    __attribute__ ((aligned (64)));
+   __m512i  mexp_pre[8]  __attribute__ ((aligned (64)));
+   uint32_t phash[8]     __attribute__ ((aligned (32)));
+   static const uint32_t IV[8]  __attribute__ ((aligned (32))) =
+   {
+      0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+   };
   uint32_t *pdata = work->data;
-   const uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
+   uint32_t *hash32_d7 = (uint32_t*)&(hash32[7]);
   const uint32_t targ32_d7 = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 16;
-   uint32_t n = first_nonce;
-   __m512i *noncev = vdata + 19; 
-   const int thr_id = mythr->id;
-   const bool bench = opt_benchmark;
   const __m512i last_byte = _mm512_set1_epi32( 0x80000000 );
+   uint32_t n = first_nonce;
+   const int thr_id = mythr->id;
   const __m512i sixteen = _mm512_set1_epi32( 16 );
+   const bool bench = opt_benchmark;
+   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
+                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

-   for ( int i = 0; i < 19; i++ )
-      vdata[i] = _mm512_set1_epi32( pdata[i] );
+   // prehash first block directly from pdata
+   sha256_transform_le( phash, pdata, IV );

-   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+9, n+8,
-                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n+1, n );
+   // vectorize block 0 hash for second block
+   mstate1[0] = _mm512_set1_epi32( phash[0] );
+   mstate1[1] = _mm512_set1_epi32( phash[1] );
+   mstate1[2] = _mm512_set1_epi32( phash[2] );
+   mstate1[3] = _mm512_set1_epi32( phash[3] );
+   mstate1[4] = _mm512_set1_epi32( phash[4] );
+   mstate1[5] = _mm512_set1_epi32( phash[5] );
+   mstate1[6] = _mm512_set1_epi32( phash[6] );
+   mstate1[7] = _mm512_set1_epi32( phash[7] );

-   vdata[16+4] = last_byte;
-   memset_zero_512( vdata+16 + 5, 10 );
-   vdata[16+15] = _mm512_set1_epi32( 80*8 ); // bit count
-   
+   // second message block data, with nonce & padding   
+   buf[0] = _mm512_set1_epi32( pdata[16] );
+   buf[1] = _mm512_set1_epi32( pdata[17] );
+   buf[2] = _mm512_set1_epi32( pdata[18] );
+   buf[3] = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                              n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
+   buf[4] = last_byte;
+   memset_zero_512( buf+5, 10 );
+   buf[15] = _mm512_set1_epi32( 80*8 ); // bit count
+
+   // partially pre-expand & prehash second message block, avoiding the nonces
+   sha256_16way_prehash_3rounds( mstate2, mexp_pre, buf, mstate1 );
+
+   // vectorize IV for 2nd & 3rd sha256
+   istate[0] = _mm512_set1_epi32( IV[0] );
+   istate[1] = _mm512_set1_epi32( IV[1] );
+   istate[2] = _mm512_set1_epi32( IV[2] );
+   istate[3] = _mm512_set1_epi32( IV[3] );
+   istate[4] = _mm512_set1_epi32( IV[4] );
+   istate[5] = _mm512_set1_epi32( IV[5] );
+   istate[6] = _mm512_set1_epi32( IV[6] );
+   istate[7] = _mm512_set1_epi32( IV[7] );
+
+   // initialize padding for 2nd & 3rd sha256
   block[ 8] = last_byte;
   memset_zero_512( block + 9, 6 );
   block[15] = _mm512_set1_epi32( 32*8 ); // bit count
-   
-   initstate[0] = _mm512_set1_epi64( 0x6A09E6676A09E667 );
-   initstate[1] = _mm512_set1_epi64( 0xBB67AE85BB67AE85 );
-   initstate[2] = _mm512_set1_epi64( 0x3C6EF3723C6EF372 );
-   initstate[3] = _mm512_set1_epi64( 0xA54FF53AA54FF53A );
-   initstate[4] = _mm512_set1_epi64( 0x510E527F510E527F );
-   initstate[5] = _mm512_set1_epi64( 0x9B05688C9B05688C );
-   initstate[6] = _mm512_set1_epi64( 0x1F83D9AB1F83D9AB );
-   initstate[7] = _mm512_set1_epi64( 0x5BE0CD195BE0CD19 );
-
-   sha256_16way_transform_le( midstate1, vdata, initstate );
-   
-   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_16way_prehash_3rounds( midstate2, mexp_pre, vdata+16, midstate1 );

   do
   {
-      // 1. final 16 bytes of data, pre-padded
-      sha256_16way_final_rounds( block, vdata+16, midstate1, midstate2,
-                                 mexp_pre );
+      sha256_16way_final_rounds( block, buf, mstate1, mstate2, mexp_pre );

-      // 2. 32 byte hash from 1.
-      sha256_16way_transform_le( block, block, initstate );
+      sha256_16way_transform_le( block, block, istate );

-      // 3. 32 byte hash from 2.
-      if ( unlikely(
-               sha256_16way_transform_le_short( hash32, block, initstate ) ) )
+      if ( sha256_16way_transform_le_short( hash32, block, istate, ptarget ) )
      {
-         // byte swap final hash for testing
-         mm512_block_bswap_32( hash32, hash32 );    
-
         for ( int lane = 0; lane < 16; lane++ )
-         if ( hash32_d7[ lane ] <= targ32_d7 )
+         if ( bswap_32( hash32_d7[ lane ] ) <= targ32_d7 )
         {
-            extr_lane_16x32( lane_hash, hash32, lane, 256 );
-            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            extr_lane_16x32( phash, hash32, lane, 256 );
+            casti_m256i( phash, 0 ) =
+                _mm256_shuffle_epi8( casti_m256i( phash, 0 ), bswap_shuf );
+            if ( likely( valid_hash( phash, ptarget ) && !bench ) )
            {
               pdata[19] = n + lane;
-               submit_solution( work, lane_hash, mythr );
+               submit_solution( work, phash, mythr );
            }
         }
      }
-      *noncev = _mm512_add_epi32( *noncev, sixteen );
+      buf[3] = _mm512_add_epi32( buf[3], sixteen );
      n += 16;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
   pdata[19] = n;
@@ -94,26 +108,23 @@ int scanhash_sha256t_16way( struct work *work, const uint32_t max_nonce,
   return 0;
 }

-
 #endif

 #if defined(SHA256T_8WAY)
-
+   
 int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   __m256i  vdata[32]    __attribute__ ((aligned (64)));
   __m256i  block[16]    __attribute__ ((aligned (32)));
   __m256i  hash32[8]    __attribute__ ((aligned (32)));
-   __m256i  initstate[8] __attribute__ ((aligned (32)));
-   __m256i  midstate1[8] __attribute__ ((aligned (32)));
-   __m256i  midstate2[8] __attribute__ ((aligned (32)));
-   __m256i  mexp_pre[16] __attribute__ ((aligned (32)));
+   __m256i  istate[8]    __attribute__ ((aligned (32)));
+   __m256i  mstate1[8]   __attribute__ ((aligned (32)));
+   __m256i  mstate2[8]   __attribute__ ((aligned (32)));
+   __m256i  mexp_pre[8]  __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
-   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
   const uint32_t *ptarget = work->target;
-   const uint32_t targ32_d7 = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
@@ -122,6 +133,8 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
   const bool bench = opt_benchmark;
   const __m256i last_byte = _mm256_set1_epi32( 0x80000000 );
   const __m256i eight = _mm256_set1_epi32( 8 );
+   const __m256i bswap_shuf = mm256_bcast_m128( _mm_set_epi64x(
+                                0x0c0d0e0f08090a0b, 0x0405060700010203 ) );

   for ( int i = 0; i < 19; i++ )
      vdata[i] = _mm256_set1_epi32( pdata[i] );
@@ -135,42 +148,40 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
   block[ 8] = last_byte;
   memset_zero_256( block + 9, 6 );
   block[15] = _mm256_set1_epi32( 32*8 ); // bit count
-   
-   // initialize state
-   initstate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
-   initstate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
-   initstate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
-   initstate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
-   initstate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
-   initstate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
-   initstate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
-   initstate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );

-   sha256_8way_transform_le( midstate1, vdata, initstate );
+   // initialize state
+   istate[0] = _mm256_set1_epi64x( 0x6A09E6676A09E667 );
+   istate[1] = _mm256_set1_epi64x( 0xBB67AE85BB67AE85 );
+   istate[2] = _mm256_set1_epi64x( 0x3C6EF3723C6EF372 );
+   istate[3] = _mm256_set1_epi64x( 0xA54FF53AA54FF53A );
+   istate[4] = _mm256_set1_epi64x( 0x510E527F510E527F );
+   istate[5] = _mm256_set1_epi64x( 0x9B05688C9B05688C );
+   istate[6] = _mm256_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   istate[7] = _mm256_set1_epi64x( 0x5BE0CD195BE0CD19 );
+
+   sha256_8way_transform_le( mstate1, vdata, istate );

   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_8way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
-   
+   sha256_8way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );
+
   do
   {
      // 1. final 16 bytes of data, with padding
-      sha256_8way_final_rounds( block, vdata+16, midstate1, midstate2,
+      sha256_8way_final_rounds( block, vdata+16, mstate1, mstate2,
                                mexp_pre );

      // 2. 32 byte hash from 1.
-      sha256_8way_transform_le( block, block, initstate );
+      sha256_8way_transform_le( block, block, istate );

      // 3. 32 byte hash from 2.
-      if ( unlikely(
-               sha256_8way_transform_le_short( hash32, block, initstate ) ) )
+      if ( unlikely( sha256_8way_transform_le_short(
+                                    hash32, block, istate, ptarget ) ) )
      {
-         // byte swap final hash for testing
-         mm256_block_bswap_32( hash32, hash32 );
-
         for ( int lane = 0; lane < 8; lane++ )
-         if ( hash32_d7[ lane ] <= targ32_d7 )
         {
            extr_lane_8x32( lane_hash, hash32, lane, 256 );
+            casti_m256i( lane_hash, 0 ) =
+             _mm256_shuffle_epi8( casti_m256i( lane_hash, 0 ), bswap_shuf );
            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
            {
               pdata[19] = n + lane;
@@ -188,109 +199,18 @@ int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,

 #endif

-
 #if defined(SHA256T_4WAY)

-// Optimizations are slower with AVX/SSE2
-// https://github.com/JayDDee/cpuminer-opt/issues/344
-/*
-int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
-                           uint64_t *hashes_done, struct thr_info *mythr )
-{
-   __m128i  vdata[32]     __attribute__ ((aligned (64)));
-   __m128i  block[16]     __attribute__ ((aligned (32)));
-   __m128i  hash32[8]     __attribute__ ((aligned (32)));
-   __m128i  initstate[8]  __attribute__ ((aligned (32)));
-   __m128i  midstate1[8]  __attribute__ ((aligned (32)));
-   __m128i  midstate2[8]  __attribute__ ((aligned (32)));
-   __m128i  mexp_pre[16]  __attribute__ ((aligned (32)));
-   uint32_t lane_hash[8]  __attribute__ ((aligned (32)));
-   uint32_t *hash32_d7 = (uint32_t*)&( hash32[7] );
-   uint32_t *pdata = work->data;
-   const uint32_t *ptarget = work->target;
-   const uint32_t targ32_d7 = ptarget[7];
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 4;
-   uint32_t n = first_nonce;
-   __m128i *noncev = vdata + 19;
-   const int thr_id = mythr->id;
-   const bool bench = opt_benchmark;
-   const __m128i last_byte = _mm_set1_epi32( 0x80000000 );
-   const __m128i four = _mm_set1_epi32( 4 );
-
-   for ( int i = 0; i < 19; i++ )
-       vdata[i] = _mm_set1_epi32( pdata[i] );
-
-   *noncev = _mm_set_epi32( n+ 3, n+ 2, n+1, n );
-
-   vdata[16+4] = last_byte;
-   memset_zero_128( vdata+16 + 5, 10 );
-   vdata[16+15] = _mm_set1_epi32( 80*8 ); // bit count
-
-   block[ 8] = last_byte;
-   memset_zero_128( block + 9, 6 );
-   block[15] = _mm_set1_epi32( 32*8 ); // bit count
-   
-   // initialize state
-   initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
-   initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
-   initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
-   initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
-   initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
-   initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
-   initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
-   initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
-
-   // hash first 64 bytes of data
-   sha256_4way_transform_le( midstate1, vdata, initstate );
-
-   // Do 3 rounds on the first 12 bytes of the next block
-   sha256_4way_prehash_3rounds( midstate2, mexp_pre, vdata + 16, midstate1 );
-
-   do
-   {
-      // 1. final 16 bytes of data, with padding
-      sha256_4way_final_rounds( block, vdata+16, midstate1, midstate2,
-                                mexp_pre );
-
-      // 2. 32 byte hash from 1.
-      sha256_4way_transform_le( block, block, initstate );
-
-      // 3. 32 byte hash from 2.
-      if ( unlikely(
-              sha256_4way_transform_le_short( hash32, block, initstate ) ) )
-      {   
-         // byte swap final hash for testing
-         mm128_block_bswap_32( hash32, hash32 );
-
-         for ( int lane = 0; lane < 4; lane++ )
-         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
-         {
-            extr_lane_4x32( lane_hash, hash32, lane, 256 );
-            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
-            {
-               pdata[19] = n + lane;
-               submit_solution( work, lane_hash, mythr );
-            }
-         }
-      }
-      *noncev = _mm_add_epi32( *noncev, four );
-      n += 4;
-   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
-   pdata[19] = n;
-   *hashes_done = n - first_nonce;
-   return 0;
-}
-*/
-
 int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   __m128i  vdata[32]    __attribute__ ((aligned (64)));
   __m128i  block[16]    __attribute__ ((aligned (32)));
   __m128i  hash32[8]    __attribute__ ((aligned (32)));
-   __m128i  initstate[8] __attribute__ ((aligned (32)));
-   __m128i  midstate[8]  __attribute__ ((aligned (32)));
+   __m128i  istate[8]    __attribute__ ((aligned (32)));
+   __m128i  mstate[8]   __attribute__ ((aligned (32)));
+//   __m128i  mstate2[8]   __attribute__ ((aligned (32)));
+//   __m128i  mexp_pre[8]  __attribute__ ((aligned (32)));
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash32_d7 =  (uint32_t*)&( hash32[7] );
   uint32_t *pdata = work->data;
@@ -319,35 +239,44 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
   block[15] = _mm_set1_epi32( 32*8 ); // bit count
   
   // initialize state
-   initstate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
-   initstate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
-   initstate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
-   initstate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
-   initstate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
-   initstate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
-   initstate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
-   initstate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );
+   istate[0] = _mm_set1_epi64x( 0x6A09E6676A09E667 );
+   istate[1] = _mm_set1_epi64x( 0xBB67AE85BB67AE85 );
+   istate[2] = _mm_set1_epi64x( 0x3C6EF3723C6EF372 );
+   istate[3] = _mm_set1_epi64x( 0xA54FF53AA54FF53A );
+   istate[4] = _mm_set1_epi64x( 0x510E527F510E527F );
+   istate[5] = _mm_set1_epi64x( 0x9B05688C9B05688C );
+   istate[6] = _mm_set1_epi64x( 0x1F83D9AB1F83D9AB );
+   istate[7] = _mm_set1_epi64x( 0x5BE0CD195BE0CD19 );

   // hash first 64 bytes of data
-   sha256_4way_transform_le( midstate, vdata, initstate );
+   sha256_4way_transform_le( mstate, vdata, istate );
+
+//   sha256_4way_prehash_3rounds( mstate2, mexp_pre, vdata + 16, mstate1 );

   do
   {
-      sha256_4way_transform_le( block,  vdata+16, midstate  );
-      sha256_4way_transform_le( block,  block,    initstate );
-      sha256_4way_transform_le( hash32, block,    initstate );
-      mm128_block_bswap_32( hash32, hash32 );
+//      sha256_4way_final_rounds( block, vdata+16, mstate1, mstate2,
+//                                mexp_pre );
+   
+      sha256_4way_transform_le( block,  vdata+16, mstate  );
+      sha256_4way_transform_le( block,  block, istate );
+      sha256_4way_transform_le( hash32, block, istate );

-      for ( int lane = 0; lane < 4; lane++ )
-      if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
-      {
-         extr_lane_4x32( lane_hash, hash32, lane, 256 );
-         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+//      if ( unlikely( sha256_4way_transform_le_short(
+//                                  hash32, block, initstate, ptarget ) ))
+//      {
+         mm128_block_bswap_32( hash32, hash32 );
+         for ( int lane = 0; lane < 4; lane++ )
+         if ( unlikely( hash32_d7[ lane ] <= targ32_d7 ) )
         {
-            pdata[19] = n + lane;
-            submit_solution( work, lane_hash, mythr );
+            extr_lane_4x32( lane_hash, hash32, lane, 256 );
+            if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
+            {
+               pdata[19] = n + lane;
+               submit_solution( work, lane_hash, mythr );
+            }
         }
-       }
+//       }
       *noncev = _mm_add_epi32( *noncev, four );
       n += 4;
   } while ( (n < last_nonce) && !work_restart[thr_id].restart );
@@ -356,6 +285,5 @@ int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
   return 0;
 }

-
 #endif

--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -23,7 +23,7 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
-   const uint32_t last_nonce = max_nonce - 1;
+   const uint32_t last_nonce = max_nonce - 2;
   uint32_t n = first_nonce;
   const int thr_id = mythr->id;
   const bool bench = opt_benchmark;
--- a/algo/sha/sph_sha2.c
+++ b/algo/sha/sph_sha2.c
@@ -39,9 +39,9 @@
 #define SPH_SMALL_FOOTPRINT_SHA2   1
 #endif

-#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
+#define CH(X, Y, Z)    ( ( ( (Y) ^ (Z) ) & (X)) ^ (Z) )
 //#define MAJ(X, Y, Z)   (((Y) & (Z)) | (((Y) | (Z)) & (X)))
-#define MAJ( X, Y, Z )   ( Y  ^ ( ( X_xor_Y = X ^ Y ) & ( Y_xor_Z ) ) )
+#define MAJ( X, Y, Z )   ( (Y) ^ ( ( (X_xor_Y) = (X) ^ (Y) ) & (Y_xor_Z) ) )
 #define ROTR    SPH_ROTR32

 #define BSG2_0(x)      (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))