v3.11.9

2025-09-17 23:44:27 +00:00 · 2020-02-04 01:31:59 -05:00
parent 0681ca996d
commit 1b76cee239
106 changed files with 1695 additions and 4481 deletions
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -1,4 +1,5 @@
 #include "argon2d-gate.h"
+#include "simd-utils.h"
 #include "argon2d/argon2.h"

 static const size_t INPUT_BYTES = 80;  // Lenth of a block header in bytes. Input Length = Salt Length (salt = input)
@@ -36,7 +37,7 @@ void argon2d_crds_hash( void *output, const void *input )
 int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) edata[20];
   uint32_t _ALIGN(64) hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -45,11 +46,11 @@ int scanhash_argon2d_crds( struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   uint32_t nonce = first_nonce;

-   swab32_array( endiandata, pdata, 20 );
+   swab32_array( edata, pdata, 20 );

   do {
-      be32enc(&endiandata[19], nonce);
-      argon2d_crds_hash( hash, endiandata );
+      be32enc(&edata[19], nonce);
+      argon2d_crds_hash( hash, edata );
      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
      {
          pdata[19] = nonce;
@@ -103,31 +104,32 @@ void argon2d_dyn_hash( void *output, const void *input )
 int scanhash_argon2d_dyn( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) edata[20];
   uint32_t _ALIGN(64) hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-   const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
+   const int thr_id = mythr->id;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce;
   uint32_t nonce = first_nonce;
+   const bool bench = opt_benchmark;

-   swab32_array( endiandata, pdata, 20 );
-
+   mm128_bswap32_80( edata, pdata );
   do
   {
-      be32enc(&endiandata[19], nonce);
-      argon2d_dyn_hash( hash, endiandata );
-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) && !opt_benchmark )
+      edata[19] = nonce;
+      argon2d_dyn_hash( hash, edata );
+      if ( unlikely( valid_hash( (uint64_t*)hash, (uint64_t*)ptarget )
+           && !bench ) )
      {
-          pdata[19] = nonce;
+          pdata[19] = bswap_32( nonce );
          submit_solution( work, hash, mythr );
      }
      nonce++;
-  } while (nonce < max_nonce && !work_restart[thr_id].restart);
+  } while ( likely( nonce < last_nonce && !work_restart[thr_id].restart ) );

   pdata[19] = nonce;
-   *hashes_done = pdata[19] - first_nonce + 1;
+   *hashes_done = pdata[19] - first_nonce;   // one nonce++ per hash, so this is the exact count
   return 0;
 }

@@ -146,36 +148,34 @@ int scanhash_argon2d4096( struct work *work, uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(64) vhash[8];
-   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = (const uint32_t)max_nonce;
   uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;  // thr_id arg is deprecated
   uint32_t t_cost = 1; // 1 iteration
   uint32_t m_cost = 4096; // use 4MB
   uint32_t parallelism = 1; // 1 thread, 2 lanes
+   const bool bench = opt_benchmark;

-   for ( int i = 0; i < 19; i++ )
-      be32enc( &endiandata[i], pdata[i] );
+   mm128_bswap32_80( edata, pdata );

   do {
-      be32enc( &endiandata[19], n );
-      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
-                 (char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
-      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) && !opt_benchmark )
+      edata[19] = n;
+      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) edata, 80,
+                 (char*) edata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
+      if ( unlikely( valid_hash( vhash, ptarget ) && !bench ) )
      {
-         pdata[19] = n;
+         be32enc( &pdata[19], n );
         submit_solution( work, vhash, mythr );
      }
      n++;
+   } while ( likely( n < last_nonce && !work_restart[thr_id].restart ) );

-   } while (n < max_nonce && !work_restart[thr_id].restart);
-
-   *hashes_done = n - first_nonce + 1;
+   *hashes_done = n - first_nonce;
   pdata[19] = n;
-
   return 0;
 }

--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -33,6 +33,8 @@

 #include "blake2b-hash-4way.h"

+#if defined(__AVX2__)
+
 static const uint8_t sigma[12][16] =
 {
      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
@@ -203,9 +205,9 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out )
   casti_m512i( out, 3 ) = ctx->h[3];
 }

-#endif
+#endif   // AVX512

-#if defined(__AVX2__)
+// AVX2

 // G Mixing function.

@@ -369,4 +371,4 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
   casti_m256i( out, 3 ) = ctx->h[3];
 }

-#endif
+#endif  // AVX2
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -4,6 +4,9 @@
 */

 #include "blake2b-gate.h"
+
+#if !defined(BLAKE2B_8WAY) && !defined(BLAKE2B_4WAY)
+
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake2b.h"
@@ -58,3 +61,4 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -1,5 +1,7 @@
 #include "blake2s-gate.h"

+#if  !defined(BLAKE2S_16WAY) && !defined(BLAKE2S_8WAY) && !defined(BLAKE2S)
+
 #include <string.h>
 #include <stdint.h>

@@ -70,3 +72,4 @@ int scanhash_blake2s( struct work *work,

 	return 0;
 }
+#endif
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,4 +1,7 @@
 #include "blakecoin-gate.h"
+
+#if !defined(BLAKECOIN_8WAY) && !defined(BLAKECOIN_4WAY)
+
 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"

@@ -93,3 +96,4 @@ int scanhash_blakecoin( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -1,4 +1,7 @@
 #include "decred-gate.h"
+
+#if !defined(DECRED_8WAY) && !defined(DECRED_4WAY)
+
 #include "sph_blake.h"

 #include <string.h>
@@ -275,3 +278,5 @@ bool register_decred_algo( algo_gate_t* gate )
  return true;
 }
 */
+
+#endif
--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -1,4 +1,7 @@
 #include "pentablake-gate.h"
+
+#if !defined(PENTABLAKE_8WAY) && !defined(PENTABLAKE_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -111,3 +114,4 @@ int scanhash_pentablake( struct work *work, uint32_t max_nonce,
 	return 0;
 } 

+#endif
--- a/algo/bmw/bmw512.c
+++ b/algo/bmw/bmw512.c
@@ -1,5 +1,7 @@
 #include "algo-gate-api.h"

+#if !defined(BMW512_8WAY) && !defined(BMW512_4WAY)
+
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
@@ -50,4 +52,4 @@ int scanhash_bmw512( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
+#endif
--- a/algo/bmw/sph_bmw.c
+++ b/algo/bmw/sph_bmw.c
@@ -48,6 +48,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+#if !defined(__AVX2__)
+
 static const sph_u32 IV224[] = {
 	SPH_C32(0x00010203), SPH_C32(0x04050607),
 	SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
@@ -70,6 +72,8 @@ static const sph_u32 IV256[] = {
 	SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
 };

+#endif // !AVX2
+
 #if SPH_64

 static const sph_u64 IV384[] = {
@@ -135,6 +139,8 @@ static const sph_u64 IV512[] = {
 #define M16_30   14, 15,  1,  2,  5,  8,  9
 #define M16_31   15, 16,  2,  3,  6,  9, 10

+#if !defined(__AVX2__)
+
 #define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
                  ^ SPH_ROTL32(x,  4) ^ SPH_ROTL32(x, 19))
 #define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
@@ -189,6 +195,8 @@ static const sph_u64 IV512[] = {
 #define expand2s_(qf, mf, hf, i16, ix, iy) \
 	expand2s_inner LPAR qf, mf, hf, i16, ix, iy)

+#endif // !AVX2
+
 #if SPH_64

 #define sb0(x)    (((x) >> 1) ^ SPH_T64((x) << 3) \
@@ -291,6 +299,8 @@ static const sph_u64 Kb_tab[] = {
 	tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
 	op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))

+#if !defined(__AVX2__)
+
 #define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
 #define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
 #define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
@@ -407,6 +417,8 @@ static const sph_u64 Kb_tab[] = {

 #define Qs(j)   (qt[j])

+#endif  // !AVX2
+
 #if SPH_64

 #define Wb0    MAKE_W(SPH_T64,  5, -,  7, +, 10, +, 13, +, 14)
@@ -557,7 +569,6 @@ static const sph_u64 Kb_tab[] = {
 			+ ((xl >> 2) ^ qf(22) ^ qf(15))); \
 	} while (0)

-#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)

 #if SPH_64

@@ -565,6 +576,10 @@ static const sph_u64 Kb_tab[] = {

 #endif

+#if !defined(__AVX2__)
+
+#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH)
+
 static void
 compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
 {
@@ -711,6 +726,8 @@ bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n,
 		sph_enc32le(out + 4 * u, h1[v]);
 }

+#endif // !AVX2
+
 #if SPH_64

 static void
@@ -840,6 +857,8 @@ bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n,

 #endif

+#if !defined(__AVX2__)
+
 /* see sph_bmw.h */
 void
 sph_bmw224_init(void *cc)
@@ -898,6 +917,8 @@ sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 //	sph_bmw256_init(cc);
 }

+#endif // !AVX2
+
 #if SPH_64

 /* see sph_bmw.h */
--- a/algo/bmw/sph_bmw.h
+++ b/algo/bmw/sph_bmw.h
@@ -77,6 +77,9 @@ extern "C"{
 * computation can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
+
+#if !defined(__AVX2__)
+
 typedef struct {
 #ifndef DOXYGEN_IGNORE
 	unsigned char buf[64];    /* first field, for alignment */
@@ -102,6 +105,8 @@ typedef sph_bmw_small_context sph_bmw224_context;
 */
 typedef sph_bmw_small_context sph_bmw256_context;

+#endif // !AVX2
+
 #if SPH_64

 /**
@@ -137,6 +142,8 @@ typedef sph_bmw_big_context sph_bmw512_context;

 #endif

+#if !defined(__AVX2__)
+
 /**
 * Initialize a BMW-224 context. This process performs no memory allocation.
 *
@@ -227,6 +234,8 @@ void sph_bmw256_close(void *cc, void *dst);
 void sph_bmw256_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

+#endif // !AVX2
+
 #if SPH_64

 /**
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -358,6 +358,9 @@ int scanhash_cryptolight( struct work *work,

 bool register_cryptolight_algo( algo_gate_t* gate )
 {
+  applog(LOG_WARNING,"Cryptonight algorithm and variants are no longer");
+  applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
+  applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
  register_json_rpc2( gate );
  gate->optimizations = SSE2_OPT | AES_OPT;
  gate->scanhash  = (void*)&scanhash_cryptolight;
--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -105,6 +105,9 @@ int scanhash_cryptonight( struct work *work, uint32_t max_nonce,

 bool register_cryptonight_algo( algo_gate_t* gate )
 {
+  applog(LOG_WARNING,"Cryptonight algorithm and variants are no longer");
+  applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
+  applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
  cryptonightV7 = false;
  register_json_rpc2( gate );
  gate->optimizations = SSE2_OPT | AES_OPT;
@@ -116,6 +119,9 @@ bool register_cryptonight_algo( algo_gate_t* gate )

 bool register_cryptonightv7_algo( algo_gate_t* gate )
 {
+  applog(LOG_WARNING,"Cryptonight algorithm and variants are no longer");
+  applog(LOG_WARNING,"supported by cpuminer-opt. Shares submitted will");
+  applog(LOG_WARNING,"likely be rejected. Proceed at your own risk.\n");
  cryptonightV7 = true;
  register_json_rpc2( gate );
  gate->optimizations = SSE2_OPT | AES_OPT;
--- a/algo/echo/sph_echo.c
+++ b/algo/echo/sph_echo.c
@@ -36,6 +36,8 @@

 #include "sph_echo.h"

+#if !defined(__AES__)
+
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -1028,4 +1030,5 @@ sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 }
 #ifdef __cplusplus
 }
-#endif
+#endif 
+#endif  // !AES
--- a/algo/echo/sph_echo.h
+++ b/algo/echo/sph_echo.h
@@ -36,6 +36,8 @@
 #ifndef SPH_ECHO_H__
 #define SPH_ECHO_H__

+#if !defined(__AES__)
+
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -316,5 +318,5 @@ void sph_echo512_addbits_and_close(
 #ifdef __cplusplus
 }
 #endif
-
+#endif // !AES
 #endif
--- a/algo/groestl/aes_ni/groestl-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl-intr-aes.h
@@ -1,3 +1,6 @@
+#if !defined GROESTL_INTR_AES_H__
+#define GROESTL_INTR_AES_H__
+
 /* groestl-intr-aes.h     Aug 2011
 *
 * Groestl implementation with intrinsics using ssse3, sse4.1, and aes
@@ -11,6 +14,52 @@
 #include <wmmintrin.h>
 #include "hash-groestl.h"

+static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
+{
+   { 0x7060504030201000, 0xf0e0d0c0b0a09080 },
+   { 0x7161514131211101, 0xf1e1d1c1b1a19181 },
+   { 0x7262524232221202, 0xf2e2d2c2b2a29282 },
+   { 0x7363534333231303, 0xf3e3d3c3b3a39383 },
+   { 0x7464544434241404, 0xf4e4d4c4b4a49484 },
+   { 0x7565554535251505, 0xf5e5d5c5b5a59585 },
+   { 0x7666564636261606, 0xf6e6d6c6b6a69686 },
+   { 0x7767574737271707, 0xf7e7d7c7b7a79787 },
+   { 0x7868584838281808, 0xf8e8d8c8b8a89888 },
+   { 0x7969594939291909, 0xf9e9d9c9b9a99989 },
+   { 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
+   { 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
+   { 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
+   { 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
+};
+
+static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
+{
+   { 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
+   { 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
+   { 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
+   { 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
+   { 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
+   { 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
+   { 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
+   { 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
+   { 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
+   { 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
+   { 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
+   { 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
+   { 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
+   { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
+};
+
+static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
+static const __m128i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508 };
+static const __m128i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609 };
+static const __m128i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a };
+static const __m128i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b };
+static const __m128i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c };
+static const __m128i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d };
+static const __m128i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e };
+static const __m128i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003 };
+
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -141,42 +190,6 @@
 }/*MixBytes*/


-static const uint64_t round_const_p[] __attribute__ ((aligned (64))) =
-{
-  0x7060504030201000, 0xf0e0d0c0b0a09080,
-  0x7161514131211101, 0xf1e1d1c1b1a19181,
-  0x7262524232221202, 0xf2e2d2c2b2a29282,
-  0x7363534333231303, 0xf3e3d3c3b3a39383,
-  0x7464544434241404, 0xf4e4d4c4b4a49484,
-  0x7565554535251505, 0xf5e5d5c5b5a59585,
-  0x7666564636261606, 0xf6e6d6c6b6a69686,
-  0x7767574737271707, 0xf7e7d7c7b7a79787,
-  0x7868584838281808, 0xf8e8d8c8b8a89888,
-  0x7969594939291909, 0xf9e9d9c9b9a99989,
-  0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a,
-  0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b,
-  0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c,
-  0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d
-};
-
-static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
-{
-  0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f,
-  0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e,
-  0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d,
-  0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c,
-  0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b,
-  0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a,
-  0x8999a9b9c9d9e9f9, 0x0919293949596979,
-  0x8898a8b8c8d8e8f8, 0x0818283848586878,
-  0x8797a7b7c7d7e7f7, 0x0717273747576777,
-  0x8696a6b6c6d6e6f6, 0x0616263646566676,
-  0x8595a5b5c5d5e5f5, 0x0515253545556575,
-  0x8494a4b4c4d4e4f4, 0x0414243444546474,
-  0x8393a3b3c3d3e3f3, 0x0313233343536373,
-  0x8292a2b2c2d2e2f2, 0x0212223242526272
-};
-
 /* one round
 * a0-a7 = input rows
 * b0-b7 = output rows
@@ -203,22 +216,14 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
    xmm8 = _mm_xor_si128( xmm8, \
             casti_m128i( round_const_p, round_counter ) ); \
     /* ShiftBytes P1024 + pre-AESENCLAST */\
-    xmm8  = _mm_shuffle_epi8( xmm8,  m128_const_64( 0x0306090c0f020508, \
-                                                    0x0b0e0104070a0d00 ) ); \
-    xmm9  = _mm_shuffle_epi8( xmm9,  m128_const_64( 0x04070a0d00030609, \
-                                                    0x0c0f0205080b0e01 ) ); \
-    xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x05080b0e0104070a, \
-                                                    0x0d000306090c0f02 ) ); \
-    xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x06090c0f0205080b, \
-                                                    0x0e0104070a0d0003 ) ); \
-    xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x070a0d000306090c, \
-                                                    0x0f0205080b0e0104 ) ); \
-    xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x080b0e0104070a0d, \
-                                                    0x000306090c0f0205 ) ); \
-    xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x090c0f0205080b0e, \
-                                                    0x0104070a0d000306 ) ); \
-    xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x0e0104070a0d0003, \
-                                                    0x06090c0f0205080b ) ); \
+    xmm8  = _mm_shuffle_epi8( xmm8,  SUBSH_MASK0 ); \
+    xmm9  = _mm_shuffle_epi8( xmm9,  SUBSH_MASK1 ); \
+    xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK2 ); \
+    xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK3 ); \
+    xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK4 ); \
+    xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK5 ); \
+    xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK6 ); \
+    xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK7 ); \
    /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7 ); \
@@ -226,22 +231,14 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
    /* AddRoundConstant P1024 */\
    xmm0 = _mm_xor_si128( xmm0, \
             casti_m128i( round_const_p, round_counter+1 ) ); \
-    xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x0306090c0f020508, \
-                                                  0x0b0e0104070a0d00 ) ); \
-    xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x04070a0d00030609, \
-                                                  0x0c0f0205080b0e01 ) ); \
-    xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x05080b0e0104070a, \
-                                                  0x0d000306090c0f02 ) ); \
-    xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x06090c0f0205080b, \
-                                                  0x0e0104070a0d0003 ) ); \
-    xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x070a0d000306090c, \
-                                                  0x0f0205080b0e0104 ) ); \
-    xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x080b0e0104070a0d, \
-                                                  0x000306090c0f0205 ) ); \
-    xmm6 = _mm_shuffle_epi8( xmm6, m128_const_64( 0x090c0f0205080b0e, \
-                                                  0x0104070a0d000306 ) ); \
-    xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x0e0104070a0d0003, \
-                                                  0x06090c0f0205080b ) ); \
+    xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK0 ); \
+    xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK1 ); \
+    xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK2 ); \
+    xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK3 ); \
+    xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK4 ); \
+    xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK5 ); \
+    xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK6 ); \
+    xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK7 ); \
    SUBMIX( xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7, \
            xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
  }\
@@ -262,22 +259,14 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
    xmm15 = _mm_xor_si128( xmm15, \
              casti_m128i( round_const_q, round_counter ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm8  = _mm_shuffle_epi8( xmm8,  m128_const_64( 0x04070a0d00030609, \
-                                                    0x0c0f0205080b0e01 ) ); \
-    xmm9  = _mm_shuffle_epi8( xmm9,  m128_const_64( 0x06090c0f0205080b, \
-                                                    0x0e0104070a0d0003 ) ); \
-    xmm10 = _mm_shuffle_epi8( xmm10, m128_const_64( 0x080b0e0104070a0d, \
-                                                    0x000306090c0f0205 ) ); \
-    xmm11 = _mm_shuffle_epi8( xmm11, m128_const_64( 0x0e0104070a0d0003, \
-                                                    0x06090c0f0205080b ) ); \
-    xmm12 = _mm_shuffle_epi8( xmm12, m128_const_64( 0x0306090c0f020508, \
-                                                    0x0b0e0104070a0d00 ) ); \
-    xmm13 = _mm_shuffle_epi8( xmm13, m128_const_64( 0x05080b0e0104070a, \
-                                                    0x0d000306090c0f02 ) ); \
-    xmm14 = _mm_shuffle_epi8( xmm14, m128_const_64( 0x070a0d000306090c, \
-                                                    0x0f0205080b0e0104 ) ); \
-    xmm15 = _mm_shuffle_epi8( xmm15, m128_const_64( 0x090c0f0205080b0e, \
-                                                    0x0104070a0d000306 ) ); \
+    xmm8  = _mm_shuffle_epi8( xmm8,  SUBSH_MASK1 ); \
+    xmm9  = _mm_shuffle_epi8( xmm9,  SUBSH_MASK3 ); \
+    xmm10 = _mm_shuffle_epi8( xmm10, SUBSH_MASK5 ); \
+    xmm11 = _mm_shuffle_epi8( xmm11, SUBSH_MASK7 ); \
+    xmm12 = _mm_shuffle_epi8( xmm12, SUBSH_MASK0 ); \
+    xmm13 = _mm_shuffle_epi8( xmm13, SUBSH_MASK2 ); \
+    xmm14 = _mm_shuffle_epi8( xmm14, SUBSH_MASK4 ); \
+    xmm15 = _mm_shuffle_epi8( xmm15, SUBSH_MASK6 ); \
    /* SubBytes + MixBytes */\
    SUBMIX( xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, \
            xmm0, xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6 , xmm7 ); \
@@ -294,22 +283,14 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
    xmm7 = _mm_xor_si128( xmm7, \
             casti_m128i( round_const_q, round_counter+1 ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm0 = _mm_shuffle_epi8( xmm0, m128_const_64( 0x04070a0d00030609, \
-                                                  0x0c0f0205080b0e01 ) ); \
-    xmm1 = _mm_shuffle_epi8( xmm1, m128_const_64( 0x06090c0f0205080b, \
-                                                  0x0e0104070a0d0003 ) ); \
-    xmm2 = _mm_shuffle_epi8( xmm2, m128_const_64( 0x080b0e0104070a0d, \
-                                                  0x000306090c0f0205 ) ); \
-    xmm3 = _mm_shuffle_epi8( xmm3, m128_const_64( 0x0e0104070a0d0003, \
-                                                  0x06090c0f0205080b ) ); \
-    xmm4 = _mm_shuffle_epi8( xmm4, m128_const_64( 0x0306090c0f020508, \
-                                                  0x0b0e0104070a0d00 ) ); \
-    xmm5 = _mm_shuffle_epi8( xmm5, m128_const_64( 0x05080b0e0104070a, \
-                                                  0x0d000306090c0f02 ) ); \
-    xmm6 = _mm_shuffle_epi8( xmm6, m128_const_64( 0x070a0d000306090c, \
-                                                  0x0f0205080b0e0104 ) ); \
-    xmm7 = _mm_shuffle_epi8( xmm7, m128_const_64( 0x090c0f0205080b0e, \
-                                                  0x0104070a0d000306 ) ); \
+    xmm0 = _mm_shuffle_epi8( xmm0, SUBSH_MASK1 ); \
+    xmm1 = _mm_shuffle_epi8( xmm1, SUBSH_MASK3 ); \
+    xmm2 = _mm_shuffle_epi8( xmm2, SUBSH_MASK5 ); \
+    xmm3 = _mm_shuffle_epi8( xmm3, SUBSH_MASK7 ); \
+    xmm4 = _mm_shuffle_epi8( xmm4, SUBSH_MASK0 ); \
+    xmm5 = _mm_shuffle_epi8( xmm5, SUBSH_MASK2 ); \
+    xmm6 = _mm_shuffle_epi8( xmm6, SUBSH_MASK4 ); \
+    xmm7 = _mm_shuffle_epi8( xmm7, SUBSH_MASK6 ); \
    /* SubBytes + MixBytes */\
    SUBMIX( xmm0,  xmm1, xmm2,  xmm3,  xmm4,  xmm5,  xmm6,  xmm7, \
            xmm8,  xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15 ); \
@@ -324,7 +305,7 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
 * clobbers: t0-t7
 */
 #define Matrix_Transpose(i0, i1, i2, i3, i4, i5, i6, i7, t0, t1, t2, t3, t4, t5, t6, t7){\
-  t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 );\
+  t0 = TRANSP_MASK; \
 \
  i6 = _mm_shuffle_epi8(i6, t0);\
  i0 = _mm_shuffle_epi8(i0, t0);\
@@ -412,7 +393,7 @@ static const uint64_t round_const_q[] __attribute__ ((aligned (64))) =
  i4 = _mm_unpacklo_epi64(i4, i5);\
  t1 = _mm_unpackhi_epi64(t1, i5);\
  t2 = i6;\
-  o0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \
+  o0 = TRANSP_MASK; \
  i6 = _mm_unpacklo_epi64(i6, i7);\
  t2 = _mm_unpackhi_epi64(t2, i7);\
  /* load transpose mask into a register, because it will be used 8 times */\
@@ -653,3 +634,4 @@ void OF1024( __m128i* chaining )
  return;
 }

+#endif
--- a/algo/groestl/aes_ni/groestl256-intr-aes.h
+++ b/algo/groestl/aes_ni/groestl256-intr-aes.h
@@ -11,6 +11,45 @@
 #include <wmmintrin.h>
 #include "hash-groestl256.h"

+static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
+{
+   { 0x7060504030201000, 0xffffffffffffffff },
+   { 0x7161514131211101, 0xffffffffffffffff },
+   { 0x7262524232221202, 0xffffffffffffffff },
+   { 0x7363534333231303, 0xffffffffffffffff },
+   { 0x7464544434241404, 0xffffffffffffffff },
+   { 0x7565554535251505, 0xffffffffffffffff },
+   { 0x7666564636261606, 0xffffffffffffffff },
+   { 0x7767574737271707, 0xffffffffffffffff },
+   { 0x7868584838281808, 0xffffffffffffffff },
+   { 0x7969594939291909, 0xffffffffffffffff }
+};
+
+static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
+{
+   { 0x0000000000000000, 0x8f9fafbfcfdfefff },
+   { 0x0000000000000000, 0x8e9eaebecedeeefe },
+   { 0x0000000000000000, 0x8d9dadbdcdddedfd },
+   { 0x0000000000000000, 0x8c9cacbcccdcecfc },
+   { 0x0000000000000000, 0x8b9babbbcbdbebfb },
+   { 0x0000000000000000, 0x8a9aaabacadaeafa },
+   { 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
+   { 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
+   { 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
+   { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
+};
+
+static const __m128i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02 };
+
+static const __m128i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509 };
+static const __m128i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b };
+static const __m128i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d };
+static const __m128i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f };
+static const __m128i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108 };
+static const __m128i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a };
+static const __m128i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c };
+static const __m128i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e };
+
 #define tos(a)    #a
 #define tostr(a)  tos(a)

@@ -26,8 +65,6 @@
  i = _mm_xor_si128(i, j);\
 } 

- /**/
-
 /* Yet another implementation of MixBytes.
   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
   Input: a0, ..., a7
@@ -141,36 +178,6 @@
  b1 = _mm_xor_si128(b1, a4);\
 }/*MixBytes*/

-
-static const uint64_t round_const_l0[] __attribute__ ((aligned (64))) =
-{
-  0x7060504030201000, 0xffffffffffffffff,
-  0x7161514131211101, 0xffffffffffffffff,
-  0x7262524232221202, 0xffffffffffffffff,
-  0x7363534333231303, 0xffffffffffffffff,
-  0x7464544434241404, 0xffffffffffffffff,
-  0x7565554535251505, 0xffffffffffffffff,
-  0x7666564636261606, 0xffffffffffffffff,
-  0x7767574737271707, 0xffffffffffffffff,
-  0x7868584838281808, 0xffffffffffffffff,
-  0x7969594939291909, 0xffffffffffffffff
-};
-
-static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) =
-{
-0x0000000000000000, 0x8f9fafbfcfdfefff,
-0x0000000000000000, 0x8e9eaebecedeeefe,
-0x0000000000000000, 0x8d9dadbdcdddedfd,
-0x0000000000000000, 0x8c9cacbcccdcecfc,
-0x0000000000000000, 0x8b9babbbcbdbebfb,
-0x0000000000000000, 0x8a9aaabacadaeafa,
-0x0000000000000000, 0x8999a9b9c9d9e9f9,
-0x0000000000000000, 0x8898a8b8c8d8e8f8,
-0x0000000000000000, 0x8797a7b7c7d7e7f7,
-0x0000000000000000, 0x8696a6b6c6d6e6f6
-};
-
-
 /* one round
 * i = round number
 * a0-a7 = input rows
@@ -190,29 +197,21 @@ static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) =
  \
  /* ShiftBytes + SubBytes (interleaved) */\
  b0 = _mm_xor_si128(b0,  b0);\
-  a0 = _mm_shuffle_epi8( a0, m128_const_64( 0x03060a0d08020509, \
-                                            0x0c0f0104070b0e00 ) ); \
+  a0 = _mm_shuffle_epi8( a0, SUBSH_MASK0 ); \
  a0 = _mm_aesenclast_si128( a0, b0 );\
-  a1 = _mm_shuffle_epi8( a1, m128_const_64( 0x04070c0f0a03060b, \
-                                            0x0e090205000d0801 ) ); \
+  a1 = _mm_shuffle_epi8( a1, SUBSH_MASK1 ); \
  a1 = _mm_aesenclast_si128( a1, b0 );\
-  a2 = _mm_shuffle_epi8( a2, m128_const_64( 0x05000e090c04070d, \
-                                            0x080b0306010f0a02 ) ); \
+  a2 = _mm_shuffle_epi8( a2, SUBSH_MASK2 ); \
  a2 = _mm_aesenclast_si128( a2, b0 );\
-  a3 = _mm_shuffle_epi8( a3, m128_const_64( 0x0601080b0e05000f, \
-                                            0x0a0d040702090c03 ) ); \
+  a3 = _mm_shuffle_epi8( a3, SUBSH_MASK3 ); \
  a3 = _mm_aesenclast_si128( a3, b0 );\
-  a4 = _mm_shuffle_epi8( a4, m128_const_64( 0x0702090c0f060108, \
-                                            0x0b0e0500030a0d04 ) ); \
+  a4 = _mm_shuffle_epi8( a4, SUBSH_MASK4 ); \
  a4 = _mm_aesenclast_si128( a4, b0 );\
-  a5 = _mm_shuffle_epi8( a5, m128_const_64( 0x00030b0e0907020a, \
-                                            0x0d080601040c0f05 ) ); \
+  a5 = _mm_shuffle_epi8( a5, SUBSH_MASK5 ); \
  a5 = _mm_aesenclast_si128( a5, b0 );\
-  a6 = _mm_shuffle_epi8( a6, m128_const_64( 0x01040d080b00030c, \
-                                            0x0f0a0702050e0906 ) ); \
+  a6 = _mm_shuffle_epi8( a6, SUBSH_MASK6 ); \
  a6 = _mm_aesenclast_si128( a6, b0 );\
-  a7 = _mm_shuffle_epi8( a7, m128_const_64( 0x02050f0a0d01040e, \
-                                            0x090c000306080b07 ) ); \
+  a7 = _mm_shuffle_epi8( a7, SUBSH_MASK7 ); \
  a7 = _mm_aesenclast_si128( a7, b0 );\
  \
  /* MixBytes */\
@@ -241,8 +240,9 @@ static const uint64_t round_const_l7[] __attribute__ ((aligned (64))) =
 * outputs: i0, o1-o3
 * clobbers: t0
 */
+
 #define Matrix_Transpose_A(i0, i1, i2, i3, o1, o2, o3, t0){\
-  t0 = m128_const_64( 0x0f070b030e060a02, 0x0d0509010c040800 ); \
+  t0 = TRANSP_MASK; \
  \
  i0 = _mm_shuffle_epi8(i0, t0);\
  i1 = _mm_shuffle_epi8(i1, t0);\
--- a/algo/groestl/aes_ni/hash-groestl256.c
+++ b/algo/groestl/aes_ni/hash-groestl256.c
@@ -214,6 +214,98 @@ HashReturn_gr update_and_final_groestl256( hashState_groestl256* ctx,
   return SUCCESS_GR;
 }

+// One-shot Groestl-256: resets ctx, hashes databitlen bits of input and
+// writes the 32 byte digest to output. Always returns SUCCESS_GR.
+int groestl256_full( hashState_groestl256* ctx,
+                   void* output, const void* input, DataLength_gr databitlen )
+{
+   int i;
+   // --- init ---
+   ctx->hashlen = 32;
+  for ( i = 0; i < SIZE256; i++ )
+  {
+     ctx->chaining[i] = _mm_setzero_si128();
+     ctx->buffer[i]   = _mm_setzero_si128();
+  }
+  ((u64*)ctx->chaining)[COLS-1] = U64BIG((u64)LENGTH);
+  INIT256( ctx->chaining );
+  ctx->buf_ptr = 0;
+  ctx->rem_ptr = 0;
+
+   const int len = (int)databitlen / 128;         // whole __m128i in input
+   const int hashlen_m128i = ctx->hashlen / 16;   // bytes to __m128i
+   const int hash_offset = SIZE256 - hashlen_m128i;
+   int rem = ctx->rem_ptr;                        // 0 after the reset above
+   int blocks = len / SIZE256;
+   __m128i* in = (__m128i*)input;
+
+   // --- update ---
+   // digest any full blocks, process directly from input
+   for ( i = 0; i < blocks; i++ )
+      TF512( ctx->chaining, &in[ i * SIZE256 ] );
+   ctx->buf_ptr = blocks * SIZE256;
+
+   // cryptonight has 200 byte input, the remainder is only 8 bytes, ie u64.
+   if ( databitlen % 128 !=0 )
+   {
+      // must be cryptonight, copy 64 bits of data
+      *(uint64_t*)(ctx->buffer) = *(uint64_t*)(&in[ ctx->buf_ptr ] );
+      i = -1; // signal for odd length
+   }
+   else
+   {
+      // Copy any remaining data to buffer for final transform
+      for ( i = 0; i < len % SIZE256; i++ )
+          ctx->buffer[ rem + i ] = in[ ctx->buf_ptr + i ];
+      i += rem;   // use i as rem_ptr in final
+   }
+
+   //--- final ---
+   blocks++;      // adjust for final block
+
+   // i indexes the first unused buffer vector (-1 flags the odd length case)
+   if ( i == SIZE256 - 1 )
+   {
+       // only 1 vector left in buffer, all padding at once
+       ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
+                                           0,        0,0,0, 0,0,0,0x80 );
+   }
+   else
+   {
+      if ( i == -1 )
+      {
+         // cryptonight odd length
+         ((uint64_t*)ctx->buffer)[ 1 ] = 0x80ull;
+         // finish the block with zero and length padding as normal
+         i = 0;
+       }
+       else
+       {
+          // add first padding
+          ctx->buffer[i] = _mm_set_epi8( 0,0,0,0, 0,0,0,0,
+                                         0,0,0,0, 0,0,0,0x80 );
+       }
+       // add zero padding
+       for ( i += 1; i < SIZE256 - 1; i++ )
+           ctx->buffer[i] = _mm_setzero_si128();
+       // add length padding
+       // cheat since we know the block count is trivial, good if block < 256
+       ctx->buffer[i] = _mm_set_epi8( blocks,blocks>>8,0,0, 0,0,0,0,
+                                           0,        0,0,0, 0,0,0,0 );
+   }
+
+   // digest final padding block and do output transform
+   TF512( ctx->chaining, ctx->buffer );
+   OF512( ctx->chaining );
+
+   // store hash result in output
+   for ( i = 0; i < hashlen_m128i; i++ )
+      casti_m128i( output, i ) = ctx->chaining[ hash_offset + i ];
+
+   return SUCCESS_GR;
+}
+
+
 /* hash bit sequence */
 HashReturn_gr hash_groestl256(int hashbitlen,
                const BitSequence_gr* data,
--- a/algo/groestl/aes_ni/hash-groestl256.h
+++ b/algo/groestl/aes_ni/hash-groestl256.h
@@ -115,4 +115,7 @@ HashReturn_gr hash_groestli256( int, const BitSequence_gr*, DataLength_gr,
 HashReturn_gr update_and_final_groestl256( hashState_groestl256*, void*,
                                           const void*, DataLength_gr );

+int groestl256_full( hashState_groestl256* ctx,
+                   void* output, const void* input, DataLength_gr databitlen );
+
 #endif /* __hash_h */
--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -1,4 +1,7 @@
 #include "groestl-gate.h"
+
+#if !defined(GROESTL_8WAY) && !defined(GROESTLX16R_4WAY)
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
@@ -88,4 +91,4 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
-
+#endif
--- a/algo/groestl/groestl256-hash-4way.c
+++ b/algo/groestl/groestl256-hash-4way.c
@@ -23,7 +23,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
  int i;

  ctx->hashlen = hashlen;
-  SET_CONSTANTS();

  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return 1;
@@ -36,9 +35,6 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )

  // The only non-zero in the IV is len. It can be hard coded.
  ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
-//  uint64_t len = U64BIG((uint64_t)LENGTH);
-//  ctx->chaining[ COLS/2 -1 ] = _mm512_set4_epi64( len, 0, len, 0 );
-//  INIT256_4way(ctx->chaining);

  ctx->buf_ptr = 0;
  ctx->rem_ptr = 0;
@@ -46,6 +42,77 @@ int groestl256_4way_init( groestl256_4way_context* ctx, uint64_t hashlen )
  return 0;
 }

+// One-shot Groestl-256 on __m512i vectors: resets ctx, hashes input (one
+// __m512i is read per 128 bits of databitlen) and stores 2 __m512i in output.
+// Returns 1 if ctx->chaining or ctx->buffer is NULL, otherwise 0.
+int groestl256_4way_full( groestl256_4way_context* ctx, void* output,
+                                const void* input, uint64_t databitlen )
+{
+   const int len = (int)databitlen / 128;
+   const int hashlen_m128i = 32 / 16;   // bytes to __m128i
+   const int hash_offset = SIZE256 - hashlen_m128i;
+   int blocks = len / SIZE256;
+   __m512i* in = (__m512i*)input;
+   int i;
+
+  if (ctx->chaining == NULL || ctx->buffer == NULL)
+    return 1;
+
+  // --- init ---
+  for ( i = 0; i < SIZE256; i++ )
+  {
+     ctx->chaining[i] = m512_zero;
+     ctx->buffer[i]   = m512_zero;
+  }
+  // The only non-zero in the IV is len. It can be hard coded.
+  ctx->chaining[ 3 ] = m512_const2_64( 0, 0x0100000000000000 );
+  ctx->buf_ptr = 0;
+  ctx->rem_ptr = 0;
+
+   // --- update ---
+   // digest any full blocks, process directly from input
+   for ( i = 0; i < blocks; i++ )
+      TF512_4way( ctx->chaining, &in[ i * SIZE256 ] );
+   ctx->buf_ptr = blocks * SIZE256;
+
+   // copy any remaining data to the start of the buffer. The context was just
+   // reset, so nothing is buffered: do not offset by a stale ctx->rem_ptr.
+   for ( i = 0; i < len % SIZE256; i++ )
+       ctx->buffer[ i ] = in[ ctx->buf_ptr + i ];
+   // i is now the first unused buffer vector
+
+   //--- final ---
+   blocks++;      // adjust for final block
+
+   if ( i == SIZE256 - 1 )
+   {
+       // only 1 vector left in buffer, all padding at once
+      ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0x80 );
+   }
+   else
+   {
+       // add first padding
+       ctx->buffer[i] = m512_const4_64( 0, 0x80, 0, 0x80 );
+       // add zero padding
+       for ( i += 1; i < SIZE256 - 1; i++ )
+           ctx->buffer[i] = m512_zero;
+
+       // add length padding, second last byte is zero unless blocks > 255
+      ctx->buffer[i] = m512_const2_64( (uint64_t)blocks << 56, 0 );
+   }
+
+   // digest final padding block and do output transform
+   TF512_4way( ctx->chaining, ctx->buffer );
+
+   OF512_4way( ctx->chaining );
+
+   // store hash result in output
+   for ( i = 0; i < hashlen_m128i; i++ )
+      casti_m512i( output, i ) = ctx->chaining[ hash_offset + i ];
+
+   return 0;
+}
+
 int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
                                const void* input, uint64_t databitlen )
 {
@@ -75,11 +142,11 @@ int groestl256_4way_update_close( groestl256_4way_context* ctx, void* output,
   blocks++;      // adjust for final block

   if ( i == SIZE256 - 1 )
-   {        
+   {
       // only 1 vector left in buffer, all padding at once
       ctx->buffer[i] = m512_const1_128( _mm_set_epi8(
                      blocks, blocks>>8,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
-   }   
+   }
   else
   {
       // add first padding
--- a/algo/groestl/groestl256-hash-4way.h
+++ b/algo/groestl/groestl256-hash-4way.h
@@ -71,5 +71,8 @@ int groestl256_4way_init( groestl256_4way_context*, uint64_t );
 int groestl256_4way_update_close( groestl256_4way_context*,  void*,
                                        const void*, uint64_t );

+int groestl256_4way_full( groestl256_4way_context*, void*,
+                          const void*, uint64_t );
+
 #endif
 #endif 
--- a/algo/groestl/groestl256-intr-4way.h
+++ b/algo/groestl/groestl256-intr-4way.h
@@ -14,17 +14,78 @@
 #include "groestl256-hash-4way.h"

 #if defined(__VAES__)
+static const __m128i round_const_l0[] __attribute__ ((aligned (64))) =
+{  // one entry per round i: { 0x7060504030201000 ^ i in each byte, all ones }
+   { 0x7060504030201000, 0xffffffffffffffff },
+   { 0x7161514131211101, 0xffffffffffffffff },
+   { 0x7262524232221202, 0xffffffffffffffff },
+   { 0x7363534333231303, 0xffffffffffffffff },
+   { 0x7464544434241404, 0xffffffffffffffff },
+   { 0x7565554535251505, 0xffffffffffffffff },
+   { 0x7666564636261606, 0xffffffffffffffff },
+   { 0x7767574737271707, 0xffffffffffffffff },
+   { 0x7868584838281808, 0xffffffffffffffff },
+   { 0x7969594939291909, 0xffffffffffffffff }
+};

-/* global constants  */
-__m512i ROUND_CONST_Lx;
-__m512i ROUND_CONST_L0[ROUNDS512];
-__m512i ROUND_CONST_L7[ROUNDS512];
-//__m512i ROUND_CONST_P[ROUNDS1024];
-//__m512i ROUND_CONST_Q[ROUNDS1024];
-__m512i TRANSP_MASK;
-__m512i SUBSH_MASK[8];
-__m512i ALL_1B;
-__m512i ALL_FF;
+static const __m128i round_const_l7[] __attribute__ ((aligned (64))) =
+{  // one entry per round i: { 0, 0x8f9fafbfcfdfefff ^ i in each byte }
+   { 0x0000000000000000, 0x8f9fafbfcfdfefff },
+   { 0x0000000000000000, 0x8e9eaebecedeeefe },
+   { 0x0000000000000000, 0x8d9dadbdcdddedfd },
+   { 0x0000000000000000, 0x8c9cacbcccdcecfc },
+   { 0x0000000000000000, 0x8b9babbbcbdbebfb },
+   { 0x0000000000000000, 0x8a9aaabacadaeafa },
+   { 0x0000000000000000, 0x8999a9b9c9d9e9f9 },
+   { 0x0000000000000000, 0x8898a8b8c8d8e8f8 },
+   { 0x0000000000000000, 0x8797a7b7c7d7e7f7 },
+   { 0x0000000000000000, 0x8696a6b6c6d6e6f6 }
+};
+
+static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
+                                     0x1d1519111c141810, 0x1f171b131e161a12,
+                                     0x2d2529212c242820, 0x2f272b232e262a22,
+                                     0x3d3539313c343830, 0x3f373b333e363a32 }; // 128-bit lane k = lane 0 + 0x10*k per byte
+
+static const __m512i SUBSH_MASK0 = { 0x0c0f0104070b0e00, 0x03060a0d08020509,
+                                     0x1c1f1114171b1e10, 0x13161a1d18121519,
+                                     0x2c2f2124272b2e20, 0x23262a2d28222529,
+                                     0x3c3f3134373b3e30, 0x33363a3d38323539 }; // ROUND: shuffle for a0
+
+static const __m512i SUBSH_MASK1 = { 0x0e090205000d0801, 0x04070c0f0a03060b,
+                                     0x1e191215101d1811, 0x14171c1f1a13161b,
+                                     0x2e292225202d2821, 0x24272c2f2a23262b,
+                                     0x3e393235303d3831, 0x34373c3f3a33363b }; // ROUND: shuffle for a1
+
+static const __m512i SUBSH_MASK2 = { 0x080b0306010f0a02, 0x05000e090c04070d,
+                                     0x181b1316111f1a12, 0x15101e191c14171d,
+                                     0x282b2326212f2a22, 0x25202e292c24272d,
+                                     0x383b3336313f3a32, 0x35303e393c34373d }; // ROUND: shuffle for a2
+
+static const __m512i SUBSH_MASK3 = { 0x0a0d040702090c03, 0x0601080b0e05000f,
+                                     0x1a1d141712191c13, 0x1611181b1e15101f,
+                                     0x2a2d242722292c23, 0x2621282b2e25202f,
+                                     0x3a3d343732393c33, 0x3631383b3e35303f }; // ROUND: shuffle for a3
+
+static const __m512i SUBSH_MASK4 = { 0x0b0e0500030a0d04, 0x0702090c0f060108,
+                                     0x1b1e1510131a1d14, 0x1712191c1f161118,
+                                     0x2b2e2520232a2d24, 0x2722292c2f262128,
+                                     0x3b3e3530333a3d34, 0x3732393c3f363138 }; // ROUND: shuffle for a4
+
+static const __m512i SUBSH_MASK5 = { 0x0d080601040c0f05, 0x00030b0e0907020a,
+                                     0x1d181611141c1f15, 0x10131b1e1917121a,
+                                     0x2d282621242c2f25, 0x20232b2e2927222a,
+                                     0x3d383631343c3f35, 0x30333b3e3937323a }; // ROUND: shuffle for a5
+
+static const __m512i SUBSH_MASK6 = { 0x0f0a0702050e0906, 0x01040d080b00030c,
+                                     0x1f1a1712151e1916, 0x11141d181b10131c,
+                                     0x2f2a2722252e2926, 0x21242d282b20232c,
+                                     0x3f3a3732353e3936, 0x31343d383b30333c }; // ROUND: shuffle for a6
+
+static const __m512i SUBSH_MASK7 = { 0x090c000306080b07, 0x02050f0a0d01040e,
+                                     0x191c101316181b17, 0x12151f1a1d11141e,
+                                     0x292c202326282b27, 0x22252f2a2d21242e,
+                                     0x393c303336383b37, 0x32353f3a3d31343e }; // ROUND: shuffle for a7

 #define tos(a)    #a
 #define tostr(a)  tos(a)
@@ -40,8 +101,6 @@ __m512i ALL_FF;
  i = _mm512_xor_si512(i, j);\
 } 

- /**/
-
 /* Yet another implementation of MixBytes.
   This time we use the formulae (3) from the paper "Byte Slicing Groestl".
   Input: a0, ..., a7
@@ -155,95 +214,36 @@ __m512i ALL_FF;
  b1 = _mm512_xor_si512(b1, a4);\
 }/*MixBytes*/

-// calculate the round constants seperately and load at startup
-
-#define SET_CONSTANTS(){\
-  ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
-  TRANSP_MASK   = _mm512_set_epi32( \
-                         0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
-                         0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
-                         0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
-                         0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
-  SUBSH_MASK[0] = _mm512_set_epi32( \
-                         0x33363a3d, 0x38323539, 0x3c3f3134, 0x373b3e30, \
-                         0x23262a2d, 0x28222529, 0x2c2f2124, 0x272b2e20, \
-                         0x13161a1d, 0x18121519, 0x1c1f1114, 0x171b1e10, \
-                         0x03060a0d, 0x08020509, 0x0c0f0104, 0x070b0e00 ); \
-  SUBSH_MASK[1] = _mm512_set_epi32( \
-                         0x34373c3f, 0x3a33363b, 0x3e393235, 0x303d3831, \
-                         0x24272c2f, 0x2a23262b, 0x2e292225, 0x202d2821, \
-                         0x14171c1f, 0x1a13161b, 0x1e191215, 0x101d1801, \
-                         0x04070c0f, 0x0a03060b, 0x0e090205, 0x000d0801 );\
-  SUBSH_MASK[2] = _mm512_set_epi32( \
-                         0x35303e39, 0x3c34373d, 0x383b3336, 0x313f3a32, \
-                         0x25202e29, 0x2c24272d, 0x282b2326, 0x212f2a22, \
-                         0x15101e19, 0x1c14171d, 0x181b1316, 0x111f1a12, \
-                         0x05000e09, 0x0c04070d, 0x080b0306, 0x010f0a02 );\
-  SUBSH_MASK[3] = _mm512_set_epi32( \
-                         0x3631383b, 0x3e35303f, 0x3a3d3437, 0x32393c33, \
-                         0x2621282b, 0x2e25202f, 0x2a2d2427, 0x22292c23, \
-                         0x1611181b, 0x1e15101f, 0x1a1d1417, 0x12191c13, \
-                         0x0601080b, 0x0e05000f, 0x0a0d0407, 0x02090c03 );\
-  SUBSH_MASK[4] = _mm512_set_epi32( \
-                         0x3732393c, 0x3f363138, 0x3b3e3530, 0x333a3d34, \
-                         0x2722292c, 0x2f262128, 0x2b2e2520, 0x232a2d24, \
-                         0x1712191c, 0x1f161118, 0x1b1e1510, 0x131a1d14, \
-                         0x0702090c, 0x0f060108, 0x0b0e0500, 0x030a0d04 );\
-  SUBSH_MASK[5] = _mm512_set_epi32( \
-                         0x30333b3e, 0x3937323a, 0x3d383631, 0x343c3f35, \
-                         0x20232b2e, 0x2927222a, 0x2d282621, 0x242c2f25, \
-                         0x10131b1e, 0x1917121a, 0x1d181611, 0x141c1f15, \
-                         0x00030b0e, 0x0907020a, 0x0d080601, 0x040c0f05 );\
-  SUBSH_MASK[6] = _mm512_set_epi32( \
-                         0x31343d38, 0x3b30333c, 0x3f3a3732, 0x353e3936, \
-                         0x21242d28, 0x2b20232c, 0x2f2a2722, 0x252e2926, \
-                         0x11141d18, 0x1b10131c, 0x1f1a1712, 0x151e1916, \
-                         0x01040d08, 0x0b00030c, 0x0f0a0702, 0x050e0906 );\
-  SUBSH_MASK[7] = _mm512_set_epi32( \
-                         0x32353f3a, 0x3d31343e, 0x393c3033, 0x36383b37, \
-                         0x22252f2a, 0x2d21242e, 0x292c2023, 0x26282b27, \
-                         0x12151f1a, 0x1d11141e, 0x191c1013, 0x16181b17, \
-                         0x02050f0a, 0x0d01040e, 0x090c0003, 0x06080b07 );\
-  for ( i = 0; i < ROUNDS512; i++ ) \
-  {\
-    ROUND_CONST_L0[i] = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \
-          0x70605040 ^ ( i * 0x01010101 ), 0x30201000 ^ ( i * 0x01010101 ) ); \
-    ROUND_CONST_L7[i] = _mm512_set4_epi32( 0x8f9fafbf ^ ( i * 0x01010101 ), \
-          0xcfdfefff ^ ( i * 0x01010101 ), 0x00000000, 0x00000000 ); \
-  }\
-  ROUND_CONST_Lx = _mm512_set4_epi32( 0xffffffff, 0xffffffff, \
-                                      0x00000000, 0x00000000 ); \
-}while(0);\

 #define ROUND(i, a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7){\
  /* AddRoundConstant */\
-  b1 = ROUND_CONST_Lx;\
-  a0 = _mm512_xor_si512( a0, (ROUND_CONST_L0[i]) );\
+  b1 = m512_const2_64( 0xffffffffffffffff, 0 ); \
+  a0 = _mm512_xor_si512( a0, m512_const1_128( round_const_l0[i] ) );\
  a1 = _mm512_xor_si512( a1, b1 );\
  a2 = _mm512_xor_si512( a2, b1 );\
  a3 = _mm512_xor_si512( a3, b1 );\
  a4 = _mm512_xor_si512( a4, b1 );\
  a5 = _mm512_xor_si512( a5, b1 );\
  a6 = _mm512_xor_si512( a6, b1 );\
-  a7 = _mm512_xor_si512( a7, (ROUND_CONST_L7[i]) );\
+  a7 = _mm512_xor_si512( a7, m512_const1_128( round_const_l7[i] ) );\
  \
  /* ShiftBytes + SubBytes (interleaved) */\
  b0 = _mm512_xor_si512( b0, b0 );\
-  a0 = _mm512_shuffle_epi8( a0, (SUBSH_MASK[0]) );\
+  a0 = _mm512_shuffle_epi8( a0, SUBSH_MASK0 );\
  a0 = _mm512_aesenclast_epi128(a0, b0 );\
-  a1 = _mm512_shuffle_epi8( a1, (SUBSH_MASK[1]) );\
+  a1 = _mm512_shuffle_epi8( a1, SUBSH_MASK1 );\
  a1 = _mm512_aesenclast_epi128(a1, b0 );\
-  a2 = _mm512_shuffle_epi8( a2, (SUBSH_MASK[2]) );\
+  a2 = _mm512_shuffle_epi8( a2, SUBSH_MASK2 );\
  a2 = _mm512_aesenclast_epi128(a2, b0 );\
-  a3 = _mm512_shuffle_epi8( a3, (SUBSH_MASK[3]) );\
+  a3 = _mm512_shuffle_epi8( a3, SUBSH_MASK3 );\
  a3 = _mm512_aesenclast_epi128(a3, b0 );\
-  a4 = _mm512_shuffle_epi8( a4, (SUBSH_MASK[4]) );\
+  a4 = _mm512_shuffle_epi8( a4, SUBSH_MASK4 );\
  a4 = _mm512_aesenclast_epi128(a4, b0 );\
-  a5 = _mm512_shuffle_epi8( a5, (SUBSH_MASK[5]) );\
+  a5 = _mm512_shuffle_epi8( a5, SUBSH_MASK5 );\
  a5 = _mm512_aesenclast_epi128(a5, b0 );\
-  a6 = _mm512_shuffle_epi8( a6, (SUBSH_MASK[6]) );\
+  a6 = _mm512_shuffle_epi8( a6, SUBSH_MASK6 );\
  a6 = _mm512_aesenclast_epi128(a6, b0 );\
-  a7 = _mm512_shuffle_epi8( a7, (SUBSH_MASK[7]) );\
+  a7 = _mm512_shuffle_epi8( a7, SUBSH_MASK7 );\
  a7 = _mm512_aesenclast_epi128( a7, b0 );\
  \
  /* MixBytes */\
@@ -390,29 +390,6 @@ __m512i ALL_FF;
 }/**/


-
-void INIT256_4way( __m512i* chaining )
-{
-  static __m512i xmm0, xmm2, xmm6, xmm7;
-  static __m512i xmm12, xmm13, xmm14, xmm15;
-
-  /* load IV into registers xmm12 - xmm15 */
-  xmm12 = chaining[0];
-  xmm13 = chaining[1];
-  xmm14 = chaining[2];
-  xmm15 = chaining[3];
-
-  /* transform chaining value from column ordering into row ordering */
-  /* we put two rows (64 bit) of the IV into one 128-bit XMM register */
-  Matrix_Transpose_A(xmm12, xmm13, xmm14, xmm15, xmm2, xmm6, xmm7, xmm0);
-
-  /* store transposed IV */
-  chaining[0] = xmm12;
-  chaining[1] = xmm2;
-  chaining[2] = xmm6;
-  chaining[3] = xmm7;
-}
-
 void TF512_4way( __m512i* chaining, __m512i* message )
 {
  static __m512i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
--- a/algo/groestl/groestl512-hash-4way.c
+++ b/algo/groestl/groestl512-hash-4way.c
@@ -19,10 +19,6 @@

 int groestl512_4way_init( groestl512_4way_context* ctx, uint64_t hashlen )
 {
-  int i;
-
-  SET_CONSTANTS();
-
  if (ctx->chaining == NULL || ctx->buffer == NULL)
    return 1;

@@ -99,7 +95,6 @@ int groestl512_4way_full( groestl512_4way_context* ctx, void* output,

   // --- init ---

-   SET_CONSTANTS();
   memset_zero_512( ctx->chaining, SIZE512 );
   memset_zero_512( ctx->buffer, SIZE512 );
   ctx->chaining[ 6 ] = m512_const2_64( 0x0200000000000000, 0 );
--- a/algo/groestl/groestl512-intr-4way.h
+++ b/algo/groestl/groestl512-intr-4way.h
@@ -15,16 +15,86 @@

 #if defined(__VAES__)

-/* global constants  */
-__m512i ROUND_CONST_Lx;
-//__m128i ROUND_CONST_L0[ROUNDS512];
-//__m128i ROUND_CONST_L7[ROUNDS512];
-__m512i ROUND_CONST_P[ROUNDS1024];
-__m512i ROUND_CONST_Q[ROUNDS1024];
-__m512i TRANSP_MASK;
-__m512i SUBSH_MASK[8];
-__m512i ALL_1B;
-__m512i ALL_FF;
+static const __m128i round_const_p[] __attribute__ ((aligned (64))) =
+{  // entry i: { 0x7060504030201000, 0xf0e0d0c0b0a09080 }, i XORed into each byte
+   { 0x7060504030201000, 0xf0e0d0c0b0a09080 },
+   { 0x7161514131211101, 0xf1e1d1c1b1a19181 }, 
+   { 0x7262524232221202, 0xf2e2d2c2b2a29282 },
+   { 0x7363534333231303, 0xf3e3d3c3b3a39383 },
+   { 0x7464544434241404, 0xf4e4d4c4b4a49484 },
+   { 0x7565554535251505, 0xf5e5d5c5b5a59585 },
+   { 0x7666564636261606, 0xf6e6d6c6b6a69686 },
+   { 0x7767574737271707, 0xf7e7d7c7b7a79787 },
+   { 0x7868584838281808, 0xf8e8d8c8b8a89888 },
+   { 0x7969594939291909, 0xf9e9d9c9b9a99989 },
+   { 0x7a6a5a4a3a2a1a0a, 0xfaeadacabaaa9a8a },
+   { 0x7b6b5b4b3b2b1b0b, 0xfbebdbcbbbab9b8b },
+   { 0x7c6c5c4c3c2c1c0c, 0xfcecdcccbcac9c8c },
+   { 0x7d6d5d4d3d2d1d0d, 0xfdedddcdbdad9d8d }
+};
+
+static const __m128i round_const_q[] __attribute__ ((aligned (64))) =
+{  // entry i: { 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f }, i XORed into each byte
+   { 0x8f9fafbfcfdfefff, 0x0f1f2f3f4f5f6f7f },
+   { 0x8e9eaebecedeeefe, 0x0e1e2e3e4e5e6e7e },
+   { 0x8d9dadbdcdddedfd, 0x0d1d2d3d4d5d6d7d },
+   { 0x8c9cacbcccdcecfc, 0x0c1c2c3c4c5c6c7c },
+   { 0x8b9babbbcbdbebfb, 0x0b1b2b3b4b5b6b7b },
+   { 0x8a9aaabacadaeafa, 0x0a1a2a3a4a5a6a7a },
+   { 0x8999a9b9c9d9e9f9, 0x0919293949596979 },
+   { 0x8898a8b8c8d8e8f8, 0x0818283848586878 },
+   { 0x8797a7b7c7d7e7f7, 0x0717273747576777 },
+   { 0x8696a6b6c6d6e6f6, 0x0616263646566676 },
+   { 0x8595a5b5c5d5e5f5, 0x0515253545556575 },
+   { 0x8494a4b4c4d4e4f4, 0x0414243444546474 },
+   { 0x8393a3b3c3d3e3f3, 0x0313233343536373 },
+   { 0x8292a2b2c2d2e2f2, 0x0212223242526272 }
+};
+
+static const __m512i TRANSP_MASK = { 0x0d0509010c040800, 0x0f070b030e060a02,
+                                     0x1d1519111c141810, 0x1f171b131e161a12,
+                                     0x2d2529212c242820, 0x2f272b232e262a22,
+                                     0x3d3539313c343830, 0x3f373b333e363a32 }; // 128-bit lane k = lane 0 + 0x10*k per byte
+
+static const __m512i SUBSH_MASK0 = { 0x0b0e0104070a0d00, 0x0306090c0f020508,
+                                     0x1b1e1114171a1d10, 0x1316191c1f121518,
+                                     0x2b2e2124272a2d20, 0x2326292c2f222528,
+                                     0x3b3e3134373a3d30, 0x3336393c3f323538 }; // ShiftBytes P1024: 1st reg, Q1024: 5th
+
+static const __m512i SUBSH_MASK1 = { 0x0c0f0205080b0e01, 0x04070a0d00030609,
+                                     0x1c1f1215181b1e11, 0x14171a1d10131619,
+                                     0x2c2f2225282b2e21, 0x24272a2d20232629,
+                                     0x3c3f3235383b3e31, 0x34373a3d30333639 }; // ShiftBytes P1024: 2nd reg, Q1024: 1st
+
+static const __m512i SUBSH_MASK2 = { 0x0d000306090c0f02, 0x05080b0e0104070a,
+                                     0x1d101316191c1f12, 0x15181b1e1114171a,
+                                     0x2d202326292c2f22, 0x25282b2e2124272a,
+                                     0x3d303336393c3f32, 0x35383b3e3134373a }; // ShiftBytes P1024: 3rd reg, Q1024: 6th
+
+static const __m512i SUBSH_MASK3 = { 0x0e0104070a0d0003, 0x06090c0f0205080b,
+                                     0x1e1114171a1d1013, 0x16191c1f1215181b,
+                                     0x2e2124272a2d2023, 0x26292c2f2225282b,
+                                     0x3e3134373a3d3033, 0x36393c3f3235383b }; // ShiftBytes P1024: 4th reg, Q1024: 2nd
+
+static const __m512i SUBSH_MASK4 = { 0x0f0205080b0e0104, 0x070a0d000306090c,
+                                     0x1f1215181b1e1114, 0x171a1d101316191c,
+                                     0x2f2225282b2e2124, 0x272a2d202326292c,
+                                     0x3f3235383b3e3134, 0x373a3d303336393c }; // ShiftBytes P1024: 5th reg, Q1024: 7th
+
+static const __m512i SUBSH_MASK5 = { 0x000306090c0f0205, 0x080b0e0104070a0d,
+                                     0x101316191c1f1215, 0x181b1e1114171a1d,
+                                     0x202326292c2f2225, 0x282b2e2124272a2d,
+                                     0x303336393c3f3235, 0x383b3e3134373a3d }; // ShiftBytes P1024: 6th reg, Q1024: 3rd
+
+static const __m512i SUBSH_MASK6 = { 0x0104070a0d000306, 0x090c0f0205080b0e,
+                                     0x1114171a1d101316, 0x191c1f1215181b1e,
+                                     0x2124272a2d202326, 0x292c2f2225282b2e,
+                                     0x3134373a3d303336, 0x393c3f3235383b3e }; // ShiftBytes P1024: 7th reg, Q1024: 8th
+
+static const __m512i SUBSH_MASK7 = { 0x06090c0f0205080b, 0x0e0104070a0d0003,
+                                     0x16191c1f1215181b, 0x1e1114171a1d1013,
+                                     0x26292c2f2225282b, 0x2e2124272a2d2023,
+                                     0x36393c3f3235383b, 0x3e3134373a3d3033 }; // ShiftBytes P1024: 8th reg, Q1024: 4th

 #define tos(a)    #a
 #define tostr(a)  tos(a)
@@ -155,69 +225,6 @@ __m512i ALL_FF;
  b1 = _mm512_xor_si512(b1, a4);\
 }/*MixBytes*/

-// calculate the round constants seperately and load at startup
-
-#define SET_CONSTANTS(){\
-  ALL_FF = _mm512_set1_epi32( 0xffffffff );\
-  ALL_1B = _mm512_set1_epi32( 0x1b1b1b1b );\
-  TRANSP_MASK   = _mm512_set_epi32( \
-                         0x3f373b33, 0x3e363a32, 0x3d353931, 0x3c343830, \
-                         0x2f272b23, 0x2e262a22, 0x2d252921, 0x2c242820, \
-                         0x1f171b13, 0x1e161a12, 0x1d151911, 0x1c141810, \
-                         0x0f070b03, 0x0e060a02, 0x0d050901, 0x0c040800 ); \
-  SUBSH_MASK[0] = _mm512_set_epi32( \
-                         0x3336393c, 0x3f323538, 0x3b3e3134, 0x373a3d30, \
-                         0x2326292c, 0x2f222528, 0x2b2e2124, 0x272a2d20, \
-                         0x1316191c, 0x1f121518, 0x1b1e1114, 0x171a1d10, \
-                         0x0306090c, 0x0f020508, 0x0b0e0104, 0x070a0d00 ); \
-  SUBSH_MASK[1] = _mm512_set_epi32( \
-                         0x34373a3d, 0x30333639, 0x3c3f3235, 0x383b3e31, \
-                         0x24272a2d, 0x20232629, 0x2c2f2225, 0x282b2e21, \
-                         0x14171a1d, 0x10131619, 0x1c1f1215, 0x181b1e11, \
-                         0x04070a0d, 0x00030609, 0x0c0f0205, 0x080b0e01 ); \
-  SUBSH_MASK[2] = _mm512_set_epi32( \
-                         0x35383b3e, 0x3134373a, 0x3d303336, 0x393c3f32, \
-                         0x25282b2e, 0x2124272a, 0x2d202326, 0x292c2f22, \
-                         0x15181b1e, 0x1114171a, 0x1d101316, 0x191c1f12, \
-                         0x05080b0e, 0x0104070a, 0x0d000306, 0x090c0f02 ); \
-  SUBSH_MASK[3] = _mm512_set_epi32( \
-                         0x36393c3f, 0x3235383b, 0x3e313437, 0x3a3d3033, \
-                         0x26292c2f, 0x2225282b, 0x2e212427, 0x2a2d2023, \
-                         0x16191c1f, 0x1215181b, 0x1e111417, 0x1a1d1013, \
-                         0x06090c0f, 0x0205080b, 0x0e010407, 0x0a0d0003 ); \
-  SUBSH_MASK[4] = _mm512_set_epi32( \
-                         0x373a3d30, 0x3336393c, 0x3f323538, 0x3b3e3134, \
-                         0x272a2d20, 0x2326292c, 0x2f222528, 0x2b2e2124, \
-                         0x171a1d10, 0x1316191c, 0x1f121518, 0x1b1e1114, \
-                         0x070a0d00, 0x0306090c, 0x0f020508, 0x0b0e0104 ); \
-  SUBSH_MASK[5] = _mm512_set_epi32( \
-                         0x383b3e31, 0x34373a3d, 0x30333639, 0x3c3f3235, \
-                         0x282b2e21, 0x24272a2d, 0x20232629, 0x2c2f2225, \
-                         0x181b1e11, 0x14171a1d, 0x10131619, 0x1c1f1215, \
-                         0x080b0e01, 0x04070a0d, 0x00030609, 0x0c0f0205 ); \
-  SUBSH_MASK[6] = _mm512_set_epi32( \
-                         0x393c3f32, 0x35383b3e, 0x3134373a, 0x3d303336, \
-                         0x292c2f22, 0x25282b2e, 0x2124272a, 0x2d202326, \
-                         0x191c1f12, 0x15181b1e, 0x1114171a, 0x1d101316, \
-                         0x090c0f02, 0x05080b0e, 0x0104070a, 0x0d000306 ); \
-  SUBSH_MASK[7] = _mm512_set_epi32( \
-                         0x3e313437, 0x3a3d3033, 0x36393c3f, 0x3235383b, \
-                         0x2e212427, 0x2a2d2023, 0x26292c2f, 0x2225282b, \
-                         0x1e111417, 0x1a1d1013, 0x16191c1f, 0x1215181b, \
-                         0x0e010407, 0x0a0d0003, 0x06090c0f, 0x0205080b ); \
-  for( i = 0; i < ROUNDS1024; i++ ) \
-  { \
-    ROUND_CONST_P[i] = _mm512_set4_epi32( 0xf0e0d0c0 ^ (i * 0x01010101), \
-                                          0xb0a09080 ^ (i * 0x01010101), \
-                                          0x70605040 ^ (i * 0x01010101), \
-                                          0x30201000 ^ (i * 0x01010101) ); \
-    ROUND_CONST_Q[i] = _mm512_set4_epi32( 0x0f1f2f3f ^ (i * 0x01010101), \
-                                          0x4f5f6f7f ^ (i * 0x01010101), \
-                                          0x8f9fafbf ^ (i * 0x01010101), \
-                                          0xcfdfefff ^ (i * 0x01010101));\
-  } \
-}while(0);\
-
 /* one round
 * a0-a7 = input rows
 * b0-b7 = output rows
@@ -242,30 +249,32 @@ __m512i ALL_FF;
  for ( round_counter = 0; round_counter < 14; round_counter += 2 ) \
  { \
    /* AddRoundConstant P1024 */\
-    xmm8 = _mm512_xor_si512( xmm8, ( ROUND_CONST_P[ round_counter ] ) );\
+    xmm8 = _mm512_xor_si512( xmm8, m512_const1_128( \
+             casti_m128i( round_const_p, round_counter ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
-    xmm8  = _mm512_shuffle_epi8( xmm8,  ( SUBSH_MASK[0] ) );\
-    xmm9  = _mm512_shuffle_epi8( xmm9,  ( SUBSH_MASK[1] ) );\
-    xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[2] ) );\
-    xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[3] ) );\
-    xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[4] ) );\
-    xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[5] ) );\
-    xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[6] ) );\
-    xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[7] ) );\
+    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK0 ); \
+    xmm9  = _mm512_shuffle_epi8( xmm9,  SUBSH_MASK1 );\
+    xmm10 = _mm512_shuffle_epi8( xmm10, SUBSH_MASK2 );\
+    xmm11 = _mm512_shuffle_epi8( xmm11, SUBSH_MASK3 );\
+    xmm12 = _mm512_shuffle_epi8( xmm12, SUBSH_MASK4 );\
+    xmm13 = _mm512_shuffle_epi8( xmm13, SUBSH_MASK5 );\
+    xmm14 = _mm512_shuffle_epi8( xmm14, SUBSH_MASK6 );\
+    xmm15 = _mm512_shuffle_epi8( xmm15, SUBSH_MASK7 );\
    /* SubBytes + MixBytes */\
    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
    \
     /* AddRoundConstant P1024 */\
-    xmm0 = _mm512_xor_si512( xmm0, ( ROUND_CONST_P[ round_counter+1 ] ) );\
+    xmm0 = _mm512_xor_si512( xmm0, m512_const1_128( \
+             casti_m128i( round_const_p, round_counter+1 ) ) ); \
    /* ShiftBytes P1024 + pre-AESENCLAST */\
-    xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[0] ) );\
-    xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[1] ) );\
-    xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[2] ) );\
-    xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[3] ) );\
-    xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[4] ) );\
-    xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[5] ) );\
-    xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[6] ) );\
-    xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[7] ) );\
+    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK0 );\
+    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK1 );\
+    xmm2 = _mm512_shuffle_epi8( xmm2, SUBSH_MASK2 );\
+    xmm3 = _mm512_shuffle_epi8( xmm3, SUBSH_MASK3 );\
+    xmm4 = _mm512_shuffle_epi8( xmm4, SUBSH_MASK4 );\
+    xmm5 = _mm512_shuffle_epi8( xmm5, SUBSH_MASK5 );\
+    xmm6 = _mm512_shuffle_epi8( xmm6, SUBSH_MASK6 );\
+    xmm7 = _mm512_shuffle_epi8( xmm7, SUBSH_MASK7 );\
    /* SubBytes + MixBytes */\
     SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
  }\
@@ -284,16 +293,17 @@ __m512i ALL_FF;
    xmm12 = _mm512_xor_si512( xmm12, xmm1 );\
    xmm13 = _mm512_xor_si512( xmm13, xmm1 );\
    xmm14 = _mm512_xor_si512( xmm14, xmm1 );\
-    xmm15 = _mm512_xor_si512( xmm15, ( ROUND_CONST_Q[ round_counter ] ) );\
+    xmm15 = _mm512_xor_si512( xmm15, m512_const1_128( \
+                 casti_m128i( round_const_q, round_counter ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm8  = _mm512_shuffle_epi8( xmm8,  ( SUBSH_MASK[1] ) );\
-    xmm9  = _mm512_shuffle_epi8( xmm9,  ( SUBSH_MASK[3] ) );\
-    xmm10 = _mm512_shuffle_epi8( xmm10, ( SUBSH_MASK[5] ) );\
-    xmm11 = _mm512_shuffle_epi8( xmm11, ( SUBSH_MASK[7] ) );\
-    xmm12 = _mm512_shuffle_epi8( xmm12, ( SUBSH_MASK[0] ) );\
-    xmm13 = _mm512_shuffle_epi8( xmm13, ( SUBSH_MASK[2] ) );\
-    xmm14 = _mm512_shuffle_epi8( xmm14, ( SUBSH_MASK[4] ) );\
-    xmm15 = _mm512_shuffle_epi8( xmm15, ( SUBSH_MASK[6] ) );\
+    xmm8  = _mm512_shuffle_epi8( xmm8,  SUBSH_MASK1 );\
+    xmm9  = _mm512_shuffle_epi8( xmm9,  SUBSH_MASK3 );\
+    xmm10 = _mm512_shuffle_epi8( xmm10, SUBSH_MASK5 );\
+    xmm11 = _mm512_shuffle_epi8( xmm11, SUBSH_MASK7 );\
+    xmm12 = _mm512_shuffle_epi8( xmm12, SUBSH_MASK0 );\
+    xmm13 = _mm512_shuffle_epi8( xmm13, SUBSH_MASK2 );\
+    xmm14 = _mm512_shuffle_epi8( xmm14, SUBSH_MASK4 );\
+    xmm15 = _mm512_shuffle_epi8( xmm15, SUBSH_MASK6 );\
    /* SubBytes + MixBytes */\
    SUBMIX(xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7);\
    \
@@ -306,16 +316,17 @@ __m512i ALL_FF;
    xmm4 = _mm512_xor_si512( xmm4, xmm9 );\
    xmm5 = _mm512_xor_si512( xmm5, xmm9 );\
    xmm6 = _mm512_xor_si512( xmm6, xmm9 );\
-    xmm7 = _mm512_xor_si512( xmm7, ( ROUND_CONST_Q[ round_counter+1 ] ) );\
+    xmm7 = _mm512_xor_si512( xmm7, m512_const1_128( \
+             casti_m128i( round_const_q, round_counter+1 ) ) ); \
    /* ShiftBytes Q1024 + pre-AESENCLAST */\
-    xmm0 = _mm512_shuffle_epi8( xmm0, ( SUBSH_MASK[1] ) );\
-    xmm1 = _mm512_shuffle_epi8( xmm1, ( SUBSH_MASK[3] ) );\
-    xmm2 = _mm512_shuffle_epi8( xmm2, ( SUBSH_MASK[5] ) );\
-    xmm3 = _mm512_shuffle_epi8( xmm3, ( SUBSH_MASK[7] ) );\
-    xmm4 = _mm512_shuffle_epi8( xmm4, ( SUBSH_MASK[0] ) );\
-    xmm5 = _mm512_shuffle_epi8( xmm5, ( SUBSH_MASK[2] ) );\
-    xmm6 = _mm512_shuffle_epi8( xmm6, ( SUBSH_MASK[4] ) );\
-    xmm7 = _mm512_shuffle_epi8( xmm7, ( SUBSH_MASK[6] ) );\
+    xmm0 = _mm512_shuffle_epi8( xmm0, SUBSH_MASK1 );\
+    xmm1 = _mm512_shuffle_epi8( xmm1, SUBSH_MASK3 );\
+    xmm2 = _mm512_shuffle_epi8( xmm2, SUBSH_MASK5 );\
+    xmm3 = _mm512_shuffle_epi8( xmm3, SUBSH_MASK7 );\
+    xmm4 = _mm512_shuffle_epi8( xmm4, SUBSH_MASK0 );\
+    xmm5 = _mm512_shuffle_epi8( xmm5, SUBSH_MASK2 );\
+    xmm6 = _mm512_shuffle_epi8( xmm6, SUBSH_MASK4 );\
+    xmm7 = _mm512_shuffle_epi8( xmm7, SUBSH_MASK6 );\
    /* SubBytes + MixBytes */\
    SUBMIX(xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15);\
  }\
--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -1,4 +1,7 @@
 #include "myrgr-gate.h"
+
+#if !defined(MYRGR_8WAY) && !defined(MYRGR_4WAY)
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
@@ -86,3 +89,4 @@ int scanhash_myriad( struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+#endif
--- a/algo/groestl/sph_groestl.c
+++ b/algo/groestl/sph_groestl.c
@@ -35,6 +35,8 @@

 #include "sph_groestl.h"

+#if !defined(__AES__)
+
 #ifdef __cplusplus
 extern "C"{
 #endif
@@ -3116,4 +3118,6 @@ sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 #ifdef __cplusplus
 }
 #endif
+
+#endif  // !AES
--- a/algo/groestl/sph_groestl.h
+++ b/algo/groestl/sph_groestl.h
@@ -42,6 +42,7 @@ extern "C"{
 #include <stddef.h>
 #include "algo/sha/sph_types.h"

+#if !defined(__AES__)   
 /**
 * Output size (in bits) for Groestl-224.
 */
@@ -326,4 +327,5 @@ void sph_groestl512_addbits_and_close(
 }
 #endif

+#endif  // !AES
 #endif
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -1,156 +0,0 @@
-#include "algo-gate-api.h"
-#include <stdio.h>
-#include <string.h>
-#include <openssl/sha.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include "sph_hefty1.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/echo/sph_echo.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/luffa/luffa_for_sse2.h"
-#ifdef __AES__
-  #include "algo/echo/aes_ni/hash_api.h"
-#endif
-
-void bastionhash(void *output, const void *input)
-{
-	unsigned char hash[64] __attribute__ ((aligned (64)));
-
-#ifdef __AES__
-   hashState_echo          ctx_echo;
-#else
-   sph_echo512_context     ctx_echo;
-#endif
-   hashState_luffa         ctx_luffa;
-	sph_fugue512_context ctx_fugue;
-	sph_whirlpool_context ctx_whirlpool;
-	sph_shabal512_context ctx_shabal;
-   sph_hamsi512_context ctx_hamsi;
-	sph_skein512_context ctx_skein;
-
-	HEFTY1(input, 80, hash);
-
-        init_luffa( &ctx_luffa, 512 );
-        update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
-                                (const BitSequence*)hash, 64 );
-
-	if (hash[0] & 0x8)
-	{
-		sph_fugue512_init(&ctx_fugue);
-		sph_fugue512(&ctx_fugue, hash, 64);
-		sph_fugue512_close(&ctx_fugue, hash);
-	} else {
-   sph_skein512_init( &ctx_skein );
-   sph_skein512( &ctx_skein, hash, 64 );
-   sph_skein512_close( &ctx_skein, hash );
-	}
-
-	sph_whirlpool_init(&ctx_whirlpool);
-	sph_whirlpool(&ctx_whirlpool, hash, 64);
-	sph_whirlpool_close(&ctx_whirlpool, hash);
-
-	sph_fugue512_init(&ctx_fugue);
-	sph_fugue512(&ctx_fugue, hash, 64);
-	sph_fugue512_close(&ctx_fugue, hash);
-
-	if (hash[0] & 0x8)
-	{
-#ifdef __AES__
-      init_echo( &ctx_echo, 512 );
-      update_final_echo ( &ctx_echo,(BitSequence*)hash,
-                              (const BitSequence*)hash, 512 );
-#else
-		sph_echo512_init(&ctx_echo);
-		sph_echo512(&ctx_echo, hash, 64);
-		sph_echo512_close(&ctx_echo, hash);
-#endif
-	} else {
-      init_luffa( &ctx_luffa, 512 );
-      update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
-                                    (const BitSequence*)hash, 64 );
-	}
-
-	sph_shabal512_init(&ctx_shabal);
-	sph_shabal512(&ctx_shabal, hash, 64);
-	sph_shabal512_close(&ctx_shabal, hash);
-
-   sph_skein512_init( &ctx_skein );
-   sph_skein512( &ctx_skein, hash, 64 );
-   sph_skein512_close( &ctx_skein, hash );
-
-	if (hash[0] & 0x8)
-	{
-		sph_shabal512_init(&ctx_shabal);
-		sph_shabal512(&ctx_shabal, hash, 64);
-		sph_shabal512_close(&ctx_shabal, hash);
-	} else {
-		sph_whirlpool_init(&ctx_whirlpool);
-		sph_whirlpool(&ctx_whirlpool, hash, 64);
-		sph_whirlpool_close(&ctx_whirlpool, hash);
-	}
-
-	sph_shabal512_init(&ctx_shabal);
-	sph_shabal512(&ctx_shabal, hash, 64);
-	sph_shabal512_close(&ctx_shabal, hash);
-
-	if (hash[0] & 0x8)
-	{
-		sph_hamsi512_init(&ctx_hamsi);
-		sph_hamsi512(&ctx_hamsi, hash, 64);
-		sph_hamsi512_close(&ctx_hamsi, hash);
-	} else {
-      init_luffa( &ctx_luffa, 512 );
-      update_and_final_luffa( &ctx_luffa, (BitSequence*)hash,
-                                    (const BitSequence*)hash, 64 );
-	}
-
-	memcpy(output, hash, 32);
-}
-
-int scanhash_bastion( struct work *work, uint32_t max_nonce,
-      uint64_t *hashes_done, struct thr_info *mythr)
-{
-	uint32_t _ALIGN(64) hash32[8];
-	uint32_t _ALIGN(64) endiandata[20];
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-
-	uint32_t n = first_nonce;
-
-	for (int i=0; i < 19; i++) 
-		be32enc(&endiandata[i], pdata[i]);
-
-	do {
-		be32enc(&endiandata[19], n);
-		bastionhash(hash32, endiandata);
-		if (hash32[7] < Htarg && fulltest(hash32, ptarget)) {
-			pdata[19] = n;
-         submit_solution( work, hash32, mythr );
-		}
-		n++;
-
-	} while (n < max_nonce && !work_restart[thr_id].restart);
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-
-	return 0;
-}
-
-bool register_bastion_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->scanhash = (void*)&scanhash_bastion;
-  gate->hash     = (void*)&bastionhash;
-  return true;
-};
-
--- a/algo/heavy/heavy.c
+++ b/algo/heavy/heavy.c
@@ -1,111 +0,0 @@
-#include <string.h>
-#include <openssl/sha.h>
-#include <stdint.h>
-
-#include "algo-gate-api.h"
-#include "sph_hefty1.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/blake/sph_blake.h"
-#include "algo/groestl/sph_groestl.h"
-
-/* Combines top 64-bits from each hash into a single hash */
-static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
-{
-    uint32_t *hash[4] = { hash1, hash2, hash3, hash4 };
-
-    /* Transpose first 64 bits of each hash into out */
-    memset(out, 0, 32);
-    int bits = 0;
-    for (unsigned int i = 7; i >= 6; i--) {
-        for (uint32_t mask = 0x80000000; mask; mask >>= 1) {
-            for (unsigned int k = 0; k < 4; k++) {
-                out[(255 - bits)/32] <<= 1;
-                if ((hash[k][i] & mask) != 0)
-                    out[(255 - bits)/32] |= 1;
-                bits++;
-            }
-        }
-    }
-}
-
-extern void heavyhash(unsigned char* output, const unsigned char* input, int len)
-{
-    unsigned char hash1[32];
-    HEFTY1(input, len, hash1);
-
-// HEFTY1 is new, so take an extra security measure to eliminate
-//     * the possiblity of collisions:
-//     *
-//     *     Hash(x) = SHA256(x + HEFTY1(x))
-//     *
-//     * N.B. '+' is concatenation.
-//
-    unsigned char hash2[32];;
-    SHA256_CTX ctx;
-    SHA256_Init(&ctx);
-    SHA256_Update(&ctx, input, len);
-    SHA256_Update(&ctx, hash1, sizeof(hash1));
-    SHA256_Final(hash2, &ctx);
-
-//   * Additional security: Do not rely on a single cryptographic hash
-//     * function.  Instead, combine the outputs of 4 of the most secure
-//     * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512
-//     * and BLAKE512.
-
-
-    uint32_t hash3[16];
-    sph_keccak512_context keccakCtx;
-    sph_keccak512_init(&keccakCtx);
-    sph_keccak512(&keccakCtx, input, len);
-    sph_keccak512(&keccakCtx, hash1, sizeof(hash1));
-    sph_keccak512_close(&keccakCtx, (void *)&hash3);
-
-    uint32_t hash4[16];
-    sph_groestl512_context groestlCtx;
-    sph_groestl512_init(&groestlCtx);
-    sph_groestl512(&groestlCtx, input, len);
-    sph_groestl512(&groestlCtx, hash1, sizeof(hash1));
-    sph_groestl512_close(&groestlCtx, (void *)&hash4);
-
-    uint32_t hash5[16];
-    sph_blake512_context blakeCtx;
-    sph_blake512_init(&blakeCtx);
-    sph_blake512(&blakeCtx, input, len);
-    sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1));
-    sph_blake512_close(&blakeCtx, (void *)&hash5);
-
-    uint32_t *final = (uint32_t *)output;
-    combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5);
-
-}
-
-int scanhash_heavy( uint32_t *pdata, const uint32_t *ptarget,
-            uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
-{
-    uint32_t hash[8];
-    uint32_t start_nonce = pdata[19];
-    int thr_id = mythr->id;  // thr_id arg is deprecated
-    
-    do {
-        heavyhash((unsigned char *)hash, (unsigned char *)pdata, 80);
-    
-        if (hash[7] <= ptarget[7]) {
-            if (fulltest(hash, ptarget)) {
-                *hashes_done = pdata[19] - start_nonce;
-                return 1;
-                break;
-            }
-        }
-        pdata[19]++;
-    } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
-    *hashes_done = pdata[19] - start_nonce;
-    return 0;
-}
-
-bool register_heavy_algo( algo_gate_t* gate )
-{
-    gate->scanhash = (void*)&scanhash_heavy;
-    gate->hash     = (void*)&heavyhash;
-    return true;
-};
-
--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -1,4 +1,7 @@
 #include "jha-gate.h"
+
+#if !defined(JHA_8WAY) && !defined(JHA_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -133,3 +136,4 @@ int scanhash_jha( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -3,6 +3,8 @@
 #include "keccak-hash-4way.h"
 #include "keccak-gate.h"

+#if defined(__AVX2__)
+
 static const uint64_t RC[] = {
        0x0000000000000001, 0x0000000000008082,
        0x800000000000808A, 0x8000000080008000,
@@ -239,7 +241,7 @@ keccak512_8way_close(void *cc, void *dst)

 #endif  // AVX512

-#if defined(__AVX2__)
+// AVX2

 #define INPUT_BUF(size)   do { \
    size_t j; \
--- a/algo/keccak/keccak.c
+++ b/algo/keccak/keccak.c
@@ -1,4 +1,6 @@
-#include "algo-gate-api.h"
+#include "keccak-gate.h"
+
+#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY)

 #include <stdlib.h>
 #include <string.h>
@@ -49,3 +51,4 @@ int scanhash_keccak( struct work *work, uint32_t max_nonce,
   return 0;
 }

+#endif
--- a/algo/keccak/sha3d.c
+++ b/algo/keccak/sha3d.c
@@ -1,4 +1,7 @@
-#include "algo-gate-api.h"
+#include "keccak-gate.h"
+
+#if !defined(KECCAK_8WAY) && !defined(KECCAK_4WAY)
+
 #include <stdlib.h>
 #include <string.h>
 #include <stdint.h>
@@ -48,3 +51,4 @@ int scanhash_sha3d( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/luffa/luffa.c
+++ b/algo/luffa/luffa.c
@@ -1,63 +0,0 @@
-#include "algo-gate-api.h"
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "sph_luffa.h"
-
-void luffahash(void *output, const void *input)
-{
-	unsigned char _ALIGN(128) hash[64];
-	sph_luffa512_context ctx_luffa;
-
-	sph_luffa512_init(&ctx_luffa);
-	sph_luffa512 (&ctx_luffa, input, 80);
-	sph_luffa512_close(&ctx_luffa, (void*) hash);
-
-	memcpy(output, hash, 32);
-}
-
-int scanhash_luffa(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-
-	uint32_t _ALIGN(64) hash64[8];
-	uint32_t _ALIGN(64) endiandata[20];
-
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-
-	uint32_t n = first_nonce;
-
-        for (int i=0; i < 19; i++) 
-                be32enc(&endiandata[i], pdata[i]);
-
-	do {
-		be32enc(&endiandata[19], n);
-		luffahash(hash64, endiandata);
-		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
-			*hashes_done = n - first_nonce + 1;
-			pdata[19] = n;
-			return true;
-		}
-		n++;
-
-	} while (n < max_nonce && !work_restart[thr_id].restart);
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-
-	return 0;
-}
-
-bool register_luffa_algo( algo_gate_t* gate )
-{
-    gate->scanhash = (void*)&scanhash_luffa;
-    gate->hash     = (void*)&luffahash;
-    return true;
-};
-
--- a/algo/luffa/luffa_for_sse2.h
+++ b/algo/luffa/luffa_for_sse2.h
@@ -1,3 +1,6 @@
+#if !defined(LUFFA_FOR_SSE2_H__)
+#define LUFFA_FOR_SSE2_H__ 1
+
 /*
 * luffa_for_sse2.h
 * Version 2.0 (Sep 15th 2009)
@@ -48,8 +51,6 @@
 typedef struct {
    uint32 buffer[8] __attribute((aligned(32)));
    __m128i chainv[10] __attribute((aligned(32)));   /* Chaining values */
-//    uint64 bitlen[2]; /* Message length in bits */
-//    uint32 rembitlen; /* Length of buffer data to be hashed */
    int hashbitlen;
    int rembytes;
 } hashState_luffa;
@@ -67,4 +68,4 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,

 int luffa_full( hashState_luffa *state, BitSequence* output, int hashbitlen,
                                   const BitSequence* data, size_t inlen );
-
+#endif   // LUFFA_FOR_SSE2_H__
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -115,9 +115,8 @@ void allium_16way_hash( void *state, const void *input )
   intrlv_4x128( vhashA, hash0, hash1, hash2, hash3, 256 );
   intrlv_4x128( vhashB, hash4, hash5, hash6, hash7, 256 );

-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+   cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
+   cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );

   dintrlv_4x128( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhashB, 256 );
@@ -125,10 +124,8 @@ void allium_16way_hash( void *state, const void *input )
   intrlv_4x128( vhashA, hash8, hash9, hash10, hash11, 256 );
   intrlv_4x128( vhashB, hash12, hash13, hash14, hash15, 256 );

-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashA, vhashA, 32 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhashB, vhashB, 32 );
+   cube_4way_full( &ctx.cube, vhashA, 256, vhashA, 32 );
+   cube_4way_full( &ctx.cube, vhashB, 256, vhashB, 32 );

   dintrlv_4x128( hash8, hash9, hash10, hash11, vhashA, 256 );
   dintrlv_4x128( hash12, hash13, hash14, hash15, vhashB, 256 );
@@ -169,7 +166,6 @@ void allium_16way_hash( void *state, const void *input )
   skein256_8way_update( &ctx.skein, vhashB, 32 );
   skein256_8way_close( &ctx.skein, vhashB );

-
   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
                 vhashA, 256 );
   dintrlv_8x64( hash8, hash9, hash10, hash11, hash12, hash13, hash14, hash15,
@@ -179,77 +175,43 @@ void allium_16way_hash( void *state, const void *input )

   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );

-   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+   groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );

   dintrlv_4x128( state, state+32, state+64, state+96, vhash, 256 );
   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );

-   groestl256_4way_init( &ctx.groestl, 32 );
-   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+   groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
   
   dintrlv_4x128( state+128, state+160, state+192, state+224, vhash, 256 );
   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );

-   groestl256_4way_init( &ctx.groestl, 32 );
-   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+   groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );

   dintrlv_4x128( state+256, state+288, state+320, state+352, vhash, 256 );
   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );

-   groestl256_4way_init( &ctx.groestl, 32 );
-   groestl256_4way_update_close( &ctx.groestl, vhash, vhash, 256 );
+   groestl256_4way_full( &ctx.groestl, vhash, vhash, 256 );
 
   dintrlv_4x128( state+384, state+416, state+448, state+480, vhash, 256 );
   
 #else

-   update_and_final_groestl256( &ctx.groestl, state, hash0, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+32, hash1, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+64, hash2, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+96, hash3, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+128, hash4, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+160, hash5, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+192, hash6, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+224, hash7, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+256, hash8, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+288, hash9, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+320, hash10, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+352, hash11, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+384, hash12, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+416, hash13, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+448, hash14, 256 );
-   memcpy( &ctx.groestl, &allium_16way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, state+480, hash15, 256 );
-
+   groestl256_full( &ctx.groestl, state,     hash0,  256 );
+   groestl256_full( &ctx.groestl, state+32,  hash1,  256 );
+   groestl256_full( &ctx.groestl, state+64,  hash2,  256 );
+   groestl256_full( &ctx.groestl, state+96,  hash3,  256 );
+   groestl256_full( &ctx.groestl, state+128, hash4,  256 );
+   groestl256_full( &ctx.groestl, state+160, hash5,  256 );
+   groestl256_full( &ctx.groestl, state+192, hash6,  256 );
+   groestl256_full( &ctx.groestl, state+224, hash7,  256 );
+   groestl256_full( &ctx.groestl, state+256, hash8,  256 );
+   groestl256_full( &ctx.groestl, state+288, hash9,  256 );
+   groestl256_full( &ctx.groestl, state+320, hash10, 256 );
+   groestl256_full( &ctx.groestl, state+352, hash11, 256 );
+   groestl256_full( &ctx.groestl, state+384, hash12, 256 );
+   groestl256_full( &ctx.groestl, state+416, hash13, 256 );
+   groestl256_full( &ctx.groestl, state+448, hash14, 256 );
+   groestl256_full( &ctx.groestl, state+480, hash15, 256 );
 #endif
 }

@@ -393,28 +355,14 @@ void allium_8way_hash( void *hash, const void *input )
   dintrlv_4x64( hash0, hash1, hash2, hash3, vhashA, 256 );
   dintrlv_4x64( hash4, hash5, hash6, hash7, vhashB, 256 );

-   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash4, hash4, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash5, hash5, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash6, hash6, 256 );
-   memcpy( &ctx.groestl, &allium_8way_ctx.groestl,
-           sizeof(hashState_groestl256) );
-   update_and_final_groestl256( &ctx.groestl, hash7, hash7, 256 );
+   groestl256_full( &ctx.groestl, hash0, hash0, 256 );
+   groestl256_full( &ctx.groestl, hash1, hash1, 256 );
+   groestl256_full( &ctx.groestl, hash2, hash2, 256 );
+   groestl256_full( &ctx.groestl, hash3, hash3, 256 );
+   groestl256_full( &ctx.groestl, hash4, hash4, 256 );
+   groestl256_full( &ctx.groestl, hash5, hash5, 256 );
+   groestl256_full( &ctx.groestl, hash6, hash6, 256 );
+   groestl256_full( &ctx.groestl, hash7, hash7, 256 );
 }

 int scanhash_allium_8way( struct work *work, uint32_t max_nonce,
--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -1,4 +1,7 @@
 #include "lyra2-gate.h"
+
+#if !( defined(ALLIUM_16WAY) || defined(ALLIUM_8WAY) || defined(ALLIUM_4WAY) )
+
 #include <memory.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/keccak/sph_keccak.h"
@@ -107,3 +110,4 @@ int scanhash_allium( struct work *work, uint32_t max_nonce,
    return 0;
 }

+#endif
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -75,7 +75,6 @@ int scanhash_lyra2rev2_4way( struct work *work, uint32_t max_nonce,
 bool init_lyra2rev2_4way_ctx();

 #else
-
 void lyra2rev2_hash( void *state, const void *input );
 int scanhash_lyra2rev2( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
--- a/algo/lyra2/lyra2h.c
+++ b/algo/lyra2/lyra2h.c
@@ -1,4 +1,7 @@
 #include "lyra2-gate.h"
+
+#if !( defined(LYRA2H_8WAY) || defined(LYRA2H_4WAY) )
+
 #include <memory.h>
 #include <mm_malloc.h>
 #include "lyra2.h"
@@ -71,3 +74,4 @@ int scanhash_lyra2h( struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+#endif
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -1,4 +1,7 @@
 #include "lyra2-gate.h"
+
+#if !( defined(LYRA2REV2_16WAY) || defined(LYRA2REV2_8WAY) || defined(LYRA2REV2_4WAY) )
+
 #include <memory.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/cubehash/sph_cubehash.h"
@@ -107,4 +110,4 @@ int scanhash_lyra2rev2( struct work *work,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
-
+#endif
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -79,19 +79,16 @@ void lyra2rev3_16way_hash( void *state, const void *input )
   dintrlv_2x256( hash14, hash15, vhash, 256 );

   intrlv_4x128( vhash, hash0, hash1, hash2, hash3, 256 );
-   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
   dintrlv_4x128( hash0, hash1, hash2, hash3, vhash, 256 );
   intrlv_4x128( vhash, hash4, hash5, hash6, hash7, 256 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
   dintrlv_4x128( hash4, hash5, hash6, hash7, vhash, 256 );
   intrlv_4x128( vhash, hash8, hash9, hash10, hash11, 256 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
   dintrlv_4x128( hash8, hash9, hash10, hash11, vhash, 256 );
   intrlv_4x128( vhash, hash12, hash13, hash14, hash15, 256 );
-   cube_4way_init( &ctx.cube, 256, 16, 32 );
-   cube_4way_update_close( &ctx.cube, vhash, vhash, 32 );
+   cube_4way_full( &ctx.cube, vhash, 256, vhash, 32 );
   dintrlv_4x128( hash12, hash13, hash14, hash15, vhash, 256 );

   intrlv_2x256( vhash, hash0, hash1, 256 );
@@ -224,21 +221,14 @@ void lyra2rev3_8way_hash( void *state, const void *input )
   LYRA2REV3( l2v3_wholeMatrix, hash6, 32, hash6, 32, hash6, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash7, 32, hash7, 32, hash7, 32, 1, 4, 4 );

-   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash4, (const byte*) hash4, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash5, (const byte*) hash5, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash6, (const byte*) hash6, 32 );
-   cubehashInit( &ctx.cube, 256, 16, 32 );
-   cubehashUpdateDigest( &ctx.cube, (byte*) hash7, (const byte*) hash7, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash0, 256, (const byte*) hash0, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash1, 256, (const byte*) hash1, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash2, 256, (const byte*) hash2, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash3, 256, (const byte*) hash3, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash4, 256, (const byte*) hash4, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash5, 256, (const byte*) hash5, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash6, 256, (const byte*) hash6, 32 );
+   cubehash_full( &ctx.cube, (byte*) hash7, 256, (const byte*) hash7, 32 );

   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
@@ -265,25 +255,24 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
   uint32_t *hash7 = &hash[7<<3];
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
-   const uint32_t *ptarget = work->target;
+   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )  ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   if ( bench )  ptarget[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
-
+   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
   blake256_8way_init( &l2v3_8way_ctx.blake );
   blake256_8way_update( &l2v3_8way_ctx.blake, vdata, 64 );

   do
   {
-      *noncev = mm256_bswap_32( _mm256_set_epi32( n+7, n+6, n+5, n+4,
-                                                  n+3, n+2, n+1, n ) );
-
      lyra2rev3_8way_hash( hash, vdata );
      pdata[19] = n;

@@ -291,15 +280,17 @@ int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
      if ( unlikely( hash7[lane] <= Htarg ) )
      {
         extr_lane_8x32( lane_hash, hash, lane, 256 );
-         if ( likely( fulltest( lane_hash, ptarget ) && !opt_benchmark ) )
+         if ( likely( valid_hash( lane_hash, ptarget ) && !bench ) )
         {
-             pdata[19] = n + lane;
+             pdata[19] = bswap_32( n + lane );
             submit_lane_solution( work, lane_hash, mythr, lane );
         }
      }
+      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
      n += 8;
-   } while ( likely( (n < max_nonce-8) && !work_restart[thr_id].restart ) );
-   *hashes_done = n - first_nonce + 1;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/lyra2/lyra2rev3.c
+++ b/algo/lyra2/lyra2rev3.c
@@ -1,4 +1,7 @@
 #include "lyra2-gate.h"
+
+#if !( defined(LYRA2REV3_16WAY) || defined(LYRA2REV3_8WAY) || defined(LYRA2REV3_4WAY) )
+
 #include <memory.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/cubehash/sph_cubehash.h"
@@ -96,4 +99,4 @@ int scanhash_lyra2rev3( struct work *work,
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
 }
-
+#endif
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -97,41 +97,42 @@ void lyra2z_16way_hash( void *state, const void *input )
 int scanhash_lyra2z_16way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*16] __attribute__ ((aligned (128)));
+   uint64_t hash[4*16] __attribute__ ((aligned (128)));
   uint32_t vdata[20*16] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
+   const uint32_t last_nonce = max_nonce - 16;
   __m512i  *noncev = (__m512i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
-      ptarget[7] = 0x0000ff;
+   if ( bench )   ptarget[7] = 0x0000ff;

   mm512_bswap32_intrlv80_16x32( vdata, pdata );
+   *noncev = _mm512_set_epi32( n+15, n+14, n+13, n+12, n+11, n+10, n+ 9, n+ 8,
+                               n+ 7, n+ 6, n+ 5, n+ 4, n+ 3, n+ 2, n +1, n );
   lyra2z_16way_midstate( vdata );

   do {
-      *noncev = mm512_bswap_32( _mm512_set_epi32( n+15, n+14, n+13, n+12,
-                                                  n+11, n+10, n+ 9, n+ 8,
-                                                  n+ 7, n+ 6, n+ 5, n+ 4,
-                                                  n+ 3, n+ 2, n+ 1, n ) );
      lyra2z_16way_hash( hash, vdata );
-      pdata[19] = n;

-      for ( int i = 0; i < 16; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
-           && !opt_benchmark )
+      for ( int lane = 0; lane < 16; lane++ )
      {
-          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+        const uint64_t *lane_hash = hash + (lane<<2);
+        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
+        {
+           pdata[19] = bswap_32( n + lane );
+           submit_lane_solution( work, lane_hash, mythr, lane );
+        }
      }
+      *noncev = _mm512_add_epi32( *noncev, m512_const1_32( 16 ) );
      n += 16;
-   } while ( (n < max_nonce-16) && !work_restart[thr_id].restart);
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

-   *hashes_done = n - first_nonce + 1;
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
   return 0;
 }

@@ -195,39 +196,40 @@ void lyra2z_8way_hash( void *state, const void *input )
 int scanhash_lyra2z_8way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint64_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 8;
   uint32_t n = first_nonce;
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
-      ptarget[7] = 0x0000ff;
+   if ( bench )  ptarget[7] = 0x0000ff;

   mm256_bswap32_intrlv80_8x32( vdata, pdata );
+   *noncev = _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n );
   lyra2z_8way_midstate( vdata );

   do {
-      *noncev = mm256_bswap_32(
-                 _mm256_set_epi32( n+7, n+6, n+5, n+4, n+3, n+2, n+1, n ) );
      lyra2z_8way_hash( hash, vdata );
-      pdata[19] = n;

-      for ( int i = 0; i < 8; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
-           && !opt_benchmark )
+      for ( int lane = 0; lane < 8; lane++ )
      {
-          pdata[19] = n+i;
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+        const uint64_t *lane_hash = hash + (lane<<2);
+        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
+        {
+           pdata[19] = bswap_32( n + lane );
+           submit_lane_solution( work, lane_hash, mythr, lane );
+        }
      }
+      *noncev = _mm256_add_epi32( *noncev, m256_const1_32( 8 ) );
      n += 8;
-   } while ( (n < max_nonce-8) && !work_restart[thr_id].restart);
-
-   *hashes_done = n - first_nonce + 1;
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart) );
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
   return 0;
 }

@@ -274,39 +276,40 @@ void lyra2z_4way_hash( void *state, const void *input )
 int scanhash_lyra2z_4way( struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint64_t hash[4*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
+   const uint32_t last_nonce = max_nonce - 4;
   uint32_t n = first_nonce;
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id; 
+   const bool bench = opt_benchmark;

-   if ( opt_benchmark )
-      ptarget[7] = 0x0000ff;
+   if ( bench )   ptarget[7] = 0x0000ff;

   mm128_bswap32_intrlv80_4x32( vdata, pdata );
+   *noncev = _mm_set_epi32( n+3, n+2, n+1, n );
   lyra2z_4way_midstate( vdata );

   do {
-      *noncev = mm128_bswap_32( _mm_set_epi32( n+3, n+2, n+1, n ) );
-
      lyra2z_4way_hash( hash, vdata );
-      pdata[19] = n;
-
-      for ( int i = 0; i < 4; i++ )
-      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget )
-           && !opt_benchmark )
+      for ( int lane = 0; lane < 4; lane++ )
      {
-          pdata[19] = n+i;         
-          submit_lane_solution( work, hash+(i<<3), mythr, i );
+        const uint64_t *lane_hash = hash + (lane<<2);
+        if ( unlikely( valid_hash( lane_hash, ptarget ) && !bench ) )
+        {
+           pdata[19] = bswap_32( n + lane );
+           submit_lane_solution( work, lane_hash, mythr, lane );
+        }
      }
+      *noncev = _mm_add_epi32( *noncev, m128_const1_32( 4 ) );
      n += 4;
-   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+   } while ( likely( (n < last_nonce) && !work_restart[thr_id].restart ) );

-   *hashes_done = n - first_nonce + 1;
+   pdata[19] = n;
+   *hashes_done = n - first_nonce;
   return 0;
 }

--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -1,6 +1,9 @@
 #include <memory.h>
 #include <mm_malloc.h>
 #include "lyra2-gate.h"
+
+#if !( defined(LYRA2Z_16WAY) || defined(LYRA2Z_8WAY) || defined(LYRA2Z_4WAY) )
+
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
 #include "simd-utils.h"
@@ -80,4 +83,4 @@ int scanhash_lyra2z( struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
-
+#endif
--- a/algo/lyra2/phi2.c
+++ b/algo/lyra2/phi2.c
@@ -96,32 +96,30 @@ int scanhash_phi2( struct work *work, uint32_t max_nonce,
 	           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash[8];
-   uint32_t _ALIGN(128) endiandata[36];
+   uint32_t _ALIGN(128) edata[36];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   if(opt_benchmark){
-   	ptarget[7] = 0x00ff;
-   }
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;
+   if( bench )   	ptarget[7] = 0x00ff;

   phi2_has_roots = false;
-   for ( int i=0; i < 36; i++ )
+
+   for ( int i = 0; i < 36; i++ )
   {
-	   be32enc(&endiandata[i], pdata[i]);
+	   be32enc( &edata[i], pdata[i] );
      if ( i >= 20 && pdata[i] ) phi2_has_roots = true;
   }

   do {
-	be32enc( &endiandata[19], n );
-	phi2_hash( hash, endiandata );
-	if ( hash[7] < Htarg )
-   if ( fulltest( hash, ptarget ) && !opt_benchmark )
+	edata[19] = n;
+	phi2_hash( hash, edata );
+   if ( valid_hash( hash, ptarget ) && !opt_benchmark )
  	{
-       pdata[19] = n;
+       be32enc( pdata+19, n );
       submit_solution( work, hash, mythr );
   }
 	n++;
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -89,6 +89,9 @@ inline void initState( uint64_t State[/*16*/] )
 *
 * @param v     A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
 */
+
+#if !defined(__AVX512F__) && !defined(__AVX2__) && !defined(__SSE2__)
+
 inline static void blake2bLyra( uint64_t *v )
 {
    ROUND_LYRA(0);
@@ -114,6 +117,8 @@ inline static void reducedBlake2bLyra( uint64_t *v )
    ROUND_LYRA(0);
 }

+#endif
+
 /**
 * Performs a squeeze operation, using Blake2b's G function as the
 * internal permutation
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -171,7 +171,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7)

-
 #endif // AVX2 else SSE2

 // Scalar
@@ -200,7 +199,6 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);

-
 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

 union _ovly_512
--- a/algo/nist5/nist5.c
+++ b/algo/nist5/nist5.c
@@ -1,4 +1,7 @@
 #include "nist5-gate.h"
+
+#if !defined(NIST5_8WAY) && !defined(NIST5_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -105,13 +108,4 @@ int scanhash_nist5( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-/*
-bool register_nist5_algo( algo_gate_t* gate )
-{
-    gate->optimizations = SSE2_OPT | AES_OPT;
-    init_nist5_ctx();
-    gate->scanhash = (void*)&scanhash_nist5;
-    gate->hash     = (void*)&nist5hash;
-    return true;
-};
-*/
+#endif
--- a/algo/quark/anime.c
+++ b/algo/quark/anime.c
@@ -1,5 +1,8 @@
 #include "cpuminer-config.h"
 #include "anime-gate.h"
+
+#if !defined(ANIME_8WAY) && !defined(ANIME_4WAY)
+
 #include <stdio.h>
 #include <string.h>
 #include <stdint.h>
@@ -169,4 +172,4 @@ int scanhash_anime( struct work *work, uint32_t max_nonce,
    pdata[19] = n;
    return 0;
 }
-
+#endif
--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -1,4 +1,7 @@
 #include "hmq1725-gate.h"
+
+#if !defined(HMQ1725_8WAY) && !defined(HMQ1725_4WAY)
+
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake.h"
@@ -7,10 +10,7 @@
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
@@ -21,6 +21,9 @@
 #if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
+#else
+  #include "algo/groestl/sph_groestl.h"
+  #include "algo/echo/sph_echo.h"
 #endif
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
@@ -392,3 +395,4 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
+#endif
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -1,5 +1,8 @@
 #include "cpuminer-config.h"
 #include "quark-gate.h"
+
+#if !defined(QUARK_8WAY) && !defined(QUARK_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -137,4 +140,4 @@ int scanhash_quark( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
+#endif
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -1,4 +1,7 @@
 #include "deep-gate.h"
+
+#if !defined(DEEP_8WAY) && !defined(DEEP_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -114,4 +117,4 @@ int scanhash_deep( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
+#endif
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -1,4 +1,7 @@
 #include "qubit-gate.h"
+
+#if !defined(QUBIT_8WAY) && !defined(QUBIT_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -126,4 +129,4 @@ int scanhash_qubit( struct work *work,	uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
+#endif
--- a/algo/ripemd/lbry.c
+++ b/algo/ripemd/lbry.c
@@ -1,4 +1,7 @@
 #include "lbry-gate.h"
+
+#if !defined(LBRY_16WAY) && !defined(LBRY_8WAY) && !defined(LBRY_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -100,3 +103,4 @@ int scanhash_lbry( struct work *work, uint32_t max_nonce,
 	pdata[27] = n;
 	return 0;
 }
+#endif
--- a/algo/scrypt/pluck.c
+++ b/algo/scrypt/pluck.c
@@ -1,505 +0,0 @@
-/*
- * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2014 pooler, 2015 Jordan Earls
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include "cpuminer-config.h"
-#include "algo-gate-api.h"
-
-#include <stdlib.h>
-#include <string.h>
-
-#define BLOCK_HEADER_SIZE 80
-
-// windows
-#ifndef htobe32
-#define htobe32(x)  ((uint32_t)htonl((uint32_t)(x)))
-#endif
-
-#ifdef _MSC_VER
-#define ROTL(a, b) _rotl(a,b)
-#define ROTR(a, b) _rotr(a,b)
-#else
-#define ROTL(a, b) (((a) << b) | ((a) >> (32 - b)))
-#define ROTR(a, b) ((a >> b) | (a << (32 - b)))
-#endif
-
-#if defined(_MSC_VER) && defined(_M_X64)
-#define _VECTOR __vectorcall
-#include <intrin.h>
-//#include <emmintrin.h> //SSE2
-//#include <pmmintrin.h> //SSE3
-//#include <tmmintrin.h> //SSSE3
-//#include <smmintrin.h> //SSE4.1
-//#include <nmmintrin.h> //SSE4.2
-//#include <ammintrin.h> //SSE4A
-//#include <wmmintrin.h> //AES
-//#include <immintrin.h> //AVX
-#define OPT_COMPATIBLE
-#elif defined(__GNUC__) && defined(__x86_64__)
-#include <x86intrin.h>
-#define _VECTOR
-#endif
-
-static __thread char *scratchbuf;
-
-#ifdef OPT_COMPATIBLE
-static void _VECTOR xor_salsa8(__m128i B[4], const __m128i Bx[4], int i)
-{
-	__m128i X0, X1, X2, X3;
-
-	if (i <= 128) {
-		// a xor 0 = a
-		X0 = B[0] = Bx[0];
-		X1 = B[1] = Bx[1];
-		X2 = B[2] = Bx[2];
-		X3 = B[3] = Bx[3];
-	} else {
-		X0 = B[0] = _mm_xor_si128(B[0], Bx[0]);
-		X1 = B[1] = _mm_xor_si128(B[1], Bx[1]);
-		X2 = B[2] = _mm_xor_si128(B[2], Bx[2]);
-		X3 = B[3] = _mm_xor_si128(B[3], Bx[3]);
-	}
-
-	for (i = 0; i < 4; i++) {
-		/* Operate on columns. */
-		X1.m128i_u32[0] ^= ROTL(X0.m128i_u32[0] + X3.m128i_u32[0], 7);
-		X2.m128i_u32[1] ^= ROTL(X1.m128i_u32[1] + X0.m128i_u32[1], 7);
-		X3.m128i_u32[2] ^= ROTL(X2.m128i_u32[2] + X1.m128i_u32[2], 7);
-		X0.m128i_u32[3] ^= ROTL(X3.m128i_u32[3] + X2.m128i_u32[3], 7);
-
-		X2.m128i_u32[0] ^= ROTL(X1.m128i_u32[0] + X0.m128i_u32[0], 9);
-		X3.m128i_u32[1] ^= ROTL(X2.m128i_u32[1] + X1.m128i_u32[1], 9);
-		X0.m128i_u32[2] ^= ROTL(X3.m128i_u32[2] + X2.m128i_u32[2], 9);
-		X1.m128i_u32[3] ^= ROTL(X0.m128i_u32[3] + X3.m128i_u32[3], 9);
-
-		X3.m128i_u32[0] ^= ROTL(X2.m128i_u32[0] + X1.m128i_u32[0], 13);
-		X0.m128i_u32[1] ^= ROTL(X3.m128i_u32[1] + X2.m128i_u32[1], 13);
-		X1.m128i_u32[2] ^= ROTL(X0.m128i_u32[2] + X3.m128i_u32[2], 13);
-		X2.m128i_u32[3] ^= ROTL(X1.m128i_u32[3] + X0.m128i_u32[3], 13);
-
-		X0.m128i_u32[0] ^= ROTL(X3.m128i_u32[0] + X2.m128i_u32[0], 18);
-		X1.m128i_u32[1] ^= ROTL(X0.m128i_u32[1] + X3.m128i_u32[1], 18);
-		X2.m128i_u32[2] ^= ROTL(X1.m128i_u32[2] + X0.m128i_u32[2], 18);
-		X3.m128i_u32[3] ^= ROTL(X2.m128i_u32[3] + X1.m128i_u32[3], 18);
-
-		/* Operate on rows. */
-		X0.m128i_u32[1] ^= ROTL(X0.m128i_u32[0] + X0.m128i_u32[3], 7);  X1.m128i_u32[2] ^= ROTL(X1.m128i_u32[1] + X1.m128i_u32[0], 7);
-		X2.m128i_u32[3] ^= ROTL(X2.m128i_u32[2] + X2.m128i_u32[1], 7);  X3.m128i_u32[0] ^= ROTL(X3.m128i_u32[3] + X3.m128i_u32[2], 7);
-		X0.m128i_u32[2] ^= ROTL(X0.m128i_u32[1] + X0.m128i_u32[0], 9);  X1.m128i_u32[3] ^= ROTL(X1.m128i_u32[2] + X1.m128i_u32[1], 9);
-		X2.m128i_u32[0] ^= ROTL(X2.m128i_u32[3] + X2.m128i_u32[2], 9);  X3.m128i_u32[1] ^= ROTL(X3.m128i_u32[0] + X3.m128i_u32[3], 9);
-
-		X0.m128i_u32[3] ^= ROTL(X0.m128i_u32[2] + X0.m128i_u32[1], 13);  X1.m128i_u32[0] ^= ROTL(X1.m128i_u32[3] + X1.m128i_u32[2], 13);
-		X2.m128i_u32[1] ^= ROTL(X2.m128i_u32[0] + X2.m128i_u32[3], 13);  X3.m128i_u32[2] ^= ROTL(X3.m128i_u32[1] + X3.m128i_u32[0], 13);
-		X0.m128i_u32[0] ^= ROTL(X0.m128i_u32[3] + X0.m128i_u32[2], 18);  X1.m128i_u32[1] ^= ROTL(X1.m128i_u32[0] + X1.m128i_u32[3], 18);
-		X2.m128i_u32[2] ^= ROTL(X2.m128i_u32[1] + X2.m128i_u32[0], 18);  X3.m128i_u32[3] ^= ROTL(X3.m128i_u32[2] + X3.m128i_u32[1], 18);
-	}
-
-	B[0] = _mm_add_epi32(B[0], X0);
-	B[1] = _mm_add_epi32(B[1], X1);
-	B[2] = _mm_add_epi32(B[2], X2);
-	B[3] = _mm_add_epi32(B[3], X3);
-}
-
-#else
-
-static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16], int i)
-{
-	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
-
-	if (i <= 128) {
-		// a xor 0 = a
-		x00 = B[ 0] = Bx[ 0]; x01 = B[ 1] = Bx[ 1]; x02 = B[ 2] = Bx[ 2]; x03 = B[ 3] = Bx[ 3];
-		x04 = B[ 4] = Bx[ 4]; x05 = B[ 5] = Bx[ 5]; x06 = B[ 6] = Bx[ 6]; x07 = B[ 7] = Bx[ 7];
-		x08 = B[ 8] = Bx[ 8]; x09 = B[ 9] = Bx[ 9]; x10 = B[10] = Bx[10]; x11 = B[11] = Bx[11];
-		x12 = B[12] = Bx[12]; x13 = B[13] = Bx[13]; x14 = B[14] = Bx[14]; x15 = B[15] = Bx[15];
-	} else {
-		x00 = (B[ 0] ^= Bx[ 0]);
-		x01 = (B[ 1] ^= Bx[ 1]);
-		x02 = (B[ 2] ^= Bx[ 2]);
-		x03 = (B[ 3] ^= Bx[ 3]);
-		x04 = (B[ 4] ^= Bx[ 4]);
-		x05 = (B[ 5] ^= Bx[ 5]);
-		x06 = (B[ 6] ^= Bx[ 6]);
-		x07 = (B[ 7] ^= Bx[ 7]);
-		x08 = (B[ 8] ^= Bx[ 8]);
-		x09 = (B[ 9] ^= Bx[ 9]);
-		x10 = (B[10] ^= Bx[10]);
-		x11 = (B[11] ^= Bx[11]);
-		x12 = (B[12] ^= Bx[12]);
-		x13 = (B[13] ^= Bx[13]);
-		x14 = (B[14] ^= Bx[14]);
-		x15 = (B[15] ^= Bx[15]);
-	}
-
-	for (i = 0; i < 8; i += 2) {
-		/* Operate on columns. */
-		x04 ^= ROTL(x00 + x12,  7);  x09 ^= ROTL(x05 + x01,  7);
-		x14 ^= ROTL(x10 + x06,  7);  x03 ^= ROTL(x15 + x11,  7);
-
-		x08 ^= ROTL(x04 + x00,  9);  x13 ^= ROTL(x09 + x05,  9);
-		x02 ^= ROTL(x14 + x10,  9);  x07 ^= ROTL(x03 + x15,  9);
-
-		x12 ^= ROTL(x08 + x04, 13);  x01 ^= ROTL(x13 + x09, 13);
-		x06 ^= ROTL(x02 + x14, 13);  x11 ^= ROTL(x07 + x03, 13);
-
-		x00 ^= ROTL(x12 + x08, 18);  x05 ^= ROTL(x01 + x13, 18);
-		x10 ^= ROTL(x06 + x02, 18);  x15 ^= ROTL(x11 + x07, 18);
-
-		/* Operate on rows. */
-		x01 ^= ROTL(x00 + x03,  7);  x06 ^= ROTL(x05 + x04,  7);
-		x11 ^= ROTL(x10 + x09,  7);  x12 ^= ROTL(x15 + x14,  7);
-
-		x02 ^= ROTL(x01 + x00,  9);  x07 ^= ROTL(x06 + x05,  9);
-		x08 ^= ROTL(x11 + x10,  9);  x13 ^= ROTL(x12 + x15,  9);
-
-		x03 ^= ROTL(x02 + x01, 13);  x04 ^= ROTL(x07 + x06, 13);
-		x09 ^= ROTL(x08 + x11, 13);  x14 ^= ROTL(x13 + x12, 13);
-
-		x00 ^= ROTL(x03 + x02, 18);  x05 ^= ROTL(x04 + x07, 18);
-		x10 ^= ROTL(x09 + x08, 18);  x15 ^= ROTL(x14 + x13, 18);
-	}
-	B[ 0] += x00;
-	B[ 1] += x01;
-	B[ 2] += x02;
-	B[ 3] += x03;
-	B[ 4] += x04;
-	B[ 5] += x05;
-	B[ 6] += x06;
-	B[ 7] += x07;
-	B[ 8] += x08;
-	B[ 9] += x09;
-	B[10] += x10;
-	B[11] += x11;
-	B[12] += x12;
-	B[13] += x13;
-	B[14] += x14;
-	B[15] += x15;
-}
-
-#endif
-
-static const uint32_t sha256_k[64] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-/* Elementary functions used by SHA256 */
-#define Ch(x, y, z)     ((x & (y ^ z)) ^ z)
-#define Maj(x, y, z)    ((x & (y | z)) | (y & z))
-#define S0(x)           (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
-#define S1(x)           (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
-#define s0(x)           (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
-#define s1(x)           (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
-
-/* SHA256 round function */
-#define RND(a, b, c, d, e, f, g, h, k) \
-	do { \
-		t0 = h + S1(e) + Ch(e, f, g) + k; \
-		t1 = S0(a) + Maj(a, b, c); \
-		d += t0; \
-		h  = t0 + t1; \
-		} while (0)
-
-/* Adjusted round function for rotating state */
-#define RNDr(S, W, i) \
-	RND(S[(64 - i) % 8], S[(65 - i) % 8], \
-	    S[(66 - i) % 8], S[(67 - i) % 8], \
-	    S[(68 - i) % 8], S[(69 - i) % 8], \
-	    S[(70 - i) % 8], S[(71 - i) % 8], \
-	    W[i] + sha256_k[i])
-
-
-static void sha256_transform_volatile(uint32_t *state, uint32_t *block)
-{
-	uint32_t* W=block; //note: block needs to be a mutable 64 int32_t
-	uint32_t S[8];
-	uint32_t t0, t1;
-	int i;
-
-	for (i = 16; i < 64; i += 2) {
-		W[i]   = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
-		W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
-	}
-
-	/* 2. Initialize working variables. */
-	memcpy(S, state, 32);
-
-	/* 3. Mix. */
-	RNDr(S, W, 0);
-	RNDr(S, W, 1);
-	RNDr(S, W, 2);
-	RNDr(S, W, 3);
-	RNDr(S, W, 4);
-	RNDr(S, W, 5);
-	RNDr(S, W, 6);
-	RNDr(S, W, 7);
-	RNDr(S, W, 8);
-	RNDr(S, W, 9);
-	RNDr(S, W, 10);
-	RNDr(S, W, 11);
-	RNDr(S, W, 12);
-	RNDr(S, W, 13);
-	RNDr(S, W, 14);
-	RNDr(S, W, 15);
-	RNDr(S, W, 16);
-	RNDr(S, W, 17);
-	RNDr(S, W, 18);
-	RNDr(S, W, 19);
-	RNDr(S, W, 20);
-	RNDr(S, W, 21);
-	RNDr(S, W, 22);
-	RNDr(S, W, 23);
-	RNDr(S, W, 24);
-	RNDr(S, W, 25);
-	RNDr(S, W, 26);
-	RNDr(S, W, 27);
-	RNDr(S, W, 28);
-	RNDr(S, W, 29);
-	RNDr(S, W, 30);
-	RNDr(S, W, 31);
-	RNDr(S, W, 32);
-	RNDr(S, W, 33);
-	RNDr(S, W, 34);
-	RNDr(S, W, 35);
-	RNDr(S, W, 36);
-	RNDr(S, W, 37);
-	RNDr(S, W, 38);
-	RNDr(S, W, 39);
-	RNDr(S, W, 40);
-	RNDr(S, W, 41);
-	RNDr(S, W, 42);
-	RNDr(S, W, 43);
-	RNDr(S, W, 44);
-	RNDr(S, W, 45);
-	RNDr(S, W, 46);
-	RNDr(S, W, 47);
-	RNDr(S, W, 48);
-	RNDr(S, W, 49);
-	RNDr(S, W, 50);
-	RNDr(S, W, 51);
-	RNDr(S, W, 52);
-	RNDr(S, W, 53);
-	RNDr(S, W, 54);
-	RNDr(S, W, 55);
-	RNDr(S, W, 56);
-	RNDr(S, W, 57);
-	RNDr(S, W, 58);
-	RNDr(S, W, 59);
-	RNDr(S, W, 60);
-	RNDr(S, W, 61);
-	RNDr(S, W, 62);
-	RNDr(S, W, 63);
-
-	/* 4. Mix local working variables into global state */
-	for (i = 0; i < 8; i++)
-		state[i] += S[i];
-}
-
-// standard sha256 hash
-#if 1
-static void sha256_hash(unsigned char *hash, const unsigned char *data, int len)
-{
-	uint32_t _ALIGN(64) S[16];
-	uint32_t _ALIGN(64) T[64];
-	int i, r;
-
-	sha256_init(S);
-	for (r = len; r > -9; r -= 64) {
-		if (r < 64)
-			memset(T, 0, 64);
-		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
-		if (r >= 0 && r < 64)
-			((unsigned char *)T)[r] = 0x80;
-		for (i = 0; i < 16; i++)
-			T[i] = be32dec(T + i);
-		if (r < 56)
-			T[15] = 8 * len;
-		//sha256_transform(S, T, 0);
-		sha256_transform_volatile(S, T);
-	}
-	for (i = 0; i < 8; i++)
-		be32enc((uint32_t *)hash + i, S[i]);
-}
-#else
-#include <openssl/sha.h>
-static void sha256_hash(unsigned char *hash, const unsigned char *data, int len)
-{
-	SHA256_CTX ctx;
-	SHA256_Init(&ctx);
-	SHA256_Update(&ctx, data, len);
-	SHA256_Final(hash, &ctx);
-}
-#endif
-
-// hash exactly 64 bytes (ie, sha256 block size)
-static void sha256_hash512(uint32_t *hash, const uint32_t *data)
-{
-	uint32_t _ALIGN(64) S[16];
-	uint32_t _ALIGN(64) T[64];
-	uchar _ALIGN(64) E[64*4] = { 0 };
-	int i;
-
-	sha256_init(S);
-
-	for (i = 0; i < 16; i++)
-		T[i] = be32dec(&data[i]);
-	sha256_transform_volatile(S, T);
-
-	E[3]  = 0x80;
-	E[61] = 0x02; // T[15] = 8 * 64 => 0x200;
-	sha256_transform_volatile(S, (uint32_t*)E);
-
-	for (i = 0; i < 8; i++)
-		be32enc(&hash[i], S[i]);
-}
-
-void pluck_hash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N)
-{
-	int size = N * 1024;
-	sha256_hash(hashbuffer, (void*)data, BLOCK_HEADER_SIZE);
-	memset(&hashbuffer[32], 0, 32);
-
-	for(int i = 64; i < size - 32; i += 32)
-	{
-		uint32_t _ALIGN(64) randseed[16];
-		uint32_t _ALIGN(64) randbuffer[16];
-		uint32_t _ALIGN(64) joint[16];
-		//i-4 because we use integers for all references against this, and we don't want to go 3 bytes over the defined area
-		//we could use size here, but then it's probable to use 0 as the value in most cases
-		int randmax = i - 4;
-
-		//setup randbuffer to be an array of random indexes
-		memcpy(randseed, &hashbuffer[i - 64], 64);
-
-		if(i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64);
-		//else memset(randbuffer, 0, 64);
-
-		xor_salsa8((void*)randbuffer, (void*)randseed, i);
-		memcpy(joint, &hashbuffer[i - 32], 32);
-
-		//use the last hash value as the seed
-		for (int j = 32; j < 64; j += 4)
-		{
-			//every other time, change to next random index
-			//randmax - 32 as otherwise we go beyond memory that's already been written to
-			uint32_t rand = randbuffer[(j - 32) >> 2] % (randmax - 32);
-			joint[j >> 2] = *((uint32_t *)&hashbuffer[rand]);
-		}
-
-		sha256_hash512((uint32_t*) &hashbuffer[i], joint);
-
-		//setup randbuffer to be an array of random indexes
-		//use last hash value and previous hash value(post-mixing)
-		memcpy(randseed, &hashbuffer[i - 32], 64);
-
-		if(i > 128) memcpy(randbuffer, &hashbuffer[i - 128], 64);
-		//else memset(randbuffer, 0, 64);
-
-		xor_salsa8((void*)randbuffer, (void*)randseed, i);
-
-		//use the last hash value as the seed
-		for (int j = 0; j < 32; j += 2)
-		{
-			uint32_t rand = randbuffer[j >> 1] % randmax;
-			*((uint32_t *)(hashbuffer + rand)) = *((uint32_t *)(hashbuffer + j + randmax));
-		}
-	}
-
-	memcpy(hash, hashbuffer, 32);
-}
-
-int scanhash_pluck( struct work *work, uint32_t max_nonce,
-        uint64_t *hashes_done, struct thr_info *mythr  )
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-	uint32_t _ALIGN(64) endiandata[20];
-	uint32_t _ALIGN(64) hash[8];
-	const uint32_t first_nonce = pdata[19];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-	volatile uint8_t *restart = &(work_restart[thr_id].restart);
-	uint32_t n = first_nonce;
-
-
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0ffff;
-
-        for (int i=0; i < 19; i++) 
-                be32enc(&endiandata[i], pdata[i]);
-
-	const uint32_t Htarg = ptarget[7];
-	do {
-		//be32enc(&endiandata[19], n);
-		endiandata[19] = n;
-		pluck_hash(hash, endiandata, scratchbuf, opt_pluck_n);
-
-		if (hash[7] <= Htarg && fulltest(hash, ptarget))
-		{
-			*hashes_done = n - first_nonce + 1;
-			pdata[19] = htobe32(endiandata[19]);
-			return 1;
-		}
-		n++;
-	} while (n < max_nonce && !(*restart));
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-	return 0;
-}
-
-bool pluck_miner_thread_init( int thr_id )
-{ 
-  scratchbuf = malloc( 128 * 1024 ); 
-  if ( scratchbuf )
-    return true;
-  applog( LOG_ERR, "Thread %u: Pluck buffer allocation failed", thr_id );
-  return false;
-}
-
-bool register_pluck_algo( algo_gate_t* gate )
-{
-  algo_not_tested();
-  gate->miner_thread_init = (void*)&pluck_miner_thread_init;
-  gate->scanhash         = (void*)&scanhash_pluck;
-  gate->hash             = (void*)&pluck_hash;
-  opt_target_factor = 65536.0;
-  return true;
-};
-
-
--- a/algo/sha/sha256q.c
+++ b/algo/sha/sha256q.c
@@ -1,4 +1,7 @@
 #include "sha256t-gate.h"
+
+#if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -102,3 +105,4 @@ int scanhash_sha256q( struct work *work, uint32_t max_nonce,
   pdata[19] = n;
   return 0;
 }
+#endif
--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -1,4 +1,7 @@
 #include "sha256t-gate.h"
+
+#if !defined(SHA256T_16WAY) && !defined(SHA256T_8WAY) && !defined(SHA256T_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -98,3 +101,5 @@ int scanhash_sha256t( struct work *work, uint32_t max_nonce,
   pdata[19] = n;
   return 0;
 }
+#endif
+
--- a/algo/skein/skein.c
+++ b/algo/skein/skein.c
@@ -1,4 +1,7 @@
 #include "algo-gate-api.h"
+
+#if !defined(SKEIN_8WAY) && !defined(SKEIN_4WAY)
+
 #include <string.h>
 #include <stdint.h>
 #include "sph_skein.h"
@@ -52,4 +55,4 @@ int scanhash_skein( struct work *work, uint32_t max_nonce,

 	return 0;
 }
-
+#endif
--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -1,4 +1,7 @@
 #include "skein-gate.h"
+
+#if !defined(SKEIN_8WAY) && !defined(SKEIN_4WAY)
+
 #include <string.h>
 #include <stdint.h>

@@ -66,4 +69,4 @@ int scanhash_skein2( struct work *work,	uint32_t max_nonce,
 	return 0;
 }

-
+#endif
--- a/algo/whirlpool/md_helper.c
+++ b/algo/whirlpool/md_helper.c
@@ -252,12 +252,6 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
 	current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
 #endif

-//uint64_t *b= (uint64_t*)sc->buf;
-//uint64_t *s= (uint64_t*)sc->state;
-// printf("Sptr 1= %u\n",current);   
-// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
-// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
-
 #ifdef PW01
 	sc->buf[current ++] = (0x100 | (ub & 0xFF)) >> (8 - n);
 #else
@@ -269,10 +263,6 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
 	}
 #endif

-// printf("Sptr 2= %u\n",current); 
-// printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
-// printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
-
 	if (current > SPH_MAXPAD) {
 		memset(sc->buf + current, 0, SPH_BLEN - current);
 		RFUN(sc->buf, SPH_VAL);
@@ -333,16 +323,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
 #endif
 #endif

-// printf("Sptr 3= %u\n",current);
-// printf("SBuf   %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
-// printf("SBuf   %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
-
 	RFUN(sc->buf, SPH_VAL);

-// printf("Sptr after= %u\n",current);
-// printf("SState %016llx %016llx %016llx %016llx\n", s[0], s[1], s[2], s[3] );
-// printf("SState %016llx %016llx %016llx %016llx\n", s[4], s[5], s[6], s[7] );
-
 #ifdef SPH_NO_OUTPUT
 	(void)dst;
 	(void)rnum;
--- a/algo/x11/c11.c
+++ b/algo/x11/c11.c
@@ -1,4 +1,7 @@
 #include "c11-gate.h"
+
+#if !defined(C11_8WAY) && !defined(C11_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,9 +12,6 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
@@ -149,3 +149,4 @@ int scanhash_c11( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/x11/fresh.c
+++ b/algo/x11/fresh.c
@@ -1,131 +0,0 @@
-#include "algo-gate-api.h"
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-
-//#define DEBUG_ALGO
-
-extern void freshhash(void* output, const void* input, uint32_t len)
-{
-	unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
-	#define hashA hash
-	#define hashB hash+64
-
-	sph_shavite512_context ctx_shavite;
-	sph_simd512_context ctx_simd;
-	sph_echo512_context ctx_echo;
-
-	sph_shavite512_init(&ctx_shavite);
-	sph_shavite512(&ctx_shavite, input, len);
-	sph_shavite512_close(&ctx_shavite, hashA);
-
-	sph_simd512_init(&ctx_simd);
-	sph_simd512(&ctx_simd, hashA, 64);
-	sph_simd512_close(&ctx_simd, hashB);
-
-	sph_shavite512_init(&ctx_shavite);
-	sph_shavite512(&ctx_shavite, hashB, 64);
-	sph_shavite512_close(&ctx_shavite, hashA);
-
-	sph_simd512_init(&ctx_simd);
-	sph_simd512(&ctx_simd, hashA, 64);
-	sph_simd512_close(&ctx_simd, hashB);
-
-	sph_echo512_init(&ctx_echo);
-	sph_echo512(&ctx_echo, hashB, 64);
-	sph_echo512_close(&ctx_echo, hashA);
-
-	memcpy(output, hash, 32);
-}
-
-int scanhash_fresh( struct work *work,
-				uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-	uint32_t len = 80;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-	uint32_t n = pdata[19] - 1;
-	const uint32_t first_nonce = pdata[19];
-	const uint32_t Htarg = ptarget[7];
-#ifdef _MSC_VER
-	uint32_t __declspec(align(32)) hash64[8];
-#else
-	uint32_t hash64[8] __attribute__((aligned(32)));
-#endif
-	uint32_t endiandata[32];
-
-	uint64_t htmax[] = {
-		0,
-		0xF,
-		0xFF,
-		0xFFF,
-		0xFFFF,
-		0x10000000
-	};
-	uint32_t masks[] = {
-		0xFFFFFFFF,
-		0xFFFFFFF0,
-		0xFFFFFF00,
-		0xFFFFF000,
-		0xFFFF0000,
-		0
-	};
-
-	// we need bigendian data...
-        for (int k = 0; k < 19; k++)
-                be32enc(&endiandata[k], pdata[k]);
-
-#ifdef DEBUG_ALGO
-	if (Htarg != 0)
-		printf("[%d] Htarg=%X\n", thr_id, Htarg);
-#endif
-	for (int m=0; m < 6; m++) {
-		if (Htarg <= htmax[m]) {
-			uint32_t mask = masks[m];
-			do {
-				pdata[19] = ++n;
-				be32enc(&endiandata[19], n);
-				freshhash(hash64, endiandata, len);
-#ifndef DEBUG_ALGO
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
-					*hashes_done = n - first_nonce + 1;
-					return true;
-				}
-#else
-				if (!(n % 0x1000) && !thr_id) printf(".");
-				if (!(hash64[7] & mask)) {
-					printf("[%d]",thr_id);
-					if (fulltest(hash64, ptarget)) {
-						*hashes_done = n - first_nonce + 1;
-						return true;
-					}
-				}
-#endif
-			} while (n < max_nonce && !work_restart[thr_id].restart);
-			// see blake.c if else to understand the loop on htmax => mask
-			break;
-		}
-	}
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-	return 0;
-}
-
-bool register_fresh_algo( algo_gate_t* gate )
-{
-    algo_not_tested();
-    gate->scanhash   = (void*)&scanhash_fresh;
-    gate->hash       = (void*)&freshhash;
-    opt_target_factor = 256.0;
-    return true;
-};
-
--- a/algo/x11/timetravel.c
+++ b/algo/x11/timetravel.c
@@ -1,5 +1,7 @@
 #include "timetravel-gate.h"

+#if !defined(TIMETRAVEL_8WAY) && !defined(TIMETRAVEL_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -290,4 +292,4 @@ int scanhash_timetravel( struct work *work, uint32_t max_nonce,
  return 0;
 }

-
+#endif
--- a/algo/x11/timetravel10.c
+++ b/algo/x11/timetravel10.c
@@ -1,4 +1,7 @@
 #include "timetravel10-gate.h"
+
+#if !defined(TIMETRAVEL10_8WAY) && !defined(TIMETRAVEL10_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -329,3 +332,4 @@ int scanhash_timetravel10( struct work *work, uint32_t max_nonce,
        *hashes_done = pdata[19] - first_nonce + 1;
  return 0;
 }
+#endif
--- a/algo/x11/tribus.c
+++ b/algo/x11/tribus.c
@@ -1,12 +1,13 @@
 #include "tribus-gate.h"
+
+#if !defined(TRIBUS_8WAY) && !defined(TRIBUS_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
 #include "algo/jh//sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
-
 #ifdef __AES__
  #include "algo/echo/aes_ni/hash_api.h"
 #else
@@ -117,4 +118,4 @@ int scanhash_tribus( struct work *work, uint32_t max_nonce,
 	return 0;
 }

-
+#endif
--- a/algo/x11/x11.c
+++ b/algo/x11/x11.c
@@ -1,5 +1,8 @@
 #include "cpuminer-config.h"
 #include "x11-gate.h"
+
+#if !defined(X11_8WAY) && !defined(X11_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -10,9 +13,6 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
@@ -172,3 +172,4 @@ int scanhash_x11( struct work *work, uint32_t max_nonce,
        pdata[19] = n;
        return 0;
 }
+#endif
--- a/algo/x11/x11evo.c
+++ b/algo/x11/x11evo.c
@@ -1,5 +1,8 @@
 #include "cpuminer-config.h"
 #include "x11evo-gate.h"
+
+#if !defined(X11EVO_8WAY) && !defined(X11EVO_4WAY)
+
 #include <string.h>
 #include <stdint.h>
 #include <compat/portable_endian.h>
@@ -8,10 +11,7 @@
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
 #ifdef __AES__
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
@@ -204,3 +204,4 @@ int scanhash_x11evo( struct work* work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
+#endif
--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -1,4 +1,7 @@
 #include "x11gost-gate.h"
+
+#if !defined(X11GOST_8WAY) && !defined(X11GOST_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -10,9 +13,6 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/luffa/luffa_for_sse2.h"
 #include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
@@ -160,3 +160,4 @@ int scanhash_x11gost( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/x12/x12.c
+++ b/algo/x12/x12.c
@@ -1,5 +1,7 @@
 #include "x12-gate.h"

+#if !defined(X12_8WAY) && !defined(X12_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -12,9 +14,6 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/luffa/luffa_for_sse2.h" 
@@ -177,3 +176,4 @@ int scanhash_x12( struct work *work, uint32_t max_nonce,
  pdata[19] = n;
  return 0;
 }
+#endif
--- a/algo/x13/drop.c
+++ b/algo/x13/drop.c
@@ -1,262 +0,0 @@
-/**
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2015 kernels10, tpruvot
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     drop.c
- * @author   kernels10 <kernels10@gmail.com.com>
- * @author   tpruvot <tpruvot@github>
- */
-
-#define POK_BOOL_MASK 0x00008000
-#define POK_DATA_MASK 0xFFFF0000
- 
-#include "algo-gate-api.h"
-
-#include <string.h>
-
-#include "algo/blake/sph_blake.h"
-#include "algo/groestl/sph_groestl.h"
-#include "algo/jh/sph_jh.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/echo/sph_echo.h"
-#include "algo/fugue//sph_fugue.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/shavite/sph_shavite.h"
-
-static void shiftr_lp(const uint32_t *input, uint32_t *output, unsigned int shift)
-{
-	if(!shift) {
-		memcpy(output, input, 64);
-		return;
-	}
-
-	memset(output, 0, 64);
-	for(int i = 0; i < 15; ++i) {
-		output[i + 1] |= (input[i] >> (32 - shift));
-		output[i] |= (input[i] << shift);
-	}
-
-	output[15] |= (input[15] << shift);
-	return;
-}
-
-static void switchHash(const void *input, void *output, int id)
-{
-/*
- 	sph_keccak512_context ctx_keccak;
-	sph_blake512_context ctx_blake;
-	sph_groestl512_context ctx_groestl;
-	sph_skein512_context ctx_skein;
-	sph_luffa512_context ctx_luffa;
-	sph_echo512_context ctx_echo;
-	sph_simd512_context ctx_simd;
-	sph_cubehash512_context ctx_cubehash;
-	sph_fugue512_context ctx_fugue;
-	sph_shavite512_context ctx_shavite;
-
-	switch(id) {
-	case 0:
-		sph_keccak512_init(&ctx_keccak); sph_keccak512(&ctx_keccak, input, 64); sph_keccak512_close(&ctx_keccak, output);
-		break;
-	case 1:
-		sph_blake512_init(&ctx_blake); sph_blake512(&ctx_blake, input, 64); sph_blake512_close(&ctx_blake, output);
-		break;
-	case 2:
-		sph_groestl512_init(&ctx_groestl); sph_groestl512(&ctx_groestl, input, 64); sph_groestl512_close(&ctx_groestl, output);
-		break;
-	case 3:
-		sph_skein512_init(&ctx_skein); sph_skein512(&ctx_skein, input, 64); sph_skein512_close(&ctx_skein, output);
-		break;
-	case 4:
-		sph_luffa512_init(&ctx_luffa); sph_luffa512(&ctx_luffa, input, 64); sph_luffa512_close(&ctx_luffa, output);
-		break;
-	case 5:
-		sph_echo512_init(&ctx_echo); sph_echo512(&ctx_echo, input, 64); sph_echo512_close(&ctx_echo, output);
-		break;
-	case 6:
-		sph_shavite512_init(&ctx_shavite); sph_shavite512(&ctx_shavite, input, 64); sph_shavite512_close(&ctx_shavite, output);
-		break;
-	case 7:
-		sph_fugue512_init(&ctx_fugue); sph_fugue512(&ctx_fugue, input, 64); sph_fugue512_close(&ctx_fugue, output);
-		break;
-	case 8:
-		sph_simd512_init(&ctx_simd); sph_simd512(&ctx_simd, input, 64); sph_simd512_close(&ctx_simd, output);
-		break;
-	case 9:
-		sph_cubehash512_init(&ctx_cubehash); sph_cubehash512(&ctx_cubehash, input, 64); sph_cubehash512_close(&ctx_cubehash, output);
-		break;
-	default:
-		break;
-	}
-*/
-}
-
-void droplp_hash(void *state, const void *input)
-{
-	uint32_t _ALIGN(64) hash[2][16];
-	sph_jh512_context ctx_jh;
-	uint32_t *hashA = hash[0];
-	uint32_t *hashB = hash[1];
-
-	sph_jh512_init(&ctx_jh);
-	sph_jh512(&ctx_jh, input, 80);
-	sph_jh512_close(&ctx_jh, (void*)(hashA));
-
-	unsigned int startPosition = hashA[0] % 31;
-	unsigned int i = 0;
-	int j = 0;
-	int start = 0;
-
-	for (i = startPosition; i < 31; i+=9) {
-		start = i % 10;
-		for (j = start; j < 10; j++) {
-			shiftr_lp(hashA, hashB, (i & 3));
-			switchHash((const void*)hashB, (void*)hashA, j);
-		}
-		for (j = 0; j < start; j++) {
-			shiftr_lp(hashA, hashB, (i & 3));
-			switchHash((const void*)hashB, (void*)hashA, j);
-		}
-	}
-	for (i = 0; i < startPosition; i += 9) {
-		start = i % 10;
-		for (j = start; j < 10; j++) {
-			shiftr_lp(hashA, hashB, (i & 3));
-			switchHash((const void*)hashB, (void*)hashA, j);
-		}
-		for (j = 0; j < start; j++) {
-			shiftr_lp(hashA, hashB, (i & 3));
-			switchHash((const void*)hashB, (void*)hashA, j);
-		}
-	}
-
-	memcpy(state, hashA, 32);
-}
-
-static void droplp_hash_pok(void *output, uint32_t *pdata, const uint32_t version)
-{
-	uint32_t _ALIGN(64) hash[8];
-	uint32_t pok;
-
-	pdata[0] = version;
-	droplp_hash(hash, pdata);
-
-	// fill PoK
-	pok = version | (hash[0] & POK_DATA_MASK);
-	if (pdata[0] != pok) {
-		pdata[0] = pok;
-		droplp_hash(hash, pdata);
-	}
-	memcpy(output, hash, 32);
-}
-
-int scanhash_drop( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr )
-{
-	uint32_t _ALIGN(64) hash[16];
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t version = pdata[0] & (~POK_DATA_MASK);
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-	#define tmpdata pdata
-
-	if (opt_benchmark)
-		ptarget[7] = 0x07ff;
-
-	const uint32_t htarg = ptarget[7];
-
-	do {
-		tmpdata[19] = nonce;
-		droplp_hash_pok(hash, tmpdata, version);
-
-		if (hash[7] <= htarg && fulltest(hash, ptarget)) {
-			pdata[0] = tmpdata[0];
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce + 1;
-			if (opt_debug)
-				applog(LOG_INFO, "found nonce %x", nonce);
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
-}
-
-void drop_get_new_work( struct work* work, struct work* g_work, int thr_id,
-                        uint32_t* end_nonce_ptr )
-{
-   // ignore POK in first word
-// const int nonce_i = 19;
-   const int wkcmp_sz = 72;  // (19-1) * sizeof(uint32_t)
-   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
-   if ( memcmp( &work->data[1], &g_work->data[1], wkcmp_sz )
-       || ( *nonceptr >= *end_nonce_ptr ) )
-   {
-      work_free( work );
-      work_copy( work, g_work );
-      *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id;
-      if ( opt_randomize )
-         *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
-      *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
-   }
-   else
-       ++(*nonceptr);
-}
-
-void drop_display_pok( struct work* work ) 
-{
-      if ( work->data[0] & 0x00008000 ) 
-        applog(LOG_BLUE, "POK received: %08xx", work->data[0] );
-}
-
-int drop_get_work_data_size() { return 80; }
-
-// Need to fix POK offset problems like zr5
-bool register_drop_algo( algo_gate_t* gate )
-{
-    algo_not_tested();
-    gate->scanhash              = (void*)&scanhash_drop;
-    gate->hash                  = (void*)&droplp_hash_pok;
-    gate->get_new_work          = (void*)&drop_get_new_work;
-    gate->build_stratum_request = (void*)&std_be_build_stratum_request;
-    gate->work_decode           = (void*)&std_be_work_decode;
-    gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-    gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
-    gate->decode_extra_data     = (void*)&drop_display_pok;
-    gate->get_work_data_size    = (void*)&drop_get_work_data_size;
-    gate->work_cmp_size         = 72;
-    opt_target_factor = 65536.0;
-    return true;
-};
-
--- a/algo/x13/phi1612.c
+++ b/algo/x13/phi1612.c
@@ -1,4 +1,7 @@
 #include "phi1612-gate.h"
+
+#if !defined(PHI1612_8WAY) && !defined(PHI1612_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -123,3 +126,4 @@ int scanhash_phi1612( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/x13/skunk.c
+++ b/algo/x13/skunk.c
@@ -1,4 +1,7 @@
 #include "skunk-gate.h"
+
+#if !defined(SKUNK_8WAY) && !defined(SKUNK_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -88,3 +91,4 @@ bool skunk_thread_init()
   sph_gost512_init( &skunk_ctx.gost );
   return true;
 }
+#endif
--- a/algo/x13/x13.c
+++ b/algo/x13/x13.c
@@ -1,4 +1,7 @@
 #include "x13-gate.h"
+
+#if !defined(X13_8WAY) && !defined(X13_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,9 +12,6 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/luffa/luffa_for_sse2.h"
@@ -185,3 +185,4 @@ int scanhash_x13( struct work *work, uint32_t max_nonce,
  pdata[19] = n;
  return 0;
 }
+#endif
--- a/algo/x13/x13bcd.c
+++ b/algo/x13/x13bcd.c
@@ -1,4 +1,7 @@
 #include "x13sm3-gate.h"
+
+#if !defined(X13BCD_8WAY) && !defined(X13BCD_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -10,7 +13,6 @@
 #include "algo/sm3/sph_sm3.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/cubehash/cubehash_sse2.h"
@@ -184,3 +186,4 @@ int scanhash_x13bcd( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/x13/x13sm3.c
+++ b/algo/x13/x13sm3.c
@@ -1,4 +1,7 @@
 #include "x13sm3-gate.h"
+
+#if !defined(X13SM3_8WAY) && !defined(X13SM3_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -10,7 +13,6 @@
 #include "algo/sm3/sph_sm3.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/luffa/luffa_for_sse2.h"
@@ -197,3 +199,4 @@ int scanhash_x13sm3( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/x14/polytimos.c
+++ b/algo/x14/polytimos.c
@@ -1,4 +1,7 @@
 #include "polytimos-gate.h"
+
+#if !defined(POLYTIMOS_8WAY) && !defined(POLYTIMOS_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -111,3 +114,4 @@ int scanhash_polytimos( struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+#endif
--- a/algo/x14/veltor.c
+++ b/algo/x14/veltor.c
@@ -1,4 +1,7 @@
 #include "veltor-gate.h"
+
+#if !defined(VELTOR_8WAY) && !defined(VELTOR_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -102,3 +105,4 @@ int scanhash_veltor( struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
+#endif
--- a/algo/x14/x14.c
+++ b/algo/x14/x14.c
@@ -1,19 +1,17 @@
 #include "x14-gate.h"
+
+#if !defined(X14_8WAY) && !defined(X14_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
@@ -186,3 +184,4 @@ int scanhash_x14( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
+#endif
--- a/algo/x15/x15.c
+++ b/algo/x15/x15.c
@@ -1,4 +1,7 @@
 #include "x15-gate.h"
+
+#if !defined(X15_8WAY) && !defined(X15_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,9 +12,6 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
@@ -217,3 +217,4 @@ int scanhash_x15( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
+#endif
--- a/algo/x16/hex.c
+++ b/algo/x16/hex.c
@@ -85,13 +85,6 @@ void hex_hash( void* output, const void* input )
   memcpy( &ctx, &hex_ctx, sizeof(ctx) );
   void *in = (void*) input;
   int size = 80;
-/*
-   if ( s_ntime == UINT32_MAX )
-   {
-      const uint8_t* in8 = (uint8_t*) input;
-      x16_r_s_getAlgoString( &in8[4], hashOrder );
-   }
-*/

   char elem = hashOrder[0];
   uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
@@ -249,12 +242,8 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
   const bool bench = opt_benchmark;
   if ( bench )  ptarget[7] = 0x0cff;

-   casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
-
+   mm128_bswap32_80( edata, pdata );
+   
   uint32_t ntime = swab32(pdata[17]);
   if ( s_ntime != ntime )
   {
@@ -277,6 +266,10 @@ int scanhash_hex( struct work *work, uint32_t max_nonce,
         sph_skein512_init( &hex_ctx.skein );
         sph_skein512( &hex_ctx.skein, edata, 64 );
      break;
+      case LUFFA:
+         init_luffa( &hex_ctx.luffa, 512 );
+         update_luffa( &hex_ctx.luffa, (const BitSequence*)edata, 64 );
+      break;
      case CUBEHASH:
         cubehashInit( &hex_ctx.cube, 512, 16, 32 );
         cubehashUpdate( &hex_ctx.cube, (const byte*)edata, 64 );
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -2,74 +2,85 @@
 * x16r algo implementation
 *
 * Implementation by tpruvot@github Jan 2018
- * Optimized by JayDDee@github Jan 2018
+ * Optimized by https://github.com/JayDDee/ Jan 2018
 */
 #include "x16r-gate.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "algo/blake/blake-hash-4way.h"
-#include "algo/bmw/bmw-hash-4way.h"
-#include "algo/groestl/aes_ni/hash-groestl.h"
-#include "algo/groestl/aes_ni/hash-groestl.h"
-#include "algo/skein/skein-hash-4way.h"
-#include "algo/jh/jh-hash-4way.h"
-#include "algo/keccak/keccak-hash-4way.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/luffa-hash-2way.h"
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cube-hash-2way.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/simd/simd-hash-2way.h"
-#include "algo/echo/aes_ni/hash_api.h"
-#include "algo/hamsi/hamsi-hash-4way.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/shabal-hash-4way.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/sha/sha-hash-4way.h"
-#if defined(__VAES__)
-  #include "algo/groestl/groestl512-hash-4way.h"
-  #include "algo/shavite/shavite-hash-4way.h"
-  #include "algo/echo/echo-hash-4way.h"
-#endif
+
+// The hash and prehash code is shared among x16r, x16s, x16rt, and x21s.
+// The generic function performs the x16 hash as per the hash order
+// and produces a 512 bit intermediate hash which needs to be converted
+// to 256 bit final hash by a wrapper function. 

 #if defined (X16R_8WAY)

-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+// Perform midstate prehash of hash functions with block size <= 64 bytes
+// and interleave 8x64 before nonce insertion for final hash.

-union _x16r_8way_context_overlay
+void x16r_8way_prehash( void *vdata, void *pdata )
 {
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cubehashParam           cube;
-//    cube_4way_context       cube;
-    simd_4way_context       simd;
-    hamsi512_8way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-#if defined(__VAES__)
-    groestl512_4way_context groestl;
-    shavite512_4way_context shavite;
-    echo_4way_context       echo;
-#else
-    hashState_groestl       groestl;
-    sph_shavite512_context  shavite;
-    hashState_echo          echo;
-#endif
-} __attribute__ ((aligned (64)));
+   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
+   uint32_t edata[20] __attribute__ ((aligned (64)));

-typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
+   const char elem = x16r_hash_order[0];
+   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

-static __thread x16r_8way_context_overlay x16r_ctx;
+   switch ( algo )
+   {
+      case JH:
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         jh512_8way_init( &x16r_ctx.jh );
+         jh512_8way_update( &x16r_ctx.jh, vdata, 64 );
+      break;
+      case SKEIN:
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         skein512_8way_init( &x16r_ctx.skein );
+         skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
+      break;
+      case LUFFA:
+         mm128_bswap32_80( edata, pdata );
+         intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
+         luffa_4way_init( &x16r_ctx.luffa, 512 );
+         luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 );
+         rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
+      break;
+      case CUBEHASH:
+         mm128_bswap32_80( edata, pdata );
+         cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
+         cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );
+      break;
+      case HAMSI:
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+         hamsi512_8way_init( &x16r_ctx.hamsi );
+         hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 );
+      break;
+      case SHABAL:
+         mm256_bswap32_intrlv80_8x32( vdata2, pdata );
+         shabal512_8way_init( &x16r_ctx.shabal );
+         shabal512_8way_update( &x16r_ctx.shabal, vdata2, 64 );
+         rintrlv_8x32_8x64( vdata, vdata2, 640 );
+      break;
+      case WHIRLPOOL:
+         mm128_bswap32_80( edata, pdata );
+         sph_whirlpool_init( &x16r_ctx.whirlpool );
+         sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
+         intrlv_8x64( vdata, edata, edata, edata, edata,
+                             edata, edata, edata, edata, 640 );
+      break;
+      default:
+         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+   }
+}

-void x16r_8way_hash( void* output, const void* input )
+// Perform the full x16r hash and return the 512 bit intermediate hash.
+// Called by wrapper hash function to optionally continue hashing and
+// convert to final hash.
+
+void x16r_8way_hash_generic( void* output, const void* input )
 {
   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (64)));
@@ -97,7 +108,7 @@ void x16r_8way_hash( void* output, const void* input )

   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hashOrder[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -464,23 +475,39 @@ void x16r_8way_hash( void* output, const void* input )
      size = 64;
   }

-   memcpy( output,     hash0, 32 );
-   memcpy( output+32,  hash1, 32 );
-   memcpy( output+64,  hash2, 32 );
-   memcpy( output+96,  hash3, 32 );
-   memcpy( output+128, hash4, 32 );
-   memcpy( output+160, hash5, 32 );
-   memcpy( output+192, hash6, 32 );
-   memcpy( output+224, hash7, 32 );
+   memcpy( output,     hash0, 64 );
+   memcpy( output+64,  hash1, 64 );
+   memcpy( output+128, hash2, 64 );
+   memcpy( output+192, hash3, 64 );
+   memcpy( output+256, hash4, 64 );
+   memcpy( output+320, hash5, 64 );
+   memcpy( output+384, hash6, 64 );
+   memcpy( output+448, hash7, 64 );
 }

+// x16-r,-s,-rt wrapper called directly by scanhash to repackage 512 bit
+// hash to 256 bit final hash.
+void x16r_8way_hash( void* output, const void* input )
+{
+   uint8_t hash[64*8] __attribute__ ((aligned (128)));
+   x16r_8way_hash_generic( hash, input );
+
+   memcpy( output,     hash,     32 );
+   memcpy( output+32,  hash+64,  32 );
+   memcpy( output+64,  hash+128, 32 );
+   memcpy( output+96,  hash+192, 32 );
+   memcpy( output+128, hash+256, 32 );
+   memcpy( output+160, hash+320, 32 );
+   memcpy( output+192, hash+384, 32 );
+   memcpy( output+224, hash+448, 32 );
+}
+
+// x16r only
 int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -496,66 +523,18 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,

   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
   const uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
+          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

-   // Do midstate prehash on hash functions with block size <= 64 bytes.
-   const char elem = hashOrder[0];
-   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-   switch ( algo )
-   {
-      case JH:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         jh512_8way_init( &x16r_ctx.jh );
-         jh512_8way_update( &x16r_ctx.jh, vdata, 64 );
-      break;
-      case SKEIN:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         skein512_8way_init( &x16r_ctx.skein );
-         skein512_8way_update( &x16r_ctx.skein, vdata, 64 );
-      break;
-      case LUFFA:
-         mm128_bswap32_80( edata, pdata );
-         intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
-         luffa_4way_init( &x16r_ctx.luffa, 512 );
-         luffa_4way_update( &x16r_ctx.luffa, vdata2, 64 );
-         rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 ); 
-      break;
-      case CUBEHASH:
-         mm128_bswap32_80( edata, pdata );
-         cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      case HAMSI:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         hamsi512_8way_init( &x16r_ctx.hamsi );
-         hamsi512_8way_update( &x16r_ctx.hamsi, vdata, 64 );
-      break;
-      case SHABAL:
-         mm256_bswap32_intrlv80_8x32( vdata2, pdata );
-         shabal512_8way_init( &x16r_ctx.shabal );
-         shabal512_8way_update( &x16r_ctx.shabal, vdata2, 64 );
-         rintrlv_8x32_8x64( vdata, vdata2, 640 );
-      break;
-      case WHIRLPOOL:
-         mm128_bswap32_80( edata, pdata );
-         sph_whirlpool_init( &x16r_ctx.whirlpool );
-         sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      default:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-   }
-   
+   x16r_8way_prehash( vdata, pdata );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
@@ -580,34 +559,62 @@ int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,

 #elif defined (X16R_4WAY)

-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
-union _x16r_4way_context_overlay
+void x16r_4way_prehash( void *vdata, void *pdata ) // Midstate prehash: builds vdata from pdata and, for the cases below, absorbs the first 64 bytes into the shared x16r_ctx.
 {
-    blake512_4way_context   blake;
-    bmw512_4way_context     bmw;
-    hashState_echo          echo;
-    hashState_groestl       groestl;
-    skein512_4way_context   skein;
-    jh512_4way_context      jh;
-    keccak512_4way_context  keccak;
-    luffa_2way_context      luffa;
-    hashState_luffa         luffa1;
-    cubehashParam           cube;
-    sph_shavite512_context  shavite;
-    simd_2way_context       simd;
-    hamsi512_4way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_4way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_4way_context     sha512;
-} __attribute__ ((aligned (64)));
-typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;
+   uint32_t vdata2[20*4] __attribute__ ((aligned (64))); // scratch for the non-4x64 layouts (2x128 for Luffa, 4x32 for Shabal)
+   uint32_t edata[20] __attribute__ ((aligned (64))); // single-lane header copy filled by mm128_bswap32_80

-static __thread x16r_4way_context_overlay x16r_ctx;
+   const char elem = x16r_hash_order[0]; // only the first function in the hash order is prehashed
+   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; // order chars '0'-'9' decode to 0-9, 'A' and above to 10+

-void x16r_4way_hash( void* output, const void* input )
+   switch ( algo )
+   {
+      case JH:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata ); // presumably byte-swaps the 80-byte header and interleaves it 4x64 (going by the helper name)
+         jh512_4way_init( &x16r_ctx.jh );
+         jh512_4way_update( &x16r_ctx.jh, vdata, 64 ); // absorb only the first 64 bytes
+      break;
+      case SKEIN:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         skein512_4way_init( &x16r_ctx.skein );
+         skein512_4way_update( &x16r_ctx.skein, vdata, 64 );
+      break;
+      case LUFFA:
+         mm128_bswap32_80( edata, pdata );
+         intrlv_2x128( vdata2, edata, edata, 640 ); // the same header goes into both lanes
+         luffa_2way_init( &x16r_ctx.luffa, 512 );
+         luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 );
+         rintrlv_2x128_4x64( vdata, vdata2, vdata2, 640 ); // presumably converts the 2x128 layout to 4x64 for vdata - confirm against simd-utils
+         break;
+      case CUBEHASH:
+         mm128_bswap32_80( edata, pdata );
+         cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
+         cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 ); // midstate is computed on the single-lane edata
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 ); // vdata gets four copies of edata
+      break;
+      case HAMSI:
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+         hamsi512_4way_init( &x16r_ctx.hamsi );
+         hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 );
+      break;
+      case SHABAL:
+         mm128_bswap32_intrlv80_4x32( vdata2, pdata ); // Shabal is fed from the 4x32 scratch buffer
+         shabal512_4way_init( &x16r_ctx.shabal );
+         shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 );
+         rintrlv_4x32_4x64( vdata, vdata2, 640 ); // presumably converts the 4x32 layout to 4x64 for vdata - confirm against simd-utils
+      break;
+      case WHIRLPOOL:
+         mm128_bswap32_80( edata, pdata );
+         sph_whirlpool_init( &x16r_ctx.whirlpool );
+         sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 ); // midstate is computed on the single-lane edata
+         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
+      break;
+      default: // every other first algo: no midstate, only build vdata
+         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+   }
+}
+
+void x16r_4way_hash_generic( void* output, const void* input )
 {
   uint32_t vhash[20*4] __attribute__ ((aligned (128)));
   uint32_t hash0[20] __attribute__ ((aligned (64)));
@@ -626,7 +633,7 @@ void x16r_4way_hash( void* output, const void* input )

   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hashOrder[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -698,11 +705,12 @@ void x16r_4way_hash( void* output, const void* input )
         case LUFFA:
            if ( i == 0 )
            {
-               intrlv_2x128( vhash, in0, in1, size<<3 );
-               luffa512_2way_full( &ctx.luffa, vhash, vhash + (16<<1), 16 );
+               intrlv_2x128( vhash, hash0, hash1, 640 );
+               luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
               dintrlv_2x128_512( hash0, hash1, vhash );
-               intrlv_2x128( vhash, in2, in3, size<<3 );
-               luffa512_2way_full( &ctx.luffa, vhash, vhash + (16<<1), 16 );
+               intrlv_2x128( vhash, hash2, hash3, 640 );
+               memcpy( &ctx, &x16r_ctx, sizeof(ctx) );
+               luffa_2way_update_close( &ctx.luffa, vhash, vhash + (16<<1), 16 );
               dintrlv_2x128_512( hash2, hash3, vhash );
            }
            else
@@ -863,10 +871,21 @@ void x16r_4way_hash( void* output, const void* input )
      }
      size = 64;
   }
-   memcpy( output,    hash0, 32 );
-   memcpy( output+32, hash1, 32 );
-   memcpy( output+64, hash2, 32 );
-   memcpy( output+96, hash3, 32 );
+   memcpy( output,     hash0, 64 );
+   memcpy( output+64,  hash1, 64 );
+   memcpy( output+128, hash2, 64 );
+   memcpy( output+192, hash3, 64 );
+}
+
+void x16r_4way_hash( void* output, const void* input ) // Wrapper over x16r_4way_hash_generic that keeps 32 bytes per lane in output.
+{
+   uint8_t hash[64*4] __attribute__ ((aligned (64))); // four 64-byte lane results, as written by x16r_4way_hash_generic
+   x16r_4way_hash_generic( hash, input );
+
+   memcpy( output,     hash,     32 ); // lane 0: first 32 bytes of its 64-byte result
+   memcpy( output+32,  hash+64,  32 ); // lane 1
+   memcpy( output+64,  hash+128, 32 ); // lane 2
+   memcpy( output+96,  hash+192, 32 ); // lane 3
 }

 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
@@ -874,8 +893,6 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t vdata2[20*4] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -891,67 +908,20 @@ int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,

   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
   const uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
-   }
-
-   // Do midstate prehash on hash functions with block size <= 64 bytes.
-   const char elem = hashOrder[0];
-   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-   switch ( algo )
-   {
-      case JH:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         jh512_4way_init( &x16r_ctx.jh );
-         jh512_4way_update( &x16r_ctx.jh, vdata, 64 );
-      break;
-      case SKEIN:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_init( &x16r_ctx.skein );
-         skein512_4way_update( &x16r_ctx.skein, vdata, 64 );
-      break;
-      case LUFFA:
-         mm128_bswap32_80( edata, pdata );
-         intrlv_2x128( vdata2, edata, edata, 640 );
-         luffa_2way_init( &x16r_ctx.luffa, 512 );
-         luffa_2way_update( &x16r_ctx.luffa, vdata2, 64 );
-         rintrlv_2x128_4x64( vdata, vdata2, vdata2, 512 );
-      break;
-      case CUBEHASH:
-         mm128_bswap32_80( edata, pdata );
-         cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x16r_ctx.cube, (const byte*)edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-      break;
-      case HAMSI:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         hamsi512_4way_init( &x16r_ctx.hamsi );
-         hamsi512_4way_update( &x16r_ctx.hamsi, vdata, 64 );
-      break;
-      case SHABAL:
-         mm128_bswap32_intrlv80_4x32( vdata2, pdata );
-         shabal512_4way_init( &x16r_ctx.shabal );
-         shabal512_4way_update( &x16r_ctx.shabal, vdata2, 64 );
-         rintrlv_4x32_4x64( vdata, vdata2, 640 );
-      break;
-      case WHIRLPOOL:
-         mm128_bswap32_80( edata, pdata );
-         sph_whirlpool_init( &x16r_ctx.whirlpool );
-         sph_whirlpool( &x16r_ctx.whirlpool, edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-      break;
-      default:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+              applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

+   x16r_4way_prehash( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
-
   do
   {
      x16r_4way_hash( hash, vdata );
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -1,7 +1,22 @@
 #include "x16r-gate.h"

+__thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ] = { 0 }; // per-thread hash order string; filled via x16_r_s_getAlgoString in the scanhash functions
+
 void (*x16_r_s_getAlgoString) ( const uint8_t*, char* ) = NULL;

+#if defined (X16R_8WAY)
+
+__thread x16r_8way_context_overlay x16r_ctx; // per-thread context written by x16r_8way_prehash
+
+#elif defined (X16R_4WAY)
+
+__thread x16r_4way_context_overlay x16r_ctx; // per-thread context written by x16r_4way_prehash
+
+#endif
+
+__thread x16r_context_overlay x16_ctx; // per-thread scalar context written by x16r_prehash; named x16_ctx, not x16r_ctx
+
+
 void x16r_getAlgoString( const uint8_t* prevblock, char *output )
 {
   char *sptr = output;
@@ -207,15 +222,15 @@ void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )

 bool register_x16rt_algo( algo_gate_t* gate )
 {
-#if defined (X16RT_8WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
-  gate->hash      = (void*)&x16rt_8way_hash;
-#elif defined (X16RT_4WAY)
+  gate->hash      = (void*)&x16r_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
-  gate->hash      = (void*)&x16rt_4way_hash;
+  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
-  gate->hash      = (void*)&x16rt_hash;
+  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  opt_target_factor = 256.0;
@@ -224,15 +239,15 @@ bool register_x16rt_algo( algo_gate_t* gate )

 bool register_x16rt_veil_algo( algo_gate_t* gate )
 {
-#if defined (X16RT_8WAY)
+#if defined (X16R_8WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_8way;
-  gate->hash      = (void*)&x16rt_8way_hash;
-#elif defined (X16RT_4WAY)
+  gate->hash      = (void*)&x16r_8way_hash;
+#elif defined (X16R_4WAY)
  gate->scanhash  = (void*)&scanhash_x16rt_4way;
-  gate->hash      = (void*)&x16rt_4way_hash;
+  gate->hash      = (void*)&x16r_4way_hash;
 #else
  gate->scanhash  = (void*)&scanhash_x16rt;
-  gate->hash      = (void*)&x16rt_hash;
+  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | VAES_OPT;
  gate->build_extraheader = (void*)&veil_build_extraheader;
@@ -247,7 +262,7 @@ bool register_x16rt_veil_algo( algo_gate_t* gate )
 bool register_hex_algo( algo_gate_t* gate )
 {
  gate->scanhash        = (void*)&scanhash_hex;
-  gate->hash            = (void*)&hex_hash;
+  gate->hash            = (void*)&x16r_hash;
  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  opt_target_factor = 128.0;
@@ -260,13 +275,13 @@ bool register_hex_algo( algo_gate_t* gate )

 bool register_x21s_algo( algo_gate_t* gate )
 {
-#if defined (X21S_8WAY)
+#if defined (X16R_8WAY)
  gate->scanhash          = (void*)&scanhash_x21s_8way;
  gate->hash              = (void*)&x21s_8way_hash;
  gate->miner_thread_init = (void*)&x21s_8way_thread_init;
  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT
                            | VAES_OPT;
-#elif defined (X21S_4WAY)
+#elif defined (X16R_4WAY)
  gate->scanhash          = (void*)&scanhash_x21s_4way;
  gate->hash              = (void*)&x21s_4way_hash;
  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -5,29 +5,60 @@
 #include "simd-utils.h"
 #include <stdint.h>
 #include <unistd.h>
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X16R_8WAY 1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define X16R_4WAY 1
+#include "algo/blake/sph_blake.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/groestl/sph_groestl.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/nist.h"
+#include "algo/echo/sph_echo.h"
+#include "algo/hamsi/sph_hamsi.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/sph_shabal.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include <openssl/sha.h>
+#if defined(__AES__)
+  #include "algo/echo/aes_ni/hash_api.h"
+  #include "algo/groestl/aes_ni/hash-groestl.h"
 #endif
+#if defined (__AVX2__)
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/shabal/shabal-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"
+#if defined(__VAES__)
+  #include "algo/groestl/groestl512-hash-4way.h"
+  #include "algo/shavite/shavite-hash-4way.h"
+  #include "algo/echo/echo-hash-4way.h"
+#endif
+#endif // AVX2

 #if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
+
+  #define X16R_8WAY   1
  #define X16RV2_8WAY 1
+  #define X16RT_8WAY  1
+  #define X21S_8WAY   1
+
 #elif defined(__AVX2__) && defined(__AES__)
+
  #define X16RV2_4WAY 1
-#endif
+  #define X16RT_4WAY  1
+  #define X21S_4WAY   1
+  #define X16R_4WAY   1

-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X16RT_8WAY 1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define X16RT_4WAY 1
-#endif
-
-#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
-  #define X21S_8WAY 1
-#elif defined(__AVX2__) && defined(__AES__)
-  #define X21S_4WAY 1
 #endif

 enum x16r_Algo {
@@ -50,6 +81,8 @@ enum x16r_Algo {
        X16R_HASH_FUNC_COUNT
 };

+extern __thread char x16r_hash_order[ X16R_HASH_FUNC_COUNT + 1 ];
+
 extern void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
 void x16r_getAlgoString( const uint8_t *prevblock, char *output );
 void x16s_getAlgoString( const uint8_t *prevblock, char *output );
@@ -67,25 +100,115 @@ bool register_x21s__algo( algo_gate_t* gate );
 // x16r, x16s
 #if defined(X16R_8WAY)

-void x16r_8way_hash( void *state, const void *input );
-int scanhash_x16r_8way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
+union _x16r_8way_context_overlay // One context per hash function of the 8-way chain; being a union, all members share the same storage.
+{
+    blake512_8way_context   blake;
+    bmw512_8way_context     bmw;
+    skein512_8way_context   skein;
+    jh512_8way_context      jh;
+    keccak512_8way_context  keccak;
+    luffa_4way_context      luffa;
+    cubehashParam           cube;
+    simd_4way_context       simd;
+    hamsi512_8way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_8way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_8way_context     sha512;
+#if defined(__VAES__) // with VAES the 4-way groestl/shavite/echo context types are used
+    groestl512_4way_context groestl;
+    shavite512_4way_context shavite;
+    echo_4way_context       echo;
+#else // without VAES the non-4way context types are used instead
+    hashState_groestl       groestl;
+    sph_shavite512_context  shavite;
+    hashState_echo          echo;
+#endif
+} __attribute__ ((aligned (64))); // the whole overlay is 64-byte aligned
+
+typedef union _x16r_8way_context_overlay x16r_8way_context_overlay;
+
+extern __thread x16r_8way_context_overlay x16r_ctx; // defined in x16r-gate.c
+
+void x16r_8way_prehash( void *, void * ); // called as ( vdata, pdata ) from scanhash_x16r_8way
+void x16r_8way_hash_generic( void *, const void * ); // ( output, input ) - presumably the untruncated variant, as in the 4-way code; confirm
+void x16r_8way_hash( void *, const void * ); // ( output, input )
+int scanhash_x16r_8way( struct work *, uint32_t ,
+                        uint64_t *, struct thr_info * );
+extern __thread x16r_8way_context_overlay x16r_ctx; // NOTE(review): repeats the extern declaration above; redundant


 #elif defined(X16R_4WAY)

-void x16r_4way_hash( void *state, const void *input );
-int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done, struct thr_info *mythr );
+union _x16r_4way_context_overlay // One context per hash function of the 4-way chain; being a union, all members share the same storage.
+{
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_echo          echo;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    luffa_2way_context      luffa; // 2-way Luffa context, used by the Luffa prehash
+    hashState_luffa         luffa1; // NOTE(review): presumably a single-lane Luffa context - its use is not visible here
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    simd_2way_context       simd;
+    hamsi512_4way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_4way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_4way_context     sha512;
+} __attribute__ ((aligned (64))); // the whole overlay is 64-byte aligned

-#else
+typedef union _x16r_4way_context_overlay x16r_4way_context_overlay;

-void x16r_hash( void *state, const void *input );
-int scanhash_x16r( struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done, struct thr_info *mythr );
+extern __thread x16r_4way_context_overlay x16r_ctx; // defined in x16r-gate.c
+
+void x16r_4way_prehash( void *, void * ); // ( vdata, pdata )
+void x16r_4way_hash_generic( void *, const void * ); // ( output, input ): writes 64 bytes per lane, 256 in total
+void x16r_4way_hash( void *, const void * ); // ( output, input ): writes 32 bytes per lane, 128 in total
+int scanhash_x16r_4way( struct work *, uint32_t,
+                        uint64_t *, struct thr_info * );
+extern __thread x16r_4way_context_overlay x16r_ctx; // NOTE(review): repeats the extern declaration above; redundant

 #endif

+// needed for hex: declared outside the 8-way/4-way conditional above, so it is available in every build
+union _x16r_context_overlay
+{
+#if defined(__AES__) // with AES the hashState_* echo/groestl contexts are used
+        hashState_echo          echo;
+        hashState_groestl       groestl;
+#else // without AES the sph_* contexts are used instead
+        sph_groestl512_context   groestl;
+        sph_echo512_context      echo;
+#endif
+        sph_blake512_context    blake;
+        sph_bmw512_context      bmw;
+        sph_skein512_context    skein;
+        sph_jh512_context       jh;
+        sph_keccak512_context   keccak;
+        hashState_luffa         luffa;
+        cubehashParam           cube;
+        sph_shavite512_context  shavite;
+        hashState_sd            simd;
+        sph_hamsi512_context    hamsi;
+        sph_fugue512_context    fugue;
+        sph_shabal512_context   shabal;
+        sph_whirlpool_context   whirlpool;
+        SHA512_CTX              sha512;
+} __attribute__ ((aligned (64))); // 64-byte aligned; the union removed from x16r.c by this commit had no alignment attribute
+
+typedef union _x16r_context_overlay x16r_context_overlay;
+
+extern __thread x16r_context_overlay x16_ctx; // defined in x16r-gate.c; note the name is x16_ctx, not x16r_ctx
+
+void x16r_prehash( void *, void * ); // ( edata, pdata )
+void x16r_hash_generic( void *, const void * ); // ( output, input ): writes the full 64-byte result
+void x16r_hash( void *, const void * ); // ( output, input ): writes only the first 32 bytes
+int scanhash_x16r( struct work *, uint32_t, uint64_t *, struct thr_info * );
+
 // x16Rv2
 #if defined(X16RV2_8WAY)

@@ -108,35 +231,35 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
 #endif

 // x16rt, veil
-#if defined(X16RT_8WAY)
+#if defined(X16R_8WAY)

-void x16rt_8way_hash( void *state, const void *input );
+//void x16rt_8way_hash( void *state, const void *input );
 int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

-#elif defined(X16RT_4WAY)
+#elif defined(X16R_4WAY)

-void x16rt_4way_hash( void *state, const void *input );
+//void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

 #else

-void x16rt_hash( void *state, const void *input );
+//void x16rt_hash( void *state, const void *input );
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

 #endif

 // x21s
-#if defined(X21S_8WAY)
+#if defined(X16R_8WAY)

 void x21s_8way_hash( void *state, const void *input );
 int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );
 bool x21s_8way_thread_init();

-#elif defined(X21S_4WAY)
+#elif defined(X16R_4WAY)

 void x21s_4way_hash( void *state, const void *input );
 int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
@@ -152,7 +275,7 @@ bool x21s_thread_init();

 #endif

-void hex_hash( void *state, const void *input );
+//void hex_hash( void *state, const void *input );
 int scanhash_hex( struct work *work, uint32_t max_nonce,
                  uint64_t *hashes_done, struct thr_info *mythr );

--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -9,72 +9,56 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "algo/blake/sph_blake.h"
-#include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
-#include "algo/jh/sph_jh.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/simd/nist.h"
-#include "algo/echo/sph_echo.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include <openssl/sha.h>
-#if defined(__AES__)
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-#endif

-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
-union _x16r_context_overlay
+void x16r_prehash( void *edata, void *pdata ) // Midstate prehash: absorbs the first 64 bytes of edata into x16_ctx for the cases below; pdata is not referenced in the body.
 {
-#if defined(__AES__)
-        hashState_echo          echo;
-        hashState_groestl       groestl;
-#else
-        sph_groestl512_context   groestl;
-        sph_echo512_context      echo;
-#endif
-        sph_blake512_context    blake;
-        sph_bmw512_context      bmw;
-        sph_skein512_context    skein;
-        sph_jh512_context       jh;
-        sph_keccak512_context   keccak;
-        hashState_luffa         luffa;
-        cubehashParam           cube;
-        sph_shavite512_context  shavite;
-        hashState_sd            simd;
-        sph_hamsi512_context    hamsi;
-        sph_fugue512_context    fugue;
-        sph_shabal512_context   shabal;
-        sph_whirlpool_context   whirlpool;
-        SHA512_CTX              sha512;
-};
-typedef union _x16r_context_overlay x16r_context_overlay;
+   const char elem = x16r_hash_order[0]; // only the first function in the hash order is prehashed
+   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; // order chars '0'-'9' decode to 0-9, 'A' and above to 10+

-void x16r_hash( void* output, const void* input )
+   switch ( algo )
+   {
+      case JH:
+         sph_jh512_init( &x16_ctx.jh );
+         sph_jh512( &x16_ctx.jh, edata, 64 ); // x16r_hash_generic later feeds the remaining 16 bytes (in+64) when i == 0
+      break;
+      case SKEIN:
+         sph_skein512_init( &x16_ctx.skein );
+         sph_skein512( &x16_ctx.skein, edata, 64 );
+      break;
+      case LUFFA:
+         init_luffa( &x16_ctx.luffa, 512 );
+         update_luffa( &x16_ctx.luffa, (const BitSequence*)edata, 64 );
+      break;
+      case CUBEHASH:
+         cubehashInit( &x16_ctx.cube, 512, 16, 32 );
+         cubehashUpdate( &x16_ctx.cube, (const byte*)edata, 64 );
+      break;
+      case HAMSI:
+         sph_hamsi512_init( &x16_ctx.hamsi );
+         sph_hamsi512( &x16_ctx.hamsi, edata, 64 );
+      break;
+      case SHABAL:
+         sph_shabal512_init( &x16_ctx.shabal );
+         sph_shabal512( &x16_ctx.shabal, edata, 64 );
+      break;
+      case WHIRLPOOL:
+         sph_whirlpool_init( &x16_ctx.whirlpool );
+         sph_whirlpool( &x16_ctx.whirlpool, edata, 64 );
+      break;
+   } // no default: for any other first algo x16_ctx is left untouched
+}
+
+void x16r_hash_generic( void* output, const void* input )
 {
   uint32_t _ALIGN(128) hash[16];
   x16r_context_overlay ctx;
+   memcpy( &ctx, &x16_ctx, sizeof(ctx) );
   void *in = (void*) input;
   int size = 80;
-/*
-   if ( s_ntime == UINT32_MAX )
-   {
-      const uint8_t* in8 = (uint8_t*) input;
-      x16_r_s_getAlgoString( &in8[4], hashOrder );
-   }
-*/
+
   for ( int i = 0; i < 16; i++ )
   {
-      const char elem = hashOrder[i];
+      const char elem = x16r_hash_order[i];
      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';

      switch ( algo )
@@ -91,23 +75,21 @@ void x16r_hash( void* output, const void* input )
         break;
         case GROESTL:
 #if defined(__AES__)
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash,
-                                      (const char*)in, size<<3 );
+            groestl512_full( &ctx.groestl, (char*)hash, (char*)in, size<<3 );
 #else
            sph_groestl512_init( &ctx.groestl );
            sph_groestl512( &ctx.groestl, in, size );
            sph_groestl512_close(&ctx.groestl, hash);
 #endif
         break;
-         case SKEIN:
-            sph_skein512_init( &ctx.skein );
-            sph_skein512( &ctx.skein, in, size );
-            sph_skein512_close( &ctx.skein, hash );
-         break;
         case JH:
-            sph_jh512_init( &ctx.jh );
-            sph_jh512(&ctx.jh, in, size );
+            if ( i == 0 )
+               sph_jh512(&ctx.jh, in+64, 16 );
+            else
+            {
+               sph_jh512_init( &ctx.jh );
+               sph_jh512(&ctx.jh, in, size );
+            }
            sph_jh512_close(&ctx.jh, hash );
         break;
         case KECCAK:
@@ -115,15 +97,31 @@ void x16r_hash( void* output, const void* input )
            sph_keccak512( &ctx.keccak, in, size );
            sph_keccak512_close( &ctx.keccak, hash );
         break;
+         case SKEIN:
+            if ( i == 0 )
+               sph_skein512(&ctx.skein, in+64, 16 );
+            else
+            {
+               sph_skein512_init( &ctx.skein );
+               sph_skein512( &ctx.skein, in, size );
+            }
+            sph_skein512_close( &ctx.skein, hash );
+         break;
         case LUFFA:
-            init_luffa( &ctx.luffa, 512 );
-            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
-                                    (const BitSequence*)in, size );
+            if ( i == 0 )
+               update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                             (const BitSequence*)in+64, 16 );
+            else
+               luffa_full( &ctx.luffa, (BitSequence*)hash, 512,
+                                 (const BitSequence*)in, size );
         break;
         case CUBEHASH:
-            cubehashInit( &ctx.cube, 512, 16, 32 );
-            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
-                                  (const byte*)in, size );
+            if ( i == 0 )
+               cubehashUpdateDigest( &ctx.cube, (byte*)hash,
+                                          (const byte*)in+64, 16 );
+            else
+               cubehash_full( &ctx.cube, (byte*)hash, 512,
+                                         (byte*)in, size );
         break;
         case SHAVITE:
            sph_shavite512_init( &ctx.shavite );
@@ -131,93 +129,109 @@ void x16r_hash( void* output, const void* input )
            sph_shavite512_close( &ctx.shavite, hash );
         break;
         case SIMD:
-             init_sd( &ctx.simd, 512 );
-             update_final_sd( &ctx.simd, (BitSequence *)hash,
-                              (const BitSequence*)in, size<<3 );
+            simd_full( &ctx.simd, (BitSequence *)hash,
+                             (const BitSequence*)in, size<<3 );
         break;
         case ECHO:
 #if defined(__AES__)
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash,
-                                (const BitSequence*)in, size<<3 );
+            echo_full( &ctx.echo, hash, 512, in, size );
 #else
-             sph_echo512_init( &ctx.echo );
-             sph_echo512( &ctx.echo, in, size );
-             sph_echo512_close( &ctx.echo, hash );
+            sph_echo512_init( &ctx.echo );
+            sph_echo512( &ctx.echo, in, size );
+            sph_echo512_close( &ctx.echo, hash );
 #endif
         break;
         case HAMSI:
-             sph_hamsi512_init( &ctx.hamsi );
-             sph_hamsi512( &ctx.hamsi, in, size );
-             sph_hamsi512_close( &ctx.hamsi, hash );
+            if ( i == 0 )
+               sph_hamsi512( &ctx.hamsi, in+64, 16 );
+            else
+            {
+               sph_hamsi512_init( &ctx.hamsi );
+               sph_hamsi512( &ctx.hamsi, in, size );
+            }
+            sph_hamsi512_close( &ctx.hamsi, hash );
         break;
         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in, size );
-             sph_fugue512_close( &ctx.fugue, hash );
+            sph_fugue512_init( &ctx.fugue );
+            sph_fugue512( &ctx.fugue, in, size );
+            sph_fugue512_close( &ctx.fugue, hash );
         break;
         case SHABAL:
-             sph_shabal512_init( &ctx.shabal );
-             sph_shabal512( &ctx.shabal, in, size );
-             sph_shabal512_close( &ctx.shabal, hash );
+            if ( i == 0 )
+               sph_shabal512( &ctx.shabal, in+64, 16 );
+            else
+            {
+               sph_shabal512_init( &ctx.shabal );
+               sph_shabal512( &ctx.shabal, in, size );
+            }
+            sph_shabal512_close( &ctx.shabal, hash );
         break;
         case WHIRLPOOL:
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash );
+            if ( i == 0 )
+               sph_whirlpool( &ctx.whirlpool, in+64, 16 );
+            else
+            {
+               sph_whirlpool_init( &ctx.whirlpool );
+               sph_whirlpool( &ctx.whirlpool, in, size );
+            }
+            sph_whirlpool_close( &ctx.whirlpool, hash );
         break;
         case SHA_512:
-             SHA512_Init( &ctx.sha512 );
-             SHA512_Update( &ctx.sha512, in, size );
-             SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
+            SHA512_Init( &ctx.sha512 );
+            SHA512_Update( &ctx.sha512, in, size );
+            SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
         break;
      }
      in = (void*) hash;
      size = 64;
   }
-   memcpy(output, hash, 32);
+   memcpy( output, hash, 64 );
+}
+
+void x16r_hash( void* output, const void* input ) // Wrapper over x16r_hash_generic that copies only the first 32 bytes to output.
+{  
+   uint8_t hash[64] __attribute__ ((aligned (64))); // full 64-byte result from x16r_hash_generic
+   x16r_hash_generic( hash, input );
+   
+   memcpy( output, hash, 32 ); // truncate to 32 bytes
 }

 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash32[8];
-   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t _ALIGN(128) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id; 
   uint32_t nonce = first_nonce;
-   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   volatile uint8_t *restart = &( work_restart[thr_id].restart );
+   const bool bench = opt_benchmark;
+   if ( bench )  ptarget[7] = 0x0cff;

-   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   mm128_bswap32_80( edata, pdata );

+   static __thread uint32_t s_ntime = UINT32_MAX;
   if ( s_ntime != pdata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
-      x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+           applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

-   if ( opt_benchmark )
-      ptarget[7] = 0x0cff;
+   x16r_prehash( edata, pdata );

   do
   {
-      be32enc( &endiandata[19], nonce );
-      x16r_hash( hash32, endiandata );
+      edata[19] = nonce;
+      x16r_hash( hash32, edata );

-      if ( hash32[7] <= Htarg )
-      if (fulltest( hash32, ptarget ) && !opt_benchmark )
+      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
-         pdata[19] = nonce;
+         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash32, mythr );
      }
      nonce++;
@@ -226,3 +240,4 @@ int scanhash_x16r( struct work *work, uint32_t max_nonce,
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
 }
+
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -2,481 +2,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "algo/blake/blake-hash-4way.h"
-#include "algo/bmw/bmw-hash-4way.h"
-#include "algo/groestl/aes_ni/hash-groestl.h"
-#include "algo/groestl/aes_ni/hash-groestl.h"
-#include "algo/skein/skein-hash-4way.h"
-#include "algo/jh/jh-hash-4way.h"
-#include "algo/keccak/keccak-hash-4way.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/luffa-hash-2way.h"
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/cubehash/cube-hash-2way.h"
-#include "algo/simd/simd-hash-2way.h"
-#include "algo/echo/aes_ni/hash_api.h"
-#include "algo/hamsi/hamsi-hash-4way.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/shabal-hash-4way.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/sha/sha-hash-4way.h"
-#if defined(__VAES__)
-  #include "algo/groestl/groestl512-hash-4way.h"
-  #include "algo/shavite/shavite-hash-4way.h"
-  #include "algo/echo/echo-hash-4way.h"
-#endif

-#if defined (X16RT_8WAY)
-
-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
-union _x16rt_8way_context_overlay
-{
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cubehashParam           cube;
-//    cube_4way_context       cube;
-    simd_4way_context       simd;
-    hamsi512_8way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
-#if defined(__VAES__)
-    groestl512_4way_context groestl;
-    shavite512_4way_context shavite;
-    echo_4way_context       echo;
-#else
-    hashState_groestl       groestl;
-    sph_shavite512_context  shavite;
-    hashState_echo          echo;
-#endif
-} __attribute__ ((aligned (64)));
-
-typedef union _x16rt_8way_context_overlay x16rt_8way_context_overlay;
-
-static __thread x16rt_8way_context_overlay x16rt_ctx;
-
-void x16rt_8way_hash( void* output, const void* input )
-{
-   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
-   uint32_t hash4[20] __attribute__ ((aligned (64)));
-   uint32_t hash5[20] __attribute__ ((aligned (64)));
-   uint32_t hash6[20] __attribute__ ((aligned (64)));
-   uint32_t hash7[20] __attribute__ ((aligned (64)));
-   x16rt_8way_context_overlay ctx;
-   memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-   void *in0 = (void*) hash0;
-   void *in1 = (void*) hash1;
-   void *in2 = (void*) hash2;
-   void *in3 = (void*) hash3;
-   void *in4 = (void*) hash4;
-   void *in5 = (void*) hash5;
-   void *in6 = (void*) hash6;
-   void *in7 = (void*) hash7;
-   int size = 80;
-
-   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 input, 640 );
-
-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            if ( i == 0 )
-               blake512_8way_full( &ctx.blake, vhash, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               blake512_8way_full( &ctx.blake, vhash, vhash, size );
-            }
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
-                                 hash6, hash7, vhash );
-         break;
-         case BMW:
-            bmw512_8way_init( &ctx.bmw );
-            if ( i == 0 )
-               bmw512_8way_update( &ctx.bmw, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-            bmw512_8way_update( &ctx.bmw, vhash, size );
-            }
-            bmw512_8way_close( &ctx.bmw, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case GROESTL:
-#if defined(__VAES__)
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            groestl512_4way_init( &ctx.groestl, 64 );
-            groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            groestl512_4way_init( &ctx.groestl, 64 );
-            groestl512_4way_update_close( &ctx.groestl, vhash, vhash, size<<3 );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-#else
-            groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 );
-#endif
-               break;
-         case JH:
-            if ( i == 0 )
-               jh512_8way_update( &ctx.jh, input + (64<<3), 16 );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               jh512_8way_init( &ctx.jh );
-               jh512_8way_update( &ctx.jh, vhash, size );
-            }
-            jh512_8way_close( &ctx.jh, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case KECCAK:
-            keccak512_8way_init( &ctx.keccak );
-            if ( i == 0 )
-               keccak512_8way_update( &ctx.keccak, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               keccak512_8way_update( &ctx.keccak, vhash, size );
-            }
-            keccak512_8way_close( &ctx.keccak, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case SKEIN:
-            if ( i == 0 )
-               skein512_8way_update( &ctx.skein, input + (64<<3), 16 );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               skein512_8way_init( &ctx.skein );
-               skein512_8way_update( &ctx.skein, vhash, size );
-            }
-            skein512_8way_close( &ctx.skein, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case LUFFA:
-            if ( i == 0 )
-            {
-                intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-                luffa_4way_update_close( &ctx.luffa, vhash,
-                                                     vhash + (16<<2), 16 );
-                dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-                memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-                intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-                luffa_4way_update_close( &ctx.luffa, vhash,
-                                                     vhash + (16<<2), 16 );
-                dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-            }
-            else
-            {
-               intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-               luffa512_4way_full( &ctx.luffa, vhash, vhash, size );
-               dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-               intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-               luffa512_4way_full( &ctx.luffa, vhash, vhash, size );
-               dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-            }
-         break;
-         case CUBEHASH:
-            if ( i == 0 )
-            {
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
-                                            (const byte*)in0 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
-                                            (const byte*)in1 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
-                                            (const byte*)in2 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
-                                            (const byte*)in3 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash4,
-                                            (const byte*)in4 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash5,
-                                            (const byte*)in5 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash6,
-                                            (const byte*)in6 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash7,
-                                            (const byte*)in7 + 64, 16 );
-            }
-            else
-            {
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
-                                             (const byte*)in0, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
-                                             (const byte*)in1, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
-                                             (const byte*)in2, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
-                                             (const byte*)in3, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash4,
-                                             (const byte*)in4, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash5,
-                                             (const byte*)in5, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash6,
-                                             (const byte*)in6, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash7,
-                                             (const byte*)in7, size );
-            }
-         break;
-         case SHAVITE:
-#if defined(__VAES__)
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            shavite512_4way_full( &ctx.shavite, vhash, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            shavite512_4way_full( &ctx.shavite, vhash, vhash, size );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-#else
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in4, size );
-            sph_shavite512_close( &ctx.shavite, hash4 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in5, size );
-            sph_shavite512_close( &ctx.shavite, hash5 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in6, size );
-            sph_shavite512_close( &ctx.shavite, hash6 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in7, size );
-            sph_shavite512_close( &ctx.shavite, hash7 );
-#endif
-            break;
-         case SIMD:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            simd512_4way_full( &ctx.simd, vhash, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            simd512_4way_full( &ctx.simd, vhash, vhash, size );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case ECHO:
-#if defined(__VAES__)
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            echo_4way_full( &ctx.echo, vhash, 512, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            echo_4way_full( &ctx.echo, vhash, 512, vhash, size );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-#else
-            echo_full( &ctx.echo, (BitSequence *)hash0, 512,
-                              (const BitSequence *)in0, size );
-            echo_full( &ctx.echo, (BitSequence *)hash1, 512,
-                              (const BitSequence *)in1, size );
-            echo_full( &ctx.echo, (BitSequence *)hash2, 512,
-                              (const BitSequence *)in2, size );
-            echo_full( &ctx.echo, (BitSequence *)hash3, 512,
-                              (const BitSequence *)in3, size );
-            echo_full( &ctx.echo, (BitSequence *)hash4, 512,
-                              (const BitSequence *)in4, size );
-            echo_full( &ctx.echo, (BitSequence *)hash5, 512,
-                              (const BitSequence *)in5, size );
-            echo_full( &ctx.echo, (BitSequence *)hash6, 512,
-                              (const BitSequence *)in6, size );
-            echo_full( &ctx.echo, (BitSequence *)hash7, 512,
-                              (const BitSequence *)in7, size );
-#endif
-         break;
-         case HAMSI:
-            if ( i == 0 )
-               hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               hamsi512_8way_init( &ctx.hamsi );
-               hamsi512_8way_update( &ctx.hamsi, vhash, size );
-            }
-            hamsi512_8way_close( &ctx.hamsi, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in4, size );
-             sph_fugue512_close( &ctx.fugue, hash4 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in5, size );
-             sph_fugue512_close( &ctx.fugue, hash5 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in6, size );
-             sph_fugue512_close( &ctx.fugue, hash6 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in7, size );
-             sph_fugue512_close( &ctx.fugue, hash7 );
-         break;
-         case SHABAL:
-             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                             size<<3 );
-             if ( i == 0 )
-                shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 );
-             else
-             {
-                shabal512_8way_init( &ctx.shabal );
-                shabal512_8way_update( &ctx.shabal, vhash, size );
-             }
-             shabal512_8way_close( &ctx.shabal, vhash );
-             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case WHIRLPOOL:
-            if ( i == 0 )
-            {
-               sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash4 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash5 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash6 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash7 );
-            }
-            else
-            {
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in0, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in1, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in2, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in3, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in4, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash4 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in5, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash5 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in6, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash6 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in7, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash7 );
-            }
-         break;
-         case SHA_512:
-             sha512_8way_init( &ctx.sha512 );
-             if ( i == 0 )
-                sha512_8way_update( &ctx.sha512, input, size );
-             else
-             {
-                intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                             size<<3 );
-                sha512_8way_update( &ctx.sha512, vhash, size );
-             }
-             sha512_8way_close( &ctx.sha512, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                               hash7, vhash );
-          break;
-      }
-      size = 64;
-   }
-
-   memcpy( output,     hash0, 32 );
-   memcpy( output+32,  hash1, 32 );
-   memcpy( output+64,  hash2, 32 );
-   memcpy( output+96,  hash3, 32 );
-   memcpy( output+128, hash4, 32 );
-   memcpy( output+160, hash5, 32 );
-   memcpy( output+192, hash6, 32 );
-   memcpy( output+224, hash7, 32 );
-}
+#if defined (X16R_8WAY)

 int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) timeHash[8*8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -490,74 +23,25 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,

   if ( bench )   ptarget[7] = 0x0cff;

+   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
      x16rt_getTimeHash( ntime, &timeHash );
-      x16rt_getAlgoString( &timeHash[0], hashOrder );
+      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               hashOrder, ntime, timeHash );
-   }
-
-   // Do midstate prehash on hash functions with block size <= 64 bytes.
-   const char elem = hashOrder[0];
-   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-   switch ( algo )
-   {
-      case JH:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         jh512_8way_init( &x16rt_ctx.jh );
-         jh512_8way_update( &x16rt_ctx.jh, vdata, 64 );
-      break;
-      case SKEIN:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         skein512_8way_init( &x16rt_ctx.skein );
-         skein512_8way_update( &x16rt_ctx.skein, vdata, 64 );
-      break;
-      case LUFFA:
-         mm128_bswap32_80( edata, pdata );
-         intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
-         luffa_4way_init( &x16rt_ctx.luffa, 512 );
-         luffa_4way_update( &x16rt_ctx.luffa, vdata2, 64 );
-         rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
-      break;
-      case CUBEHASH:
-         mm128_bswap32_80( edata, pdata );
-         cubehashInit( &x16rt_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x16rt_ctx.cube, (const byte*)edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      case HAMSI:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         hamsi512_8way_init( &x16rt_ctx.hamsi );
-         hamsi512_8way_update( &x16rt_ctx.hamsi, vdata, 64 );
-      break;
-      case SHABAL:
-         mm256_bswap32_intrlv80_8x32( vdata2, pdata );
-         shabal512_8way_init( &x16rt_ctx.shabal );
-         shabal512_8way_update( &x16rt_ctx.shabal, vdata2, 64 );
-         rintrlv_8x32_8x64( vdata, vdata2, 640 );
-      break;
-      case WHIRLPOOL:
-         mm128_bswap32_80( edata, pdata );
-         sph_whirlpool_init( &x16rt_ctx.whirlpool );
-         sph_whirlpool( &x16rt_ctx.whirlpool, edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      default:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+                               x16r_hash_order, ntime, timeHash );
   }

+   x16r_8way_prehash( vdata, pdata );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
   do
   {
-      x16rt_8way_hash( hash, vdata );
+      x16r_8way_hash( hash, vdata );

      for ( int i = 0; i < 8; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
@@ -574,313 +58,13 @@ int scanhash_x16rt_8way( struct work *work, uint32_t max_nonce,
   return 0;
 }

-#elif defined (X16RT_4WAY)
-
-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
-union _x16rt_4way_context_overlay
-{
-    blake512_4way_context   blake;
-    bmw512_4way_context     bmw;
-    hashState_echo          echo;
-    hashState_groestl       groestl;
-    skein512_4way_context   skein;
-    jh512_4way_context      jh;
-    keccak512_4way_context  keccak;
-    luffa_2way_context      luffa;
-    hashState_luffa         luffa1;
-    cubehashParam           cube;
-    sph_shavite512_context  shavite;
-    simd_2way_context       simd;
-    hamsi512_4way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_4way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_4way_context     sha512;
-};
-typedef union _x16rt_4way_context_overlay x16rt_4way_context_overlay;
-
-static __thread x16rt_4way_context_overlay x16rt_ctx;
-
-void x16rt_4way_hash( void* output, const void* input )
-{
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
-   uint32_t vhash[20*4] __attribute__ ((aligned (64)));
-   x16rt_4way_context_overlay ctx;
-   memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-   void *in0 = (void*) hash0;
-   void *in1 = (void*) hash1;
-   void *in2 = (void*) hash2;
-   void *in3 = (void*) hash3;
-   int size = 80;
-
-   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
- 
-   // Input data is both 64 bit interleaved (input)
-   // and deinterleaved in inp0-3. First function has no need re-interleave.
-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            if ( i == 0 )
-               blake512_4way_full( &ctx.blake, vhash, input, size );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               blake512_4way_full( &ctx.blake, vhash, vhash, size );
-            }
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case BMW:
-            bmw512_4way_init( &ctx.bmw );
-            if ( i == 0 )
-               bmw512_4way_update( &ctx.bmw, input, size );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               bmw512_4way_update( &ctx.bmw, vhash, size );
-            }
-            bmw512_4way_close( &ctx.bmw, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case GROESTL:
-            groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
-         break;
-         case JH:
-            if ( i == 0 )
-               jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               jh512_4way_init( &ctx.jh );
-               jh512_4way_update( &ctx.jh, vhash, size );
-            }
-            jh512_4way_close( &ctx.jh, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case KECCAK:
-            keccak512_4way_init( &ctx.keccak );
-            if ( i == 0 )
-               keccak512_4way_update( &ctx.keccak, input, size );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               keccak512_4way_update( &ctx.keccak, vhash, size );
-            }
-            keccak512_4way_close( &ctx.keccak, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case SKEIN:
-            if ( i == 0 )
-               skein512_4way_update( &ctx.skein, input + (64<<2), 16 );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               skein512_4way_init( &ctx.skein );
-               skein512_4way_update( &ctx.skein, vhash, size );
-            }
-            skein512_4way_close( &ctx.skein, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case LUFFA:
-            if ( i == 0 )
-            {
-               update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash0,
-                                    (const BitSequence*)in0 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash1,
-                                    (const BitSequence*)in1 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash2,
-                                    (const BitSequence*)in2 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash3,
-                                    (const BitSequence*)in3 + 64, 16 );
-            }
-            else
-            {
-               intrlv_2x128( vhash, in0, in1, size<<3 );
-               luffa512_2way_full( &ctx.luffa, vhash, vhash, size );
-               dintrlv_2x128_512( hash0, hash1, vhash );
-               intrlv_2x128( vhash, in2, in3, size<<3 );
-               luffa512_2way_full( &ctx.luffa, vhash, vhash, size );
-               dintrlv_2x128_512( hash2, hash3, vhash );
-            }
-         break;
-         case CUBEHASH:
-            if ( i == 0 )
-            {
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
-                                          (const byte*)in0 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
-                                          (const byte*)in1 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
-                                          (const byte*)in2 + 64, 16 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
-                                          (const byte*)in3 + 64, 16 );
-
-            }
-            else
-            {
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
-                                          (const byte*)in0, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
-                                     (const byte*)in1, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
-                                     (const byte*)in2, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
-                                     (const byte*)in3, size );
-            }
-         break;
-         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-         break;
-         case SIMD:
-            intrlv_2x128( vhash, in0, in1, size<<3 );
-            simd_2way_init( &ctx.simd, 512 );
-            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash0, hash1, vhash, 512 );
-            intrlv_2x128( vhash, in2, in3, size<<3 );
-            simd_2way_init( &ctx.simd, 512 );
-            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash2, hash3, vhash, 512 );
-         break;
-         case ECHO:
-            echo_full( &ctx.echo, (BitSequence *)hash0, 512,
-                              (const BitSequence *)in0, size );
-            echo_full( &ctx.echo, (BitSequence *)hash1, 512,
-                              (const BitSequence *)in1, size );
-            echo_full( &ctx.echo, (BitSequence *)hash2, 512,
-                              (const BitSequence *)in2, size );
-            echo_full( &ctx.echo, (BitSequence *)hash3, 512,
-                              (const BitSequence *)in3, size );
-         break;
-         case HAMSI:
-            if ( i == 0 )
-               hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               hamsi512_4way_init( &ctx.hamsi );
-               hamsi512_4way_update( &ctx.hamsi, vhash, size );
-            }
-            hamsi512_4way_close( &ctx.hamsi, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
-         break;
-         case SHABAL:
-            intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
-            if ( i == 0 )
-               shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 );
-            else
-            {
-               shabal512_4way_init( &ctx.shabal );
-               shabal512_4way_update( &ctx.shabal, vhash, size );
-            }
-            shabal512_4way_close( &ctx.shabal, vhash );
-            dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case WHIRLPOOL:
-            if ( i == 0 )
-            {
-               sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               memcpy( &ctx, &x16rt_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-            }
-            else
-            {
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in0, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in1, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in2, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in3, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-            }
-            break;
-         case SHA_512:
-            sha512_4way_init( &ctx.sha512 );
-            if ( i == 0 )
-               sha512_4way_update( &ctx.sha512, input, size );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               sha512_4way_update( &ctx.sha512, vhash, size );
-            }
-            sha512_4way_close( &ctx.sha512, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-      }
-      size = 64;
-   }
-   memcpy( output,    hash0, 32 );
-   memcpy( output+32, hash1, 32 );
-   memcpy( output+64, hash2, 32 );
-   memcpy( output+96, hash3, 32 );
-}
+#elif defined (X16R_4WAY)

 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr)
 {
   uint32_t hash[4*16] __attribute__ ((aligned (64)));
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
-   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) timeHash[4*8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -894,70 +78,24 @@ int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,

   if ( bench )  ptarget[7] = 0x0cff;

+   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
      x16rt_getTimeHash( ntime, &timeHash );
-      x16rt_getAlgoString( &timeHash[0], hashOrder );
+      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               hashOrder, ntime, timeHash );
-   }
-
-   const char elem = hashOrder[0];
-   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-   switch ( algo )
-   {
-      case JH:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         jh512_4way_init( &x16rt_ctx.jh );
-         jh512_4way_update( &x16rt_ctx.jh, vdata, 64 );
-      break;
-      case SKEIN:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_init( &x16rt_ctx.skein );
-         skein512_4way_update( &x16rt_ctx.skein, vdata, 64 );
-      break;
-      case LUFFA:
-         mm128_bswap32_80( edata, pdata );
-         init_luffa( &x16rt_ctx.luffa1, 512 );
-         update_luffa( &x16rt_ctx.luffa1, (const BitSequence*)edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-      break;
-      case CUBEHASH:
-         mm128_bswap32_80( edata, pdata );
-         cubehashInit( &x16rt_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x16rt_ctx.cube, (const byte*)edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-      break;
-      case HAMSI:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         hamsi512_4way_init( &x16rt_ctx.hamsi );
-         hamsi512_4way_update( &x16rt_ctx.hamsi, vdata, 64 );
-      break;
-      case SHABAL:
-         mm128_bswap32_intrlv80_4x32( vdata32, pdata );
-         shabal512_4way_init( &x16rt_ctx.shabal );
-         shabal512_4way_update( &x16rt_ctx.shabal, vdata32, 64 );
-         rintrlv_4x32_4x64( vdata, vdata32, 640 );
-      break;
-      case WHIRLPOOL:
-         mm128_bswap32_80( edata, pdata );
-         sph_whirlpool_init( &x16rt_ctx.whirlpool );
-         sph_whirlpool( &x16rt_ctx.whirlpool, edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-      break;
-      default:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+                               x16r_hash_order, ntime, timeHash );
   }

+   x16r_4way_prehash( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
-
   do
   {
-      x16rt_4way_hash( hash, vdata );
+      x16r_4way_hash( hash, vdata );
      for ( int i = 0; i < 4; i++ )
      if ( unlikely( valid_hash( hash + (i<<3), ptarget ) && !bench ) )
      {
--- a/algo/x16/x16rt.c
+++ b/algo/x16/x16rt.c
@@ -1,234 +1,46 @@
 #include "x16r-gate.h"

-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "algo/blake/sph_blake.h"
-#include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
-#include "algo/jh/sph_jh.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/simd/nist.h"
-#include "algo/echo/sph_echo.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include <openssl/sha.h>
-#if defined(__AES__)
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-#endif
-
-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread bool s_implemented = false;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
-union _x16rt_context_overlay
-{
-#if defined(__AES__)
-        hashState_echo          echo;
-        hashState_groestl       groestl;
-#else
-        sph_groestl512_context   groestl;
-        sph_echo512_context      echo;
-#endif
-        sph_blake512_context    blake;
-        sph_bmw512_context      bmw;
-        sph_skein512_context    skein;
-        sph_jh512_context       jh;
-        sph_keccak512_context   keccak;
-        hashState_luffa         luffa;
-        cubehashParam           cube;
-        sph_shavite512_context  shavite;
-        hashState_sd            simd;
-        sph_hamsi512_context    hamsi;
-        sph_fugue512_context    fugue;
-        sph_shabal512_context   shabal;
-        sph_whirlpool_context   whirlpool;
-        SHA512_CTX              sha512;
-};
-typedef union _x16rt_context_overlay x16rt_context_overlay;
-
-void x16rt_hash( void* output, const void* input )
-{
-   uint32_t _ALIGN(128) hash[16];
-   x16rt_context_overlay ctx;
-   int size = 80;
-   void *in = (void*) input;
-
-/*
-   void *in = (void*) input;
-   uint32_t *in32 = (uint32_t*) in;
-   uint32_t ntime = in32[17];
-   if ( s_ntime == UINT32_MAX )
-   {
-      uint32_t _ALIGN(64) timeHash[8];
-      x16rt_getTimeHash(ntime, &timeHash);
-      x16rt_getAlgoString(&timeHash[0], hashOrder);
-   }
-*/
-   
-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            sph_blake512_init( &ctx.blake );
-            sph_blake512( &ctx.blake, in, size );
-            sph_blake512_close( &ctx.blake, hash );
-         break;
-         case BMW:
-            sph_bmw512_init( &ctx.bmw );
-            sph_bmw512(&ctx.bmw, in, size);
-            sph_bmw512_close(&ctx.bmw, hash);
-         break;
-         case GROESTL:
-#if defined(__AES__)
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash,
-                                      (const char*)in, size<<3 );
-#else
-            sph_groestl512_init( &ctx.groestl );
-            sph_groestl512( &ctx.groestl, in, size );
-            sph_groestl512_close(&ctx.groestl, hash);
-#endif
-         break;
-         case SKEIN:
-            sph_skein512_init( &ctx.skein );
-            sph_skein512( &ctx.skein, in, size );
-            sph_skein512_close( &ctx.skein, hash );
-         break;
-         case JH:
-            sph_jh512_init( &ctx.jh );
-            sph_jh512(&ctx.jh, in, size );
-            sph_jh512_close(&ctx.jh, hash );
-         break;
-         case KECCAK:
-            sph_keccak512_init( &ctx.keccak );
-            sph_keccak512( &ctx.keccak, in, size );
-            sph_keccak512_close( &ctx.keccak, hash );
-         break;
-         case LUFFA:
-            init_luffa( &ctx.luffa, 512 );
-            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
-                                    (const BitSequence*)in, size );
-         break;
-         case CUBEHASH:
-            cubehashInit( &ctx.cube, 512, 16, 32 );
-            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
-                                  (const byte*)in, size );
-         break;
-         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in, size );
-            sph_shavite512_close( &ctx.shavite, hash );
-         break;
-         case SIMD:
-             init_sd( &ctx.simd, 512 );
-             update_final_sd( &ctx.simd, (BitSequence *)hash,
-                              (const BitSequence*)in, size<<3 );
-         break;
-         case ECHO:
-#if defined(__AES__)
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash,
-                                (const BitSequence*)in, size<<3 );
-#else
-             sph_echo512_init( &ctx.echo );
-             sph_echo512( &ctx.echo, in, size );
-             sph_echo512_close( &ctx.echo, hash );
-#endif
-         break;
-         case HAMSI:
-             sph_hamsi512_init( &ctx.hamsi );
-             sph_hamsi512( &ctx.hamsi, in, size );
-             sph_hamsi512_close( &ctx.hamsi, hash );
-         break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in, size );
-             sph_fugue512_close( &ctx.fugue, hash );
-         break;
-         case SHABAL:
-             sph_shabal512_init( &ctx.shabal );
-             sph_shabal512( &ctx.shabal, in, size );
-             sph_shabal512_close( &ctx.shabal, hash );
-         break;
-         case WHIRLPOOL:
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash );
-         break;
-         case SHA_512:
-             SHA512_Init( &ctx.sha512 );
-             SHA512_Update( &ctx.sha512, in, size );
-             SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
-         break;
-      }
-      in = (void*) hash;
-      size = 64;
-   }
-   memcpy(output, hash, 32);
-}
+#if !defined(X16R_8WAY) && !defined(X16R_4WAY)

 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash32[8];
-   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t _ALIGN(128) edata[20];
   uint32_t _ALIGN(64) timeHash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id; 
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   if ( bench )  ptarget[7] = 0x0cff;

-   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   mm128_bswap32_80( edata, pdata );

+   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t ntime = swab32( pdata[17] );
   if ( s_ntime != ntime )
   {
      x16rt_getTimeHash( ntime, &timeHash );
-      x16rt_getAlgoString( &timeHash[0], hashOrder );
+      x16rt_getAlgoString( &timeHash[0], x16r_hash_order );
      s_ntime = ntime;
-      s_implemented = true;
      if ( opt_debug && !thr_id )
          applog( LOG_INFO, "hash order: %s time: (%08x) time hash: (%08x)",
-                               hashOrder, ntime, timeHash );
-   }
-   if ( !s_implemented )
-   {
-      applog( LOG_WARNING, "s not implemented");
-      sleep(1);
-      return 0;
+                               x16r_hash_order, ntime, timeHash );
   }
   
-   if ( opt_benchmark )
-      ptarget[7] = 0x0cff;
-
+   x16r_prehash( edata, pdata );
+   
   do
   {
-      be32enc( &endiandata[19], nonce );
-      x16rt_hash( hash32, endiandata );
+      edata[19] = nonce;
+      x16r_hash( hash32, edata );

-      if ( hash32[7] <= Htarg )
-      if (fulltest( hash32, ptarget ) && !opt_benchmark )
+      if ( valid_hash( hash32, ptarget ) && !bench )
      {
-         pdata[19] = nonce;
+         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash32, mythr );
      }
      nonce++;
@@ -237,3 +49,6 @@ int scanhash_x16rt( struct work *work, uint32_t max_nonce,
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
 }
+
+#endif  // !defined(X16R_8WAY) && !defined(X16R_4WAY)
+
--- a/algo/x16/x16rv2.c
+++ b/algo/x16/x16rv2.c
@@ -6,6 +6,8 @@
 */
 #include "x16r-gate.h"

+#if !defined(X16R_8WAY) && !defined(X16R_4WAY)
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -245,3 +247,5 @@ int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
   *hashes_done = pdata[19] - first_nonce + 1;
   return 0;
 }
+
+#endif
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -8,480 +8,43 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "algo/blake/blake-hash-4way.h"
-#include "algo/bmw/bmw-hash-4way.h"
-#include "algo/groestl/aes_ni/hash-groestl.h"
-#include "algo/groestl/aes_ni/hash-groestl.h"
-#include "algo/skein/skein-hash-4way.h"
-#include "algo/jh/jh-hash-4way.h"
-#include "algo/keccak/keccak-hash-4way.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/luffa-hash-2way.h"
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/cubehash/cube-hash-2way.h"
-#include "algo/simd/simd-hash-2way.h"
-#include "algo/echo/aes_ni/hash_api.h"
-#include "algo/hamsi/hamsi-hash-4way.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/shabal-hash-4way.h"
-#include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/sha/sha-hash-4way.h"
 #include "algo/haval/haval-hash-4way.h"
 #include "algo/tiger/sph_tiger.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/lyra2/lyra2.h"
-#if defined(__VAES__)
-  #include "algo/groestl/groestl512-hash-4way.h"
-  #include "algo/shavite/shavite-hash-4way.h"
-  #include "algo/echo/echo-hash-4way.h"
-#endif
 #if defined(__SHA__)
 #include <openssl/sha.h>
 #endif

-#if defined(X21S_8WAY) || defined(X21S_4WAY)
-
-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
-
-#endif
-
 #if defined (X21S_8WAY)

 static __thread uint64_t* x21s_8way_matrix;

 union _x21s_8way_context_overlay
 {
-    blake512_8way_context   blake;
-    bmw512_8way_context     bmw;
-    skein512_8way_context   skein;
-    jh512_8way_context      jh;
-    keccak512_8way_context  keccak;
-    luffa_4way_context      luffa;
-    cubehashParam           cube;
-//    cube_4way_context       cube;
-    simd_4way_context       simd;
-    hamsi512_8way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_8way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_8way_context     sha512;
    haval256_5_8way_context haval;
    sph_tiger_context       tiger;
    sph_gost512_context     gost;
    sha256_8way_context     sha256;
-#if defined(__VAES__)
-    groestl512_4way_context groestl;
-    shavite512_4way_context shavite;
-    echo_4way_context       echo;
-#else
-    hashState_groestl       groestl;
-    sph_shavite512_context  shavite;
-    hashState_echo          echo;
-#endif
 } __attribute__ ((aligned (64)));

 typedef union _x21s_8way_context_overlay x21s_8way_context_overlay;

-static __thread x21s_8way_context_overlay x21s_ctx;
-
 void x21s_8way_hash( void* output, const void* input )
 {
-   uint32_t vhash[20*8] __attribute__ ((aligned (128)));
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
-   uint32_t hash4[20] __attribute__ ((aligned (64)));
-   uint32_t hash5[20] __attribute__ ((aligned (64)));
-   uint32_t hash6[20] __attribute__ ((aligned (64)));
-   uint32_t hash7[20] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*8] __attribute__ ((aligned (128)));
+   uint8_t shash[64*8] __attribute__ ((aligned (64)));
+   uint32_t *hash0 = (uint32_t*)  shash;
+   uint32_t *hash1 = (uint32_t*)( shash+64  ); 
+   uint32_t *hash2 = (uint32_t*)( shash+128 );
+   uint32_t *hash3 = (uint32_t*)( shash+192 );
+   uint32_t *hash4 = (uint32_t*)( shash+256 );
+   uint32_t *hash5 = (uint32_t*)( shash+320 );
+   uint32_t *hash6 = (uint32_t*)( shash+384 );
+   uint32_t *hash7 = (uint32_t*)( shash+448 );
   x21s_8way_context_overlay ctx;
-   memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-   void *in0 = (void*) hash0;
-   void *in1 = (void*) hash1;
-   void *in2 = (void*) hash2;
-   void *in3 = (void*) hash3;
-   void *in4 = (void*) hash4;
-   void *in5 = (void*) hash5;
-   void *in6 = (void*) hash6;
-   void *in7 = (void*) hash7;
-   int size = 80;

-   dintrlv_8x64( hash0, hash1, hash2, hash3, hash4, hash5, hash6, hash7,
-                 input, 640 );
-
-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            if ( i == 0 )
-               blake512_8way_full( &ctx.blake, vhash, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               blake512_8way_full( &ctx.blake, vhash, vhash, size );
-            }
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5,
-                                 hash6, hash7, vhash );
-         break;
-         case BMW:
-            bmw512_8way_init( &ctx.bmw );
-            if ( i == 0 )
-               bmw512_8way_update( &ctx.bmw, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-            bmw512_8way_update( &ctx.bmw, vhash, size );
-            }
-            bmw512_8way_close( &ctx.bmw, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case GROESTL:
-#if defined(__VAES__)
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            groestl512_4way_full( &ctx.groestl, vhash, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            groestl512_4way_full( &ctx.groestl, vhash, vhash, size );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-#else
-            groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash4, (char*)in4, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash5, (char*)in5, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash6, (char*)in6, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash7, (char*)in7, size<<3 );
-#endif
-         break;
-         case JH:
-            if ( i == 0 )
-               jh512_8way_update( &ctx.jh, input + (64<<3), 16 );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               jh512_8way_init( &ctx.jh );
-               jh512_8way_update( &ctx.jh, vhash, size );
-            }
-            jh512_8way_close( &ctx.jh, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case KECCAK:
-            keccak512_8way_init( &ctx.keccak );
-            if ( i == 0 )
-               keccak512_8way_update( &ctx.keccak, input, size );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               keccak512_8way_update( &ctx.keccak, vhash, size );
-            }
-            keccak512_8way_close( &ctx.keccak, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case SKEIN:
-            if ( i == 0 )
-               skein512_8way_update( &ctx.skein, input + (64<<3), 16 );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               skein512_8way_init( &ctx.skein );
-               skein512_8way_update( &ctx.skein, vhash, size );
-            }
-            skein512_8way_close( &ctx.skein, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case LUFFA:
-            if ( i == 0 )
-            {
-                intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-                luffa_4way_update_close( &ctx.luffa, vhash,
-                                                     vhash + (16<<2), 16 );
-                dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-                memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-                intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-                luffa_4way_update_close( &ctx.luffa, vhash,
-                                                     vhash + (16<<2), 16 );
-                dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-            }
-            else
-            {
-               intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-               luffa512_4way_full( &ctx.luffa, vhash, vhash, size );
-               dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-               intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-               luffa512_4way_full( &ctx.luffa, vhash, vhash, size );
-               dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-            }
-         break;
-         case CUBEHASH:
-            if ( i == 0 )
-            {
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
-                                            (const byte*)in0 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
-                                            (const byte*)in1 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
-                                            (const byte*)in2 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
-                                            (const byte*)in3 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash4,
-                                            (const byte*)in4 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash5,
-                                            (const byte*)in5 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash6,
-                                            (const byte*)in6 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash7,
-                                            (const byte*)in7 + 64, 16 );
-            }
-            else
-            {
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
-                                             (const byte*)in0, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
-                                             (const byte*)in1, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
-                                             (const byte*)in2, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
-                                             (const byte*)in3, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash4,
-                                             (const byte*)in4, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash5,
-                                             (const byte*)in5, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash6,
-                                             (const byte*)in6, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash7,
-                                             (const byte*)in7, size );
-            }
-         break;
-         case SHAVITE:
-#if defined(__VAES__)
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            shavite512_4way_init( &ctx.shavite );
-            shavite512_4way_update_close( &ctx.shavite, vhash, vhash, size );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-#else
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in4, size );
-            sph_shavite512_close( &ctx.shavite, hash4 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in5, size );
-            sph_shavite512_close( &ctx.shavite, hash5 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in6, size );
-            sph_shavite512_close( &ctx.shavite, hash6 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in7, size );
-            sph_shavite512_close( &ctx.shavite, hash7 );
-#endif
-         break;
-         case SIMD:
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            simd512_4way_full( &ctx.simd, vhash, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            simd512_4way_full( &ctx.simd, vhash, vhash, size );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-         break;
-         case ECHO:
-#if defined(__VAES__)
-            intrlv_4x128( vhash, in0, in1, in2, in3, size<<3 );
-            echo_4way_full( &ctx.echo, vhash, 512, vhash, size );
-            dintrlv_4x128_512( hash0, hash1, hash2, hash3, vhash );
-            intrlv_4x128( vhash, in4, in5, in6, in7, size<<3 );
-            echo_4way_full( &ctx.echo, vhash, 512, vhash, size );
-            dintrlv_4x128_512( hash4, hash5, hash6, hash7, vhash );
-#else
-            echo_full( &ctx.echo, (BitSequence *)hash0, 512,
-                              (const BitSequence *)in0, size );
-            echo_full( &ctx.echo, (BitSequence *)hash1, 512,
-                              (const BitSequence *)in1, size );
-            echo_full( &ctx.echo, (BitSequence *)hash2, 512,
-                              (const BitSequence *)in2, size );
-            echo_full( &ctx.echo, (BitSequence *)hash3, 512,
-                              (const BitSequence *)in3, size );
-            echo_full( &ctx.echo, (BitSequence *)hash4, 512,
-                              (const BitSequence *)in4, size );
-            echo_full( &ctx.echo, (BitSequence *)hash5, 512,
-                              (const BitSequence *)in5, size );
-            echo_full( &ctx.echo, (BitSequence *)hash6, 512,
-                              (const BitSequence *)in6, size );
-            echo_full( &ctx.echo, (BitSequence *)hash7, 512,
-                              (const BitSequence *)in7, size );
-#endif
-         break;
-         case HAMSI:
-            if ( i == 0 )
-               hamsi512_8way_update( &ctx.hamsi, input + (64<<3), 16 );
-            else
-            {
-               intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                            size<<3 );
-               hamsi512_8way_init( &ctx.hamsi );
-               hamsi512_8way_update( &ctx.hamsi, vhash, size );
-            }
-            hamsi512_8way_close( &ctx.hamsi, vhash );
-            dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in4, size );
-             sph_fugue512_close( &ctx.fugue, hash4 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in5, size );
-             sph_fugue512_close( &ctx.fugue, hash5 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in6, size );
-             sph_fugue512_close( &ctx.fugue, hash6 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in7, size );
-             sph_fugue512_close( &ctx.fugue, hash7 );
-         break;
-         case SHABAL:
-             intrlv_8x32( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                             size<<3 );
-             if ( i == 0 )
-                shabal512_8way_update( &ctx.shabal, vhash + (16<<3), 16 );
-             else
-             {
-                shabal512_8way_init( &ctx.shabal );
-                shabal512_8way_update( &ctx.shabal, vhash, size );
-             }
-             shabal512_8way_close( &ctx.shabal, vhash );
-             dintrlv_8x32_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                          hash7, vhash );
-         break;
-         case WHIRLPOOL:
-            if ( i == 0 )
-            {
-               sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in4 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash4 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in5 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash5 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in6 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash6 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in7 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash7 );
-            }
-            else
-            {
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in0, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in1, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in2, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in3, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in4, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash4 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in5, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash5 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in6, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash6 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in7, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash7 );
-            }
-         break;
-         case SHA_512:
-             sha512_8way_init( &ctx.sha512 );
-             if ( i == 0 )
-                sha512_8way_update( &ctx.sha512, input, size );
-             else
-             {
-                intrlv_8x64( vhash, in0, in1, in2, in3, in4, in5, in6, in7,
-                             size<<3 );
-                sha512_8way_update( &ctx.sha512, vhash, size );
-             }
-             sha512_8way_close( &ctx.sha512, vhash );
-             dintrlv_8x64_512( hash0, hash1, hash2, hash3, hash4, hash5, hash6,
-                               hash7, vhash );
-          break;
-      }
-      size = 64;
-   }
+   x16r_8way_hash_generic( shash, input );

   intrlv_8x32_512( vhash, hash0, hash1, hash2, hash3, hash4, hash5, hash6,
                    hash7 );
@@ -568,8 +131,6 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*8] __attribute__ ((aligned (128)));
   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
-   uint32_t vdata2[20*8] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &hash[7<<3];
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
@@ -588,71 +149,21 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,

   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_INFO, "hash order %s (%08x)", hashOrder, ntime );
-   }
-
-   // Do midstate prehash on hash functions with block size <= 64 bytes.
-   const char elem = hashOrder[0];
-   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-   switch ( algo )
-   {
-      case JH:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         jh512_8way_init( &x21s_ctx.jh );
-         jh512_8way_update( &x21s_ctx.jh, vdata, 64 );
-      break;
-      case SKEIN:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         skein512_8way_init( &x21s_ctx.skein );
-         skein512_8way_update( &x21s_ctx.skein, vdata, 64 );
-      break;
-      case LUFFA:
-         mm128_bswap32_80( edata, pdata );
-         intrlv_4x128( vdata2, edata, edata, edata, edata, 640 );
-         luffa_4way_init( &x21s_ctx.luffa, 512 );
-         luffa_4way_update( &x21s_ctx.luffa, vdata2, 64 );
-         rintrlv_4x128_8x64( vdata, vdata2, vdata2, 640 );
-      break;
-      case CUBEHASH:
-         mm128_bswap32_80( edata, pdata );
-         cubehashInit( &x21s_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x21s_ctx.cube, (const byte*)edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      case HAMSI:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
-         hamsi512_8way_init( &x21s_ctx.hamsi );
-         hamsi512_8way_update( &x21s_ctx.hamsi, vdata, 64 );
-      break;
-      case SHABAL:
-         mm256_bswap32_intrlv80_8x32( vdata2, pdata );
-         shabal512_8way_init( &x21s_ctx.shabal );
-         shabal512_8way_update( &x21s_ctx.shabal, vdata2, 64 );
-         rintrlv_8x32_8x64( vdata, vdata2, 640 );
-      break;
-      case WHIRLPOOL:
-         mm128_bswap32_80( edata, pdata );
-         sph_whirlpool_init( &x21s_ctx.whirlpool );
-         sph_whirlpool( &x21s_ctx.whirlpool, edata, 64 );
-         intrlv_8x64( vdata, edata, edata, edata, edata,
-                             edata, edata, edata, edata, 640 );
-      break;
-      default:
-         mm512_bswap32_intrlv80_8x64( vdata, pdata );
+              applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

+   x16r_8way_prehash( vdata, pdata );
   *noncev = mm512_intrlv_blend_32( _mm512_set_epi32(
                             n+7, 0, n+6, 0, n+5, 0, n+4, 0,
                             n+3, 0, n+2, 0, n+1, 0, n,   0 ), *noncev );
-
- 
   do
   {
      x21s_8way_hash( hash, vdata );
@@ -670,7 +181,7 @@ int scanhash_x21s_8way( struct work *work, uint32_t max_nonce,
      *noncev = _mm512_add_epi32( *noncev,
                                  m512_const1_64( 0x0000000800000000 ) );
      n += 8;
-   } while ( (  n < last_nonce ) && !(*restart) );
+   } while ( likely( ( n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
@@ -692,23 +203,6 @@ static __thread uint64_t* x21s_4way_matrix;

 union _x21s_4way_context_overlay
 {
-    blake512_4way_context   blake;
-    bmw512_4way_context     bmw;
-    hashState_echo          echo;
-    hashState_groestl       groestl;
-    skein512_4way_context   skein;
-    jh512_4way_context      jh;
-    keccak512_4way_context  keccak;
-    luffa_2way_context      luffa;
-    hashState_luffa         luffa1;
-    cubehashParam           cube;
-    sph_shavite512_context  shavite;
-    simd_2way_context       simd;
-    hamsi512_4way_context   hamsi;
-    sph_fugue512_context    fugue;
-    shabal512_4way_context  shabal;
-    sph_whirlpool_context   whirlpool;
-    sha512_4way_context     sha512;
    haval256_5_4way_context haval;
    sph_tiger_context       tiger;
    sph_gost512_context     gost;
@@ -718,282 +212,21 @@ union _x21s_4way_context_overlay
    sha256_4way_context     sha256;
 #endif
 } __attribute__ ((aligned (64)));
-typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;

-static __thread x21s_4way_context_overlay x21s_ctx;
+typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;

 void x21s_4way_hash( void* output, const void* input )
 {
-   uint32_t hash0[20] __attribute__ ((aligned (64)));
-   uint32_t hash1[20] __attribute__ ((aligned (64)));
-   uint32_t hash2[20] __attribute__ ((aligned (64)));
-   uint32_t hash3[20] __attribute__ ((aligned (64)));
-   uint32_t vhash[20*4] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+   uint8_t  shash[64*4] __attribute__ ((aligned (64)));
   x21s_4way_context_overlay ctx;
-   memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-   void *in0 = (void*) hash0;
-   void *in1 = (void*) hash1;
-   void *in2 = (void*) hash2;
-   void *in3 = (void*) hash3;
-   int size = 80;
-
-   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
-
-   // Input data is both 64 bit interleaved (input)
-   // and deinterleaved in inp0-3.
-   // If First function uses 64 bit data it is not required to interleave inp
-   // first. It may use the inerleaved data dmost convenient, ie 4way 64 bit.
-   // All other functions assume data is deinterleaved in hash0-3
-   // All functions must exit with data deinterleaved in hash0-3.
-   // Alias in0-3 points to either inp0-3 or hash0-3 according to
-   // its hashOrder position. Size is also set accordingly.
-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            if ( i == 0 )
-               blake512_4way_full( &ctx.blake, vhash, input, size );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               blake512_4way_full( &ctx.blake, vhash, vhash, size );
-            }
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case BMW:
-            bmw512_4way_init( &ctx.bmw );
-            if ( i == 0 )
-               bmw512_4way_update( &ctx.bmw, input, size );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               bmw512_4way_update( &ctx.bmw, vhash, size );
-            }
-            bmw512_4way_close( &ctx.bmw, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case GROESTL:
-            groestl512_full( &ctx.groestl, (char*)hash0, (char*)in0, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash1, (char*)in1, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash2, (char*)in2, size<<3 );
-            groestl512_full( &ctx.groestl, (char*)hash3, (char*)in3, size<<3 );
-         break;
-         case JH:
-            if ( i == 0 )
-               jh512_4way_update( &ctx.jh, input + (64<<2), 16 );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               jh512_4way_init( &ctx.jh );
-               jh512_4way_update( &ctx.jh, vhash, size );
-            }
-            jh512_4way_close( &ctx.jh, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case KECCAK:
-            keccak512_4way_init( &ctx.keccak );
-            if ( i == 0 )
-               keccak512_4way_update( &ctx.keccak, input, size );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               keccak512_4way_update( &ctx.keccak, vhash, size );
-            }
-            keccak512_4way_close( &ctx.keccak, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case SKEIN:
-            if ( i == 0 )
-               skein512_4way_update( &ctx.skein, input + (64<<2), 16 );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               skein512_4way_init( &ctx.skein );
-               skein512_4way_update( &ctx.skein, vhash, size );
-            }
-            skein512_4way_close( &ctx.skein, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case LUFFA:
-            if ( i == 0 )
-            {
-               update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash0,
-                                    (const BitSequence*)in0 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash1,
-                                    (const BitSequence*)in1 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );  
-               update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash2,
-                                    (const BitSequence*)in2 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );  
-               update_and_final_luffa( &ctx.luffa1, (BitSequence*)hash3,
-                                    (const BitSequence*)in3 + 64, 16 );
-            }
-            else
-            {
-               intrlv_2x128( vhash, in0, in1, size<<3 );
-               luffa512_2way_full( &ctx.luffa, vhash, vhash, size );
-               dintrlv_2x128_512( hash0, hash1, vhash );
-               intrlv_2x128( vhash, in2, in3, size<<3 );
-               luffa512_2way_full( &ctx.luffa, vhash, vhash, size );
-               dintrlv_2x128_512( hash2, hash3, vhash );
-            }
-         break;
-         case CUBEHASH:
-            if ( i == 0 )
-            {
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
-                                          (const byte*)in0 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
-                                          (const byte*)in1 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
-                                          (const byte*)in2 + 64, 16 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
-                                          (const byte*)in3 + 64, 16 );
-
-            }
-            else
-            {   
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
-                                          (const byte*)in0, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
-                                     (const byte*)in1, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
-                                     (const byte*)in2, size );
-               cubehashInit( &ctx.cube, 512, 16, 32 );
-               cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
-                                     (const byte*)in3, size );
-            }
-         break;
-         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in0, size );
-            sph_shavite512_close( &ctx.shavite, hash0 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in1, size );
-            sph_shavite512_close( &ctx.shavite, hash1 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in2, size );
-            sph_shavite512_close( &ctx.shavite, hash2 );
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in3, size );
-            sph_shavite512_close( &ctx.shavite, hash3 );
-         break;
-         case SIMD:
-            intrlv_2x128( vhash, in0, in1, size<<3 );
-            simd_2way_init( &ctx.simd, 512 );
-            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash0, hash1, vhash, 512 );
-            intrlv_2x128( vhash, in2, in3, size<<3 );
-            simd_2way_init( &ctx.simd, 512 );
-            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
-            dintrlv_2x128( hash2, hash3, vhash, 512 );
-         break;
-         case ECHO:
-            echo_full( &ctx.echo, (BitSequence *)hash0, 512,
-                              (const BitSequence *)in0, size );
-            echo_full( &ctx.echo, (BitSequence *)hash1, 512,
-                              (const BitSequence *)in1, size );
-            echo_full( &ctx.echo, (BitSequence *)hash2, 512,
-                              (const BitSequence *)in2, size );
-            echo_full( &ctx.echo, (BitSequence *)hash3, 512,
-                              (const BitSequence *)in3, size );
-         break;
-         case HAMSI:
-            if ( i == 0 )
-               hamsi512_4way_update( &ctx.hamsi, input + (64<<2), 16 );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               hamsi512_4way_init( &ctx.hamsi );
-               hamsi512_4way_update( &ctx.hamsi, vhash, size );
-            }
-            hamsi512_4way_close( &ctx.hamsi, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in0, size );
-             sph_fugue512_close( &ctx.fugue, hash0 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in1, size );
-             sph_fugue512_close( &ctx.fugue, hash1 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in2, size );
-             sph_fugue512_close( &ctx.fugue, hash2 );
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in3, size );
-             sph_fugue512_close( &ctx.fugue, hash3 );
-         break;
-         case SHABAL:
-            intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
-            if ( i == 0 )
-               shabal512_4way_update( &ctx.shabal, vhash + (16<<2), 16 );
-            else
-            {
-               shabal512_4way_init( &ctx.shabal );
-               shabal512_4way_update( &ctx.shabal, vhash, size );
-            }
-            shabal512_4way_close( &ctx.shabal, vhash );
-            dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-         case WHIRLPOOL:
-            if ( i == 0 )
-            {
-               sph_whirlpool( &ctx.whirlpool, in0 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in1 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in2 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               memcpy( &ctx, &x21s_ctx, sizeof(ctx) );
-               sph_whirlpool( &ctx.whirlpool, in3 + 64, 16 );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-            }
-            else
-            {
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in0, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash0 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in1, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash1 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in2, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash2 );
-               sph_whirlpool_init( &ctx.whirlpool );
-               sph_whirlpool( &ctx.whirlpool, in3, size );
-               sph_whirlpool_close( &ctx.whirlpool, hash3 );
-            }
-         break;
-         case SHA_512:
-            sha512_4way_init( &ctx.sha512 );
-            if ( i == 0 )
-               sha512_4way_update( &ctx.sha512, input, size );
-            else
-            {
-               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
-               sha512_4way_update( &ctx.sha512, vhash, size );
-            }
-            sha512_4way_close( &ctx.sha512, vhash );
-            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-         break;
-      }
-      size = 64;
-   }
+   uint32_t *hash0 = (uint32_t*)  shash;
+   uint32_t *hash1 = (uint32_t*)( shash+64  );
+   uint32_t *hash2 = (uint32_t*)( shash+128 );
+   uint32_t *hash3 = (uint32_t*)( shash+192 );

+   x16r_4way_hash_generic( shash, input );
+   
   intrlv_4x32( vhash, hash0, hash1, hash2, hash3,  512 );

   haval256_5_4way_init( &ctx.haval );
@@ -1073,8 +306,6 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[16*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t vdata32[20*4] __attribute__ ((aligned (64)));
-   uint32_t edata[20] __attribute__ ((aligned (64)));
   uint32_t bedata1[2] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -1090,66 +321,20 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
 
   bedata1[0] = bswap_32( pdata[1] );
   bedata1[1] = bswap_32( pdata[2] );
+
+   static __thread uint32_t s_ntime = UINT32_MAX;
   uint32_t ntime = bswap_32( pdata[17] );
   if ( s_ntime != ntime )
   {
-      x16_r_s_getAlgoString( (const uint8_t*)bedata1, hashOrder );
+      x16_r_s_getAlgoString( (const uint8_t*)bedata1, x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
-   }
-   
-   const char elem = hashOrder[0];
-   const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-   switch ( algo )
-   {
-      case JH:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         jh512_4way_init( &x21s_ctx.jh );
-         jh512_4way_update( &x21s_ctx.jh, vdata, 64 );
-      break;
-      case SKEIN:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         skein512_4way_init( &x21s_ctx.skein );
-         skein512_4way_update( &x21s_ctx.skein, vdata, 64 );
-      break;
-      case LUFFA:
-         mm128_bswap32_80( edata, pdata );
-         init_luffa( &x21s_ctx.luffa1, 512 );
-         update_luffa( &x21s_ctx.luffa1, (const BitSequence*)edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-      break;
-      case CUBEHASH:
-         mm128_bswap32_80( edata, pdata );
-         cubehashInit( &x21s_ctx.cube, 512, 16, 32 );
-         cubehashUpdate( &x21s_ctx.cube, (const byte*)edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-      break;
-      case HAMSI:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
-         hamsi512_4way_init( &x21s_ctx.hamsi );
-         hamsi512_4way_update( &x21s_ctx.hamsi, vdata, 64 );
-      break;
-      case SHABAL:
-         mm128_bswap32_intrlv80_4x32( vdata32, pdata );
-         shabal512_4way_init( &x21s_ctx.shabal );
-         shabal512_4way_update( &x21s_ctx.shabal, vdata32, 64 );
-         rintrlv_4x32_4x64( vdata, vdata32, 640 );
-      break;
-      case WHIRLPOOL:
-         mm128_bswap32_80( edata, pdata );
-         sph_whirlpool_init( &x21s_ctx.whirlpool );
-         sph_whirlpool( &x21s_ctx.whirlpool, edata, 64 );
-         intrlv_4x64( vdata, edata, edata, edata, edata, 640 );
-      break;
-      default:
-         mm256_bswap32_intrlv80_4x64( vdata, pdata );
+              applog( LOG_DEBUG, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

+   x16r_4way_prehash( vdata, pdata );
   *noncev = mm256_intrlv_blend_32(
                   _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ), *noncev );
-
   do
   {
      x21s_4way_hash( hash, vdata );
@@ -1162,7 +347,7 @@ int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
      *noncev = _mm256_add_epi32( *noncev,
                                  m256_const1_64( 0x0000000400000000 ) );
      n += 4;
-   } while ( (  n < last_nonce ) && !(*restart) );
+   } while ( likely( (  n < last_nonce ) && !(*restart) ) );
   pdata[19] = n;
   *hashes_done = n - first_nonce;
   return 0;
--- a/algo/x16/x21s.c
+++ b/algo/x16/x21s.c
@@ -5,63 +5,21 @@
 * Optimized by JayDDee@github Jan 2018
 */
 #include "x16r-gate.h"
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "algo/blake/sph_blake.h"
-#include "algo/bmw/sph_bmw.h"
-#include "algo/groestl/sph_groestl.h"
-#include "algo/jh/sph_jh.h"
-#include "algo/keccak/sph_keccak.h"
-#include "algo/skein/sph_skein.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/cubehash_sse2.h"
-#include "algo/simd/nist.h"
-#include "algo/echo/sph_echo.h"
-#include "algo/hamsi/sph_hamsi.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/whirlpool/sph_whirlpool.h"
 #include <openssl/sha.h>
-#if defined(__AES__)
-  #include "algo/echo/aes_ni/hash_api.h"
-  #include "algo/groestl/aes_ni/hash-groestl.h"
-#endif
 #include "algo/haval/sph-haval.h"
 #include "algo/tiger/sph_tiger.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/lyra2/lyra2.h"

-static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+#if !defined(X16R_8WAY) && !defined(X16R_4WAY)

 static __thread uint64_t* x21s_matrix;

 union _x21s_context_overlay
 {
-#if defined(__AES__)
-        hashState_echo          echo;
-        hashState_groestl       groestl;
-#else
-        sph_groestl512_context   groestl;
-        sph_echo512_context      echo;
-#endif
-        sph_blake512_context    blake;
-        sph_bmw512_context      bmw;
-        sph_skein512_context    skein;
-        sph_jh512_context       jh;
-        sph_keccak512_context   keccak;
-        hashState_luffa         luffa;
-        cubehashParam           cube;
-        sph_shavite512_context  shavite;
-        hashState_sd            simd;
-        sph_hamsi512_context    hamsi;
-        sph_fugue512_context    fugue;
-        sph_shabal512_context   shabal;
-        sph_whirlpool_context   whirlpool;
-        SHA512_CTX              sha512;
        sph_haval256_5_context  haval;
        sph_tiger_context       tiger;
        sph_gost512_context     gost;
@@ -73,112 +31,8 @@ void x21s_hash( void* output, const void* input )
 {
   uint32_t _ALIGN(128) hash[16];
   x21s_context_overlay ctx;
-   void *in = (void*) input;
-   int size = 80;

-   for ( int i = 0; i < 16; i++ )
-   {
-      const char elem = hashOrder[i];
-      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
-
-      switch ( algo )
-      {
-         case BLAKE:
-            sph_blake512_init( &ctx.blake );
-            sph_blake512( &ctx.blake, in, size );
-            sph_blake512_close( &ctx.blake, hash );
-         break;
-         case BMW:
-            sph_bmw512_init( &ctx.bmw );
-            sph_bmw512(&ctx.bmw, in, size);
-            sph_bmw512_close(&ctx.bmw, hash);
-         break;
-         case GROESTL:
-#if defined(__AES__)
-            init_groestl( &ctx.groestl, 64 );
-            update_and_final_groestl( &ctx.groestl, (char*)hash,
-                                      (const char*)in, size<<3 );
-#else
-            sph_groestl512_init( &ctx.groestl );
-            sph_groestl512( &ctx.groestl, in, size );
-            sph_groestl512_close(&ctx.groestl, hash);
-#endif
-         break;
-         case SKEIN:
-            sph_skein512_init( &ctx.skein );
-            sph_skein512( &ctx.skein, in, size );
-            sph_skein512_close( &ctx.skein, hash );
-         break;
-         case JH:
-            sph_jh512_init( &ctx.jh );
-            sph_jh512(&ctx.jh, in, size );
-            sph_jh512_close(&ctx.jh, hash );
-         break;
-         case KECCAK:
-            sph_keccak512_init( &ctx.keccak );
-            sph_keccak512( &ctx.keccak, in, size );
-            sph_keccak512_close( &ctx.keccak, hash );
-         break;
-         case LUFFA:
-            init_luffa( &ctx.luffa, 512 );
-            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
-                                    (const BitSequence*)in, size );
-         break;
-         case CUBEHASH:
-            cubehashInit( &ctx.cube, 512, 16, 32 );
-            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
-                                  (const byte*)in, size );
-         break;
-         case SHAVITE:
-            sph_shavite512_init( &ctx.shavite );
-            sph_shavite512( &ctx.shavite, in, size );
-            sph_shavite512_close( &ctx.shavite, hash );
-         break;
-         case SIMD:
-             init_sd( &ctx.simd, 512 );
-             update_final_sd( &ctx.simd, (BitSequence *)hash,
-                              (const BitSequence*)in, size<<3 );
-         break;
-         case ECHO:
-#if defined(__AES__)
-             init_echo( &ctx.echo, 512 );
-             update_final_echo ( &ctx.echo, (BitSequence *)hash,
-                                (const BitSequence*)in, size<<3 );
-#else
-             sph_echo512_init( &ctx.echo );
-             sph_echo512( &ctx.echo, in, size );
-             sph_echo512_close( &ctx.echo, hash );
-#endif
-         break;
-         case HAMSI:
-             sph_hamsi512_init( &ctx.hamsi );
-             sph_hamsi512( &ctx.hamsi, in, size );
-             sph_hamsi512_close( &ctx.hamsi, hash );
-         break;
-         case FUGUE:
-             sph_fugue512_init( &ctx.fugue );
-             sph_fugue512( &ctx.fugue, in, size );
-             sph_fugue512_close( &ctx.fugue, hash );
-         break;
-         case SHABAL:
-             sph_shabal512_init( &ctx.shabal );
-             sph_shabal512( &ctx.shabal, in, size );
-             sph_shabal512_close( &ctx.shabal, hash );
-         break;
-         case WHIRLPOOL:
-             sph_whirlpool_init( &ctx.whirlpool );
-             sph_whirlpool( &ctx.whirlpool, in, size );
-             sph_whirlpool_close( &ctx.whirlpool, hash );
-         break;
-         case SHA_512:
-             SHA512_Init( &ctx.sha512 );
-             SHA512_Update( &ctx.sha512, in, size );
-             SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
-         break;
-      }
-      in = (void*) hash;
-      size = 64;
-   }
+   x16r_hash_generic( hash, input );

   sph_haval256_5_init( &ctx.haval );
   sph_haval256_5( &ctx.haval, (const void*) hash, 64) ;
@@ -206,42 +60,38 @@ int scanhash_x21s( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t _ALIGN(128) hash32[8];
-   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t _ALIGN(128) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
-   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;
   uint32_t nonce = first_nonce;
   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   const bool bench = opt_benchmark;
+   if ( bench )  ptarget[7] = 0x0cff;

-   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   mm128_bswap32_80( edata, pdata );

+   static __thread uint32_t s_ntime = UINT32_MAX;
   if ( s_ntime != pdata[17] )
   {
      uint32_t ntime = swab32(pdata[17]);
-      x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      x16_r_s_getAlgoString( (const uint8_t*)(&edata[1]), x16r_hash_order );
      s_ntime = ntime;
      if ( opt_debug && !thr_id )
-              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+          applog( LOG_INFO, "hash order %s (%08x)", x16r_hash_order, ntime );
   }

-   if ( opt_benchmark )
-      ptarget[7] = 0x0cff;
+   x16r_prehash( edata, pdata );

   do
   {
-      be32enc( &endiandata[19], nonce );
-      x21s_hash( hash32, endiandata );
+      edata[19] = nonce;
+      x21s_hash( hash32, edata );

-      if ( hash32[7] <= Htarg )
-      if (fulltest( hash32, ptarget ) && !opt_benchmark )
+      if ( unlikely( valid_hash( hash32, ptarget ) && !bench ) )
      {
-         pdata[19] = nonce;
+         pdata[19] = bswap_32( nonce );
         submit_solution( work, hash32, mythr );
      }
      nonce++;
@@ -261,3 +111,4 @@ bool x21s_thread_init()
   return x21s_matrix;
 }

+#endif
--- a/algo/x17/sonoa.c
+++ b/algo/x17/sonoa.c
@@ -1,4 +1,7 @@
 #include "sonoa-gate.h"
+
+#if !defined(SONOA_8WAY) && !defined(SONOA_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -616,3 +619,5 @@ int scanhash_sonoa( struct work *work, uint32_t max_nonce,
   pdata[19] = n;
   return 0;
 }
+
+#endif
--- a/algo/x17/x17.c
+++ b/algo/x17/x17.c
@@ -1,4 +1,7 @@
 #include "x17-gate.h"
+
+#if !defined(X17_8WAY) && !defined(X17_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,9 +12,6 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/shavite/sph_shavite.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/simd/sph_simd.h"
 #include "algo/hamsi/sph_hamsi.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/sph_shabal.h"
@@ -148,30 +148,32 @@ void x17_hash(void *output, const void *input)
 int scanhash_x17( struct work *work, uint32_t max_nonce,
 	          uint64_t *hashes_done, struct thr_info *mythr)
 {
-   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t edata[20] __attribute__((aligned(64)));
   uint32_t hash64[8] __attribute__((aligned(64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19] - 1;
   const uint32_t first_nonce = pdata[19];
-   int thr_id = mythr->id;  // thr_id arg is deprecated
-
-   // we need bigendian data...
-   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
-   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
-   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
-   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
-   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+   const int thr_id = mythr->id;
+   const bool bench = opt_benchmark;

+   mm128_bswap32_80( edata, pdata );
+   
   do
   {
-      pdata[19] = ++n;
-      be32enc( &endiandata[19], n );
-      x17_hash( hash64, endiandata );
-      if unlikely( valid_hash( hash64, ptarget ) && !opt_benchmark )
-             submit_solution( work, hash64, mythr );
+      edata[19] = n;
+      x17_hash( hash64, edata );
+      if ( unlikely( valid_hash( hash64, ptarget ) && !bench ) )
+      {
+         pdata[19] = bswap_32( n );
+         submit_solution( work, hash64, mythr );
+      }
+      n++;
   } while ( n < max_nonce && !work_restart[thr_id].restart);
   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
   return 0;
 }
+
+#endif
+
--- a/algo/x17/xevan.c
+++ b/algo/x17/xevan.c
@@ -1,5 +1,7 @@
 #include "xevan-gate.h"

+#if !defined(XEVAN_8WAY) && !defined(XEVAN_4WAY)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -268,3 +270,4 @@ int scanhash_xevan( struct work *work, uint32_t max_nonce,
 	return 0;
 }

+#endif
--- a/algo/x22/x22i.c
+++ b/algo/x22/x22i.c
@@ -1,3 +1,7 @@
+#include "x22i-gate.h"
+
+#if !( defined(X22I_8WAY) || defined(X22I_4WAY) )
+
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__)
@@ -24,7 +28,6 @@
 #include "algo/lyra2/lyra2.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/swifftx/swifftx.h"
-#include "x22i-gate.h"

 union _x22i_context_overlay
 {
@@ -200,3 +203,4 @@ int scanhash_x22i( struct work* work, uint32_t max_nonce,
 	 return 0;
 }

+#endif
--- a/algo/x22/x25x.c
+++ b/algo/x22/x25x.c
@@ -1,4 +1,7 @@
 #include "x22i-gate.h"
+
+#if !( defined(X25X_8WAY) || defined(X25X_4WAY) )
+
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #if defined(__AES__)
@@ -201,7 +204,7 @@ void x25x_hash( void *output, const void *input )
 int scanhash_x25x( struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr )
 {
-   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t edata[20] __attribute__((aligned(64)));
   uint32_t hash[8] __attribute__((aligned(64)));
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
@@ -213,17 +216,19 @@ int scanhash_x25x( struct work* work, uint32_t max_nonce,
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x08ff;

+   mm128_bswap32_80( edata, pdata );
+   
 	for (int k=0; k < 20; k++)
-		be32enc(&endiandata[k], pdata[k]);
+		be32enc(&edata[k], pdata[k]);

   InitializeSWIFFTX();

   do
   {
       pdata[19] = ++n;
-       be32enc( &endiandata[19], n );
+       be32enc( &edata[19], n );

-       x25x_hash( hash, endiandata );
+       x25x_hash( hash, edata );

       if ( hash[7] < Htarg )
       if ( fulltest( hash, ptarget ) && !opt_benchmark )
@@ -234,3 +239,4 @@ int scanhash_x25x( struct work* work, uint32_t max_nonce,
 	 return 0;
 }

+#endif