diff --git a/Makefile.am b/Makefile.am index b9c7907..cbd8de4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -29,6 +29,7 @@ cpuminer_SOURCES = \ algo/shavite/shavite.c \ algo/echo/sph_echo.c \ algo/blake/sph_blake.c \ + algo/blake/sph_blake2b.c \ algo/heavy/sph_hefty1.c \ algo/blake/mod_blakecoin.c \ algo/luffa/sph_luffa.c \ @@ -62,7 +63,8 @@ cpuminer_SOURCES = \ algo/argon2/ar2/blake2b.c \ algo/axiom.c \ algo/blake/blake.c \ - algo/blake/blake2.c \ + algo/blake/blake2b.c \ + algo/blake/blake2s.c \ algo/blake/blakecoin.c \ algo/blake/decred.c \ algo/blake/pentablake.c \ @@ -112,6 +114,7 @@ cpuminer_SOURCES = \ algo/pluck.c \ algo/quark/quark.c \ algo/qubit/qubit.c \ + algo/qubit/deep.c \ algo/ripemd/sph_ripemd.c \ algo/scrypt.c \ algo/scryptjane/scrypt-jane.c \ diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 9c9713a..5ea6fd5 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -2,6 +2,14 @@ Compile instruction for Linux and Windows are at the bottom of this file. Change Log ---------- + +V3.5.9 + +Reduced stack usage for hmq1725 and small speedup. +Added Deep algo optimized for AES and AVX2 +Rewrite of SSE2 Luffa, midstate now supported in deep, qubit & timetravel +Small changes to algo-gate. + v3.5.8 Lyra2RE fixed on Windows, broken in v3.5.6. diff --git a/algo-gate-api.c b/algo-gate-api.c index 2bcca55..0901e3e 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -123,7 +123,8 @@ void init_algo_gate( algo_gate_t* gate ) gate->build_extraheader = (void*)&std_build_extraheader; gate->set_work_data_endian = (void*)&do_nothing; gate->calc_network_diff = (void*)&std_calc_network_diff; - gate->prevent_dupes = (void*)&return_false; +// gate->prevent_dupes = (void*)&return_false; + gate->ready_to_mine = (void*)&std_ready_to_mine; gate->resync_threads = (void*)&do_nothing; gate->do_this_thread = (void*)&return_true; gate->longpoll_rpc_call = (void*)&std_longpoll_rpc_call; @@ -159,11 +160,13 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_BASTION: register_bastion_algo ( gate ); break; case ALGO_BLAKE: register_blake_algo ( gate ); break; case ALGO_BLAKECOIN: register_blakecoin_algo ( gate ); break; +// case ALGO_BLAKE2B: register_blake2b_algo ( gate ); break; case ALGO_BLAKE2S: register_blake2s_algo ( gate ); break; case ALGO_C11: register_c11_algo ( gate ); break; case ALGO_CRYPTOLIGHT: register_cryptolight_algo( gate ); break; case ALGO_CRYPTONIGHT: register_cryptonight_algo( gate ); break; case ALGO_DECRED: register_decred_algo ( gate ); break; + case ALGO_DEEP: register_deep_algo ( gate ); break; case ALGO_DROP: register_drop_algo ( gate ); break; case ALGO_FRESH: register_fresh_algo ( gate ); break; case ALGO_GROESTL: register_groestl_algo ( gate ); break; @@ -267,6 +270,7 @@ const char* const algo_alias_map[][2] = // alias proper { "blake256r8", "blakecoin" }, { "blake256r8vnl", "vanilla" }, + { "sia", "blake2b" }, { "blake256r14", "blake" }, { "cryptonote", "cryptonight" }, { "cryptonight-light", "cryptolight" }, diff --git a/algo-gate-api.h b/algo-gate-api.h index e8a3fbd..316cd1e 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -105,7 +105,6 @@ inline bool set_excl ( set_t a, set_t b ) { return (a & b) == 0; } typedef struct { -//migrate to use work instead of pdata & ptarget, see decred for example. 
// mandatory functions, must be overwritten int ( *scanhash ) ( int, struct work*, uint32_t, uint64_t* ); @@ -127,11 +126,12 @@ bool ( *work_decode ) ( const json_t*, struct work* ); void ( *set_target) ( struct work*, double ); bool ( *submit_getwork_result ) ( CURL*, struct work* ); void ( *gen_merkle_root ) ( char*, struct stratum_ctx* ); +void ( *build_extraheader ) ( struct work*, struct stratum_ctx* ); void ( *build_stratum_request ) ( char*, struct work*, struct stratum_ctx* ); void ( *set_work_data_endian ) ( struct work* ); double ( *calc_network_diff ) ( struct work* ); -void ( *build_extraheader ) ( struct work*, struct stratum_ctx* ); -bool ( *prevent_dupes ) ( struct work*, struct stratum_ctx*, int ); +//bool ( *prevent_dupes ) ( struct work*, struct stratum_ctx*, int ); +bool ( *ready_to_mine ) ( struct work*, struct stratum_ctx*, int ); void ( *resync_threads ) ( struct work* ); bool ( *do_this_thread ) ( int ); json_t* (*longpoll_rpc_call) ( CURL*, int*, char* ); @@ -165,8 +165,8 @@ void algo_not_implemented(); // conventions results in different behaviour for pointers with different // target sizes requiring customized casting to make it work consistently. // Rant mode: yet another thing I hate about c/c++. Array indexes should -// be scaled, pointer offsets should always be bytes. Noconfusion and no hidden -// math. +// be scaled, pointer offsets should always be bytes. No confusion and no +// hidden math. #define STD_NTIME_INDEX 17 #define STD_NBITS_INDEX 18 @@ -227,7 +227,7 @@ void std_le_build_stratum_request( char *req, struct work *work ); void std_be_build_stratum_request( char *req, struct work *work ); void jr2_build_stratum_request ( char *req, struct work *work ); -// default is do_nothing; +// set_work_data_endian target, default is do_nothing; void swab_work_data( struct work *work ); double std_calc_network_diff( struct work *work ); @@ -240,6 +240,9 @@ json_t* jr2_longpoll_rpc_call( CURL *curl, int *err ); bool std_stratum_handle_response( json_t *val ); bool jr2_stratum_handle_response( json_t *val ); +bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum, + int thr_id ); + // Gate admin functions // Called from main to initialize all gate functions and algo-specific data diff --git a/algo/blake/b/sia-rpc.cpp b/algo/blake/b/sia-rpc.cpp new file mode 100644 index 0000000..5eafe9e --- /dev/null +++ b/algo/blake/b/sia-rpc.cpp @@ -0,0 +1,203 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sia-rpc.h" + +static bool sia_debug_diff = false; + +extern int share_result(int result, int pooln, double sharediff, const char *reason); + +/* compute nbits to get the network diff */ +static void calc_network_diff(struct work *work) +{ + uint32_t nbits = work->data[11]; // unsure if correct + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 + + uint64_t diffone = 0x0000FFFF00000000ull; + double d = (double)0x0000ffff / (double)bits; + + for (int m=shift; m < 29; m++) d *= 256.0; + for (int m=29; m < shift; m++) d /= 256.0; + if (sia_debug_diff) + applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + + net_diff = d; +} + +// ---- SIA LONGPOLL -------------------------------------------------------------------------------- + +struct data_buffer { + void *buf; + size_t len; +}; + +static size_t sia_data_cb(const void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct data_buffer *db = 
(struct data_buffer *)user_data; + size_t len = size * nmemb; + size_t oldlen, newlen; + void *newmem; + static const uchar zero = 0; + + oldlen = db->len; + newlen = oldlen + len; + + newmem = realloc(db->buf, newlen + 1); + if (!newmem) + return 0; + + db->buf = newmem; + db->len = newlen; + memcpy((char*)db->buf + oldlen, ptr, len); + memcpy((char*)db->buf + newlen, &zero, 1); /* null terminate */ + + return len; +} + +char* sia_getheader(CURL *curl, struct pool_infos *pool) +{ + char curl_err_str[CURL_ERROR_SIZE] = { 0 }; + struct data_buffer all_data = { 0 }; + struct curl_slist *headers = NULL; + char data[256] = { 0 }; + char url[512]; + + // nanopool + snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll + pool->url, pool->user, pool->pass); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_POST, 0); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, opt_timeout); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + + headers = curl_slist_append(headers, "Accept: application/octet-stream"); + headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr + headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); // required for now +// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); +// headers = curl_slist_append(headers, "X-Mining-Extensions: longpoll"); + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + int rc = curl_easy_perform(curl); + if (rc && strlen(curl_err_str)) { + applog(LOG_WARNING, "%s", curl_err_str); + } + + if (all_data.len >= 112) + cbin2hex(data, (const char*) all_data.buf, 112); + if (opt_protocol || all_data.len != 112) + applog(LOG_DEBUG, "received %d bytes: %s", (int) all_data.len, data); + + curl_slist_free_all(headers); + + return rc == 0 && all_data.len ? 
strdup(data) : NULL; +} + +bool sia_work_decode(const char *hexdata, struct work *work) +{ + uint8_t target[32]; + if (!work) return false; + + hex2bin((uchar*)target, &hexdata[0], 32); + swab256(work->target, target); + work->targetdiff = target_to_diff(work->target); + + hex2bin((uchar*)work->data, &hexdata[64], 80); + // high 16 bits of the 64 bits nonce + work->data[9] = rand() << 16; + + // use work ntime as job id + cbin2hex(work->job_id, (const char*)&work->data[10], 4); + calc_network_diff(work); + + if (stratum_diff != work->targetdiff) { + stratum_diff = work->targetdiff; + applog(LOG_WARNING, "Pool diff set to %g", stratum_diff); + } + + return true; +} + +bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work) +{ + char curl_err_str[CURL_ERROR_SIZE] = { 0 }; + struct data_buffer all_data = { 0 }; + struct curl_slist *headers = NULL; + char buf[256] = { 0 }; + char url[512]; + + if (opt_protocol) + applog_hex(work->data, 80); + //applog_hex(&work->data[8], 16); + //applog_hex(&work->data[10], 4); + + // nanopool + snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", + pool->url, pool->user, pool->pass); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 0); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10); + + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, sia_data_cb); + + memcpy(buf, work->data, 80); + curl_easy_setopt(curl, CURLOPT_POST, 1); + curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, 80); + curl_easy_setopt(curl, CURLOPT_POSTFIELDS, (void*) buf); + +// headers = curl_slist_append(headers, "Content-Type: application/octet-stream"); +// headers = curl_slist_append(headers, "Content-Length: 80"); + headers = curl_slist_append(headers, "Accept:"); // disable Accept hdr + headers = curl_slist_append(headers, "Expect:"); // disable Expect hdr + headers = curl_slist_append(headers, "User-Agent: Sia-Agent"); +// headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + int res = curl_easy_perform(curl) == 0; + long errcode; + CURLcode c = curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &errcode); + if (errcode != 204) { + if (strlen(curl_err_str)) + applog(LOG_ERR, "submit err %ld %s", errcode, curl_err_str); + res = 0; + } + share_result(res, work->pooln, work->sharediff[0], res ? 
NULL : (char*) all_data.buf); + + curl_slist_free_all(headers); + return true; +} + +// ---- END SIA LONGPOLL ---------------------------------------------------------------------------- diff --git a/algo/blake/b/sia-rpc.h b/algo/blake/b/sia-rpc.h new file mode 100644 index 0000000..0b427e4 --- /dev/null +++ b/algo/blake/b/sia-rpc.h @@ -0,0 +1,6 @@ +#include + +char* sia_getheader(CURL *curl, struct pool_infos *pool); +bool sia_work_decode(const char *hexdata, struct work *work); +bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work); + diff --git a/algo/blake/blake2b.c b/algo/blake/blake2b.c new file mode 100644 index 0000000..5dd0d1e --- /dev/null +++ b/algo/blake/blake2b.c @@ -0,0 +1,228 @@ +/** + * Blake2-B Implementation + * tpruvot@github 2015-2016 + */ + +#include "miner.h" +#include "algo-gate-api.h" +#include +#include + +#include "algo/blake/sph_blake2b.h" + +static __thread sph_blake2b_ctx s_midstate; +static __thread sph_blake2b_ctx s_ctx; +#define MIDLEN 76 +#define A 64 + +void blake2b_hash(void *output, const void *input) +{ + uint8_t _ALIGN(A) hash[32]; + sph_blake2b_ctx ctx; + + sph_blake2b_init(&ctx, 32, NULL, 0); + sph_blake2b_update(&ctx, input, 80); + sph_blake2b_final(&ctx, hash); + + memcpy(output, hash, 32); +} + +static void blake2b_hash_end(uint32_t *output, const uint32_t *input) +{ + s_ctx.outlen = MIDLEN; + memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN); + sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN); + sph_blake2b_final(&s_ctx, (uint8_t*) output); +} + +int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t _ALIGN(A) vhashcpu[8]; + uint32_t _ALIGN(A) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t Htarg = ptarget[7]; +// const uint32_t first_nonce = pdata[19]; + const uint32_t first_nonce = pdata[8]; + + uint32_t n = first_nonce; + + for (int i=0; i < 19; i++) { + be32enc(&endiandata[i], pdata[i]); + } + + // midstate (untested yet) + //blake2b_init(&s_midstate, 32, NULL, 0); + //blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN); + //memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx)); + + do { +// be32enc(&endiandata[19], n); + be32enc(&endiandata[8], n); + //blake2b_hash_end(vhashcpu, endiandata); + blake2b_hash(vhashcpu, endiandata); + + if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) { + work_set_target_ratio(work, vhashcpu); + *hashes_done = n - first_nonce + 1; +// pdata[19] = n; + pdata[8] = n; + return 1; + } + n++; + + } while (n < max_nonce && !work_restart[thr_id].restart); + *hashes_done = n - first_nonce + 1; +// pdata[19] = n; + pdata[8] = n; + + return 0; +} + +static inline void swab256(void *dest_p, const void *src_p) +{ + uint32_t *dest = (uint32_t *)dest_p; + const uint32_t *src = (uint32_t *)src_p; + + dest[0] = swab32(src[7]); + dest[1] = swab32(src[6]); + dest[2] = swab32(src[5]); + dest[3] = swab32(src[4]); + dest[4] = swab32(src[3]); + dest[5] = swab32(src[2]); + dest[6] = swab32(src[1]); + dest[7] = swab32(src[0]); +} + +/* compute nbits to get the network diff */ +void blake2b_calc_network_diff(struct work *work) +{ + // sample for diff 43.281 : 1c05ea29 + uint32_t nbits = work->data[11]; // unsure if correct + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 + + double d = (double)0x0000ffff / (double)bits; + for (int m=shift; m < 29; m++) d *= 256.0; + for (int m=29; m < shift; m++) d /= 256.0; + if (opt_debug_diff) + 
applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits); + net_diff = d; +} + +void blake2b_be_build_stratum_request( char *req, struct work *work ) +{ + unsigned char *xnonce2str; + uint32_t ntime, nonce; + char ntimestr[9], noncestr[9]; + be32enc( &ntime, work->data[ algo_gate.ntime_index ] ); + be32enc( &nonce, work->data[ algo_gate.nonce_index ] ); + bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) ); + bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) ); + uint16_t high_nonce = swab32(work->data[9]) >> 16; + xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2); + snprintf( req, JSON_BUF_LEN, + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", + rpc_user, work->job_id, xnonce2str, ntimestr, noncestr ); + free( xnonce2str ); +} + +// merkle root handled here, no need for gen_merkle_root gate target +void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) +{ + uchar merkle_root[64] = { 0 }; + uint32_t extraheader[32] = { 0 }; + int headersize = 0; + size_t t; + int i; + + // merkle root + memcpy( merkle_root, sctx->job.coinbase, 32 ); + headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) ); + memcpy( extraheader, &sctx->job.coinbase[32], headersize ); + // Increment extranonce2 + for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); + // Assemble block header + memset( g_work->data, 0, sizeof(g_work->data) ); +// g_work->data[0] = le32dec( sctx->job.version ); +// for ( i = 0; i < 8; i++ ) +// g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); + for ( i = 0; i < 8; i++ ) + g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i]; +// for ( i = 0; i < 8; i++ ) +// g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); + g_work->data[8] = 0; // nonce + g_work->data[9] = swab32( extraheader[0] ) | ( rand() & 0xf0 ); + g_work->data[10] = be32dec( sctx->job.ntime ); + g_work->data[11] = be32dec( sctx->job.nbits ); + for ( i = 0; i < 8; i++ ) + g_work->data[12+i] = ( (uint32_t*)merkle_root )[i]; +} + +void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id, + uint32_t* end_nonce_ptr, bool clean_job ) +{ + const int wkcmp_sz = 32; // bytes + const int wkcmp_off = 32 + 16; + uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); + + if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz ) + && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) ) + || strcmp( work->job_id, g_work->job_id ) ) + { + work_free( work ); + work_copy( work, g_work ); + *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id; + if ( opt_randomize ) + *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads; + *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; + } + else + ++(*nonceptr); + + // suprnova job_id check without data/target/height change... + // we just may have copied new g_wwork to work so why this test here? +// if ( have_stratum && strcmp( work->job_id, g_work->job_id ) ) + // exit thread loop +// continue; +// else +// { +// nonceptr[1] += 0x10; +// nonceptr[1] |= thr_id; +// } +} + +bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum, + int thr_id ) +{ + if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) ) + // need to regen g_work.. 
+ return false; + // extradata: prevent duplicates + work->data[ 8 ] += 0x10; + work->data[ 8 + 1 ] |= thr_id; + return true; +} + +double blake2b_get_max64() { return 0x1fffffLL; } + +bool register_blake2b_algo( algo_gate_t* gate ) +{ + algo_not_tested(); + gate->ntime_index = 10; + gate->nbits_index = 11; + gate->nonce_index = 8; + gate->work_cmp_size = 32; + gate->scanhash = (void*)&scanhash_blake2b; + gate->hash = (void*)&blake2b_hash; + gate->calc_network_diff = (void*)&blake2b_calc_network_diff; + gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request; + gate->build_extraheader = (void*)&blake2b_build_extraheader; + gate->get_new_work = (void*)&blake2b_get_new_work; + gate->get_max64 = (void*)&blake2b_get_max64; + gate->ready_to_mine = (void*)&blake2b_ready_to_mine; + have_gbt = false; + return true; +} diff --git a/algo/blake/blake2.c b/algo/blake/blake2s.c similarity index 100% rename from algo/blake/blake2.c rename to algo/blake/blake2s.c diff --git a/algo/blake/decred.c b/algo/blake/decred.c index b6684aa..f4bc725 100644 --- a/algo/blake/decred.c +++ b/algo/blake/decred.c @@ -5,13 +5,14 @@ #include #include #include - +/* #ifndef min #define min(a,b) (a>b ? b : a) #endif #ifndef max #define max(a,b) (ajob.coinbase[32], decred_headersize); } - -void decred_build_extraheader( struct work* work, struct stratum_ctx* sctx ) +*/ +void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { + uchar merkle_root[64] = { 0 }; + uint32_t extraheader[32] = { 0 }; + int headersize = 0; uint32_t* extradata = (uint32_t*) sctx->xnonce1; + size_t t; int i; - for ( i = 0; i < 8; i++ ) // prevhash - work->data[1 + i] = swab32( work->data[1 + i] ); - for ( i = 0; i < 8; i++ ) // merkle - work->data[9 + i] = swab32( work->data[9 + i] ); - for ( i = 0; i < decred_headersize/4; i++ ) // header - work->data[17 + i] = decred_extraheader[i]; + + // getwork over stratum, getwork merkle + header passed in coinb1 + memcpy(merkle_root, sctx->job.coinbase, 32); + headersize = min((int)sctx->job.coinbase_size - 32, + sizeof(extraheader) ); + memcpy( extraheader, &sctx->job.coinbase[32], headersize ); + + // Increment extranonce2 + for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); + + // Assemble block header + memset( g_work->data, 0, sizeof(g_work->data) ); + g_work->data[0] = le32dec( sctx->job.version ); + for ( i = 0; i < 8; i++ ) + g_work->data[1 + i] = swab32( + le32dec( (uint32_t *) sctx->job.prevhash + i ) ); + for ( i = 0; i < 8; i++ ) + g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) ); + +// for ( i = 0; i < 8; i++ ) // prevhash +// g_work->data[1 + i] = swab32( g_work->data[1 + i] ); +// for ( i = 0; i < 8; i++ ) // merkle +// g_work->data[9 + i] = swab32( g_work->data[9 + i] ); + + for ( i = 0; i < headersize/4; i++ ) // header + g_work->data[17 + i] = extraheader[i]; // extradata for ( i = 0; i < sctx->xnonce1_size/4; i++ ) - work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i]; + g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i]; for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ ) - work->data[i] = 0; - work->data[37] = (rand()*4) << 8; - sctx->bloc_height = work->data[32]; + g_work->data[i] = 0; + g_work->data[37] = (rand()*4) << 8; + sctx->bloc_height = g_work->data[32]; //applog_hex(work->data, 180); //applog_hex(&work->data[36], 36); } +/* bool decred_prevent_dupes( struct work* work, struct stratum_ctx* stratum, int thr_id ) { +return false; if ( have_stratum && strcmp(stratum->job.job_id, 
work->job_id) ) // need to regen g_work.. return true; @@ -224,6 +252,25 @@ bool decred_prevent_dupes( struct work* work, struct stratum_ctx* stratum, work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id; return false; } +*/ + +bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum, + int thr_id ) +{ + if ( have_stratum && strcmp(stratum->job.job_id, work->job_id) ) + // need to regen g_work.. + return false; + if ( have_stratum && !work->data[0] && !opt_benchmark ) + { + sleep(1); + return false; + } + // extradata: prevent duplicates + work->data[ DECRED_XNONCE_INDEX ] += 1; + work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id; + return true; +} + bool register_decred_algo( algo_gate_t* gate ) { @@ -235,9 +282,10 @@ bool register_decred_algo( algo_gate_t* gate ) gate->get_max64 = (void*)&get_max64_0x3fffffLL; gate->display_extra_data = (void*)&decred_decode_extradata; gate->build_stratum_request = (void*)&decred_be_build_stratum_request; - gate->gen_merkle_root = (void*)&decred_gen_merkle_root; +// gate->gen_merkle_root = (void*)&decred_gen_merkle_root; gate->build_extraheader = (void*)&decred_build_extraheader; - gate->prevent_dupes = (void*)&decred_prevent_dupes; +// gate->prevent_dupes = (void*)&decred_prevent_dupes; + gate->ready_to_mine = (void*)&decred_ready_to_mine; gate->nbits_index = DECRED_NBITS_INDEX; gate->ntime_index = DECRED_NTIME_INDEX; gate->nonce_index = DECRED_NONCE_INDEX; diff --git a/algo/blake/sph_blake2b.c b/algo/blake/sph_blake2b.c new file mode 100644 index 0000000..0b7b794 --- /dev/null +++ b/algo/blake/sph_blake2b.c @@ -0,0 +1,195 @@ +/* + * Copyright 2009 Colin Percival, 2014 savale + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#include +#include +#include + +#include "algo/sha3/sph_types.h" +#include "sph_blake2b.h" + +// Cyclic right rotation. + +#ifndef ROTR64 +#define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y)))) +#endif + +// Little-endian byte access. 
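// [Editor's annotation, not part of the patch] A quick worked example of the
// two helpers defined around here, handy when checking endianness by hand
// (the byte values are illustrative only):
//
//   uint8_t p[8] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 };
//   B2B_GET64( p )                     == 0x0807060504030201ULL  // byte 0 is LSB
//   ROTR64( 0x0000000000000001ULL, 1 ) == 0x8000000000000000ULL
//
// BLAKE2b is specified over little-endian 64-bit words, so B2B_GET64 yields
// the same result on any host regardless of native byte order.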
+ +#define B2B_GET64(p) \ + (((uint64_t) ((uint8_t *) (p))[0]) ^ \ + (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \ + (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \ + (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \ + (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \ + (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \ + (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \ + (((uint64_t) ((uint8_t *) (p))[7]) << 56)) + +// G Mixing function. + +#define B2B_G(a, b, c, d, x, y) { \ + v[a] = v[a] + v[b] + x; \ + v[d] = ROTR64(v[d] ^ v[a], 32); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 24); \ + v[a] = v[a] + v[b] + y; \ + v[d] = ROTR64(v[d] ^ v[a], 16); \ + v[c] = v[c] + v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 63); } + +// Initialization Vector. + +static const uint64_t blake2b_iv[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 +}; + +// Compression function. "last" flag indicates last block. + +static void blake2b_compress( sph_blake2b_ctx *ctx, int last ) +{ + const uint8_t sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + }; + int i; + uint64_t v[16], m[16]; + + for (i = 0; i < 8; i++) { // init work variables + v[i] = ctx->h[i]; + v[i + 8] = blake2b_iv[i]; + } + + v[12] ^= ctx->t[0]; // low 64 bits of offset + v[13] ^= ctx->t[1]; // high 64 bits + if (last) // last block flag set ? + v[14] = ~v[14]; + + for (i = 0; i < 16; i++) // get little-endian words + m[i] = B2B_GET64(&ctx->b[8 * i]); + + for (i = 0; i < 12; i++) { // twelve rounds + B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]); + B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]); + B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]); + B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]); + B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]); + B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]); + B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]); + B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]); + } + + for( i = 0; i < 8; ++i ) + ctx->h[i] ^= v[i] ^ v[i + 8]; +} + +// Initialize the hashing context "ctx" with optional key "key". +// 1 <= outlen <= 64 gives the digest size in bytes. +// Secret key (also <= 64 bytes) is optional (keylen = 0). 
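// [Editor's annotation, not part of the patch] The three functions below follow
// the usual init/update/final pattern. A minimal unkeyed use, mirroring what
// blake2b_hash() in algo/blake/blake2b.c does with an 80-byte header (the
// `header` and `digest` names here are illustrative only):
//
//   sph_blake2b_ctx ctx;
//   uint8_t digest[32];
//   sph_blake2b_init( &ctx, 32, NULL, 0 );   // 32-byte digest, no key
//   sph_blake2b_update( &ctx, header, 80 );
//   sph_blake2b_final( &ctx, digest );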
+ +int sph_blake2b_init( sph_blake2b_ctx *ctx, size_t outlen, const void *key, + size_t keylen ) // (keylen=0: no key) +{ + size_t i; + + if (outlen == 0 || outlen > 64 || keylen > 64) + return -1; // illegal parameters + + for (i = 0; i < 8; i++) // state, "param block" + ctx->h[i] = blake2b_iv[i]; + ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen; + + ctx->t[0] = 0; // input count low word + ctx->t[1] = 0; // input count high word + ctx->c = 0; // pointer within buffer + ctx->outlen = outlen; + + for (i = keylen; i < 128; i++) // zero input block + ctx->b[i] = 0; + if (keylen > 0) { + sph_blake2b_update(ctx, key, keylen); + ctx->c = 128; // at the end + } + + return 0; +} + +// Add "inlen" bytes from "in" into the hash. + +void sph_blake2b_update( sph_blake2b_ctx *ctx, const void *in, size_t inlen ) +{ + size_t i; + + for (i = 0; i < inlen; i++) { + if (ctx->c == 128) { // buffer full ? + ctx->t[0] += ctx->c; // add counters + if (ctx->t[0] < ctx->c) // carry overflow ? + ctx->t[1]++; // high word + blake2b_compress(ctx, 0); // compress (not last) + ctx->c = 0; // counter to zero + } + ctx->b[ctx->c++] = ((const uint8_t *) in)[i]; + } +} + +// Generate the message digest (size given in init). +// Result placed in "out". + +void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out ) +{ + size_t i; + + ctx->t[0] += ctx->c; // mark last block offset + if (ctx->t[0] < ctx->c) // carry overflow + ctx->t[1]++; // high word + + while (ctx->c < 128) // fill up with zeros + ctx->b[ctx->c++] = 0; + blake2b_compress(ctx, 1); // final block flag = 1 + + // little endian convert and store + for (i = 0; i < ctx->outlen; i++) { + ((uint8_t *) out)[i] = + (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF; + } +} + diff --git a/algo/blake/sph_blake2b.h b/algo/blake/sph_blake2b.h new file mode 100644 index 0000000..eaae071 --- /dev/null +++ b/algo/blake/sph_blake2b.h @@ -0,0 +1,41 @@ +#pragma once +#ifndef __BLAKE2B_H__ +#define __BLAKE2B_H__ + +#include +#include + +#if defined(_MSC_VER) +#include +#define inline __inline +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__) +#define NATIVE_LITTLE_ENDIAN +#endif + +// state context +ALIGN(64) typedef struct { + uint8_t b[128]; // input buffer + uint64_t h[8]; // chained state + uint64_t t[2]; // total number of bytes + size_t c; // pointer for b[] + size_t outlen; // digest size +} sph_blake2b_ctx; + +#if defined(__cplusplus) +extern "C" { +#endif + +int sph_blake2b_init( sph_blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen); +void sph_blake2b_update( sph_blake2b_ctx *ctx, const void *in, size_t inlen); +void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/algo/bmw/sse2/bmw.c b/algo/bmw/sse2/bmw.c index 6eb350b..a990afb 100644 --- a/algo/bmw/sse2/bmw.c +++ b/algo/bmw/sse2/bmw.c @@ -485,6 +485,7 @@ do { \ sph_enc64le(out + 8 * u, h1[v]); \ } while (0) +/* static void compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) { @@ -499,6 +500,7 @@ compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) #undef H #undef dH } +*/ static const sph_u64 final_b[16] = { SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1), diff --git a/algo/cryptonight/cryptonight-aesni.c b/algo/cryptonight/cryptonight-aesni.c index 00145b1..f906c17 100644 --- a/algo/cryptonight/cryptonight-aesni.c +++ b/algo/cryptonight/cryptonight-aesni.c @@ -202,7 
+202,7 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len ) _mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x); uint64_t *nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0]; - uint64_t b[2]; +// uint64_t b[2]; b[0] = nextblock[0]; b[1] = nextblock[1]; diff --git a/algo/cubehash/sse2/cubehash_sse2.c b/algo/cubehash/sse2/cubehash_sse2.c index c883c07..b4c882e 100644 --- a/algo/cubehash/sse2/cubehash_sse2.c +++ b/algo/cubehash/sse2/cubehash_sse2.c @@ -24,10 +24,10 @@ static void transform( cubehashParam *sp ) __m256i x0, x1, x2, x3, y0, y1; - x0 = _mm256_load_si256( 0 + sp->x ); - x1 = _mm256_load_si256( 2 + sp->x ); - x2 = _mm256_load_si256( 4 + sp->x ); - x3 = _mm256_load_si256( 6 + sp->x ); + x0 = _mm256_load_si256( (__m256i*)sp->x ); + x1 = _mm256_load_si256( (__m256i*)sp->x + 1 ); + x2 = _mm256_load_si256( (__m256i*)sp->x + 2 ); + x3 = _mm256_load_si256( (__m256i*)sp->x + 3 ); for ( r = 0; r < rounds; ++r ) { @@ -57,22 +57,22 @@ static void transform( cubehashParam *sp ) x3 = _mm256_shuffle_epi32( x3, 0xb1 ); } - _mm256_store_si256( 0 + sp->x, x0 ); - _mm256_store_si256( 2 + sp->x, x1 ); - _mm256_store_si256( 4 + sp->x, x2 ); - _mm256_store_si256( 6 + sp->x, x3 ); + _mm256_store_si256( (__m256i*)sp->x, x0 ); + _mm256_store_si256( (__m256i*)sp->x + 1, x1 ); + _mm256_store_si256( (__m256i*)sp->x + 2, x2 ); + _mm256_store_si256( (__m256i*)sp->x + 3, x3 ); #else __m128i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3; - x0 = _mm_load_si128(0 + sp->x); - x1 = _mm_load_si128(1 + sp->x); - x2 = _mm_load_si128(2 + sp->x); - x3 = _mm_load_si128(3 + sp->x); - x4 = _mm_load_si128(4 + sp->x); - x5 = _mm_load_si128(5 + sp->x); - x6 = _mm_load_si128(6 + sp->x); - x7 = _mm_load_si128(7 + sp->x); + x0 = _mm_load_si128( (__m128i*)sp->x ); + x1 = _mm_load_si128( (__m128i*)sp->x + 1 ); + x2 = _mm_load_si128( (__m128i*)sp->x + 2 ); + x3 = _mm_load_si128( (__m128i*)sp->x + 3 ); + x4 = _mm_load_si128( (__m128i*)sp->x + 4 ); + x5 = _mm_load_si128( (__m128i*)sp->x + 5 ); + x6 = _mm_load_si128( (__m128i*)sp->x + 6 ); + x7 = _mm_load_si128( (__m128i*)sp->x + 7 ); for (r = 0; r < rounds; ++r) { x4 = _mm_add_epi32(x0, x4); @@ -117,14 +117,14 @@ static void transform( cubehashParam *sp ) x7 = _mm_shuffle_epi32(x7, 0xb1); } - _mm_store_si128(0 + sp->x, x0); - _mm_store_si128(1 + sp->x, x1); - _mm_store_si128(2 + sp->x, x2); - _mm_store_si128(3 + sp->x, x3); - _mm_store_si128(4 + sp->x, x4); - _mm_store_si128(5 + sp->x, x5); - _mm_store_si128(6 + sp->x, x6); - _mm_store_si128(7 + sp->x, x7); + _mm_store_si128( (__m128i*)sp->x, x0 ); + _mm_store_si128( (__m128i*)sp->x + 1, x1 ); + _mm_store_si128( (__m128i*)sp->x + 2, x2 ); + _mm_store_si128( (__m128i*)sp->x + 3, x3 ); + _mm_store_si128( (__m128i*)sp->x + 4, x4 ); + _mm_store_si128( (__m128i*)sp->x + 5, x5 ); + _mm_store_si128( (__m128i*)sp->x + 6, x6 ); + _mm_store_si128( (__m128i*)sp->x + 7, x7 ); #endif } // transform diff --git a/algo/hmq1725.c b/algo/hmq1725.c index 25b9b31..79d9c04 100644 --- a/algo/hmq1725.c +++ b/algo/hmq1725.c @@ -49,7 +49,6 @@ typedef struct { sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; sph_sha512_context sha1, sha2; sph_haval256_5_context haval1, haval2; - #ifdef NO_AES_NI sph_groestl512_context groestl1, groestl2; sph_echo512_context echo1, echo2; @@ -57,10 +56,10 @@ typedef struct { hashState_echo echo1, echo2; hashState_groestl groestl1, groestl2; #endif - } hmq1725_ctx_holder; static hmq1725_ctx_holder hmq1725_ctx; +static __thread sph_bmw512_context hmq_bmw_mid; 
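// [Editor's annotation, not part of the patch] hmq_bmw_mid holds the BMW512
// state after absorbing the first 64 bytes of the 80-byte header, which are
// constant for a given work item; only the last 16 bytes change with the
// nonce. scanhash_hmq1725() calls hmq_bmw512_midstate( endiandata ) once per
// new work, and hmq1725hash() then restores the saved state and hashes only
// the tail, roughly:
//
//   memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid );
//   sph_bmw512( &h_ctx.bmw1, input + 64, 80 - 64 );   // remaining 16 bytes
//   sph_bmw512_close( &h_ctx.bmw1, hashA );
//
// Together with moving the large context off the stack into the thread-local
// h_ctx, this is presumably the "reduced stack usage ... and small speedup"
// noted for hmq1725 in the v3.5.9 release notes.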
void init_hmq1725_ctx() { @@ -122,174 +121,185 @@ void init_hmq1725_ctx() #endif } +void hmq_bmw512_midstate( const void* input ) +{ + memcpy( &hmq_bmw_mid, &hmq1725_ctx.bmw1, sizeof hmq_bmw_mid ); + sph_bmw512( &hmq_bmw_mid, input, 64 ); +} + +__thread hmq1725_ctx_holder h_ctx; + extern void hmq1725hash(void *state, const void *input) { const uint32_t mask = 24; - uint32_t hashA[25], hashB[25]; - hmq1725_ctx_holder ctx; + uint32_t hashA[16] __attribute__((aligned(64))); + uint32_t hashB[16] __attribute__((aligned(64))); + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 - memcpy(&ctx, &hmq1725_ctx, sizeof(hmq1725_ctx)); + memcpy(&h_ctx, &hmq1725_ctx, sizeof(hmq1725_ctx)); - sph_bmw512 (&ctx.bmw1, input, 80); //0 - sph_bmw512_close(&ctx.bmw1, hashA); //1 + memcpy( &h_ctx.bmw1, &hmq_bmw_mid, sizeof hmq_bmw_mid ); + sph_bmw512( &h_ctx.bmw1, input + midlen, tail ); + sph_bmw512_close(&h_ctx.bmw1, hashA); //1 - sph_whirlpool (&ctx.whirlpool1, hashA, 64); //0 - sph_whirlpool_close(&ctx.whirlpool1, hashB); //1 + sph_whirlpool (&h_ctx.whirlpool1, hashA, 64); //0 + sph_whirlpool_close(&h_ctx.whirlpool1, hashB); //1 if ( hashB[0] & mask ) //1 { #ifdef NO_AES_NI - sph_groestl512 (&ctx.groestl1, hashB, 64); //1 - sph_groestl512_close(&ctx.groestl1, hashA); //2 + sph_groestl512 (&h_ctx.groestl1, hashB, 64); //1 + sph_groestl512_close(&h_ctx.groestl1, hashA); //2 #else - update_groestl( &ctx.groestl1, (char*)hashB, 512 ); - final_groestl( &ctx.groestl1, (char*)hashA ); + update_and_final_groestl( &h_ctx.groestl1, (char*)hashA, + (const char*)hashB, 512 ); #endif } else { - sph_skein512 (&ctx.skein1, hashB, 64); //1 - sph_skein512_close(&ctx.skein1, hashA); //2 + sph_skein512 (&h_ctx.skein1, hashB, 64); //1 + sph_skein512_close(&h_ctx.skein1, hashA); //2 } - sph_jh512 (&ctx.jh1, hashA, 64); //3 - sph_jh512_close(&ctx.jh1, hashB); //4 + sph_jh512 (&h_ctx.jh1, hashA, 64); //3 + sph_jh512_close(&h_ctx.jh1, hashB); //4 - sph_keccak512 (&ctx.keccak1, hashB, 64); //2 - sph_keccak512_close(&ctx.keccak1, hashA); //3 + sph_keccak512 (&h_ctx.keccak1, hashB, 64); //2 + sph_keccak512_close(&h_ctx.keccak1, hashA); //3 if ( hashA[0] & mask ) //4 { - sph_blake512 (&ctx.blake1, hashA, 64); // - sph_blake512_close(&ctx.blake1, hashB); //5 + sph_blake512 (&h_ctx.blake1, hashA, 64); // + sph_blake512_close(&h_ctx.blake1, hashB); //5 } else { - sph_bmw512 (&ctx.bmw2, hashA, 64); //4 - sph_bmw512_close(&ctx.bmw2, hashB); //5 + sph_bmw512 (&h_ctx.bmw2, hashA, 64); //4 + sph_bmw512_close(&h_ctx.bmw2, hashB); //5 } - update_luffa( &ctx.luffa1, (BitSequence*)hashB, 64 ); - final_luffa( &ctx.luffa1, (BitSequence*)hashA ); + update_and_final_luffa( &h_ctx.luffa1, (BitSequence*)hashA, + (const BitSequence*)hashB, 64 ); - cubehashUpdate( &ctx.cube, (BitSequence *)hashA, 64 ); - cubehashDigest( &ctx.cube, (BitSequence *)hashB ); + cubehashUpdateDigest( &h_ctx.cube, (BitSequence *)hashB, + (const BitSequence *)hashA, 64 ); if ( hashB[0] & mask ) //7 { - sph_keccak512 (&ctx.keccak2, hashB, 64); // - sph_keccak512_close(&ctx.keccak2, hashA); //8 + sph_keccak512 (&h_ctx.keccak2, hashB, 64); // + sph_keccak512_close(&h_ctx.keccak2, hashA); //8 } else { - sph_jh512 (&ctx.jh2, hashB, 64); //7 - sph_jh512_close(&ctx.jh2, hashA); //8 + sph_jh512 (&h_ctx.jh2, hashB, 64); //7 + sph_jh512_close(&h_ctx.jh2, hashA); //8 } - sph_shavite512 (&ctx.shavite1, hashA, 64); //3 - sph_shavite512_close(&ctx.shavite1, hashB); //4 + sph_shavite512 (&h_ctx.shavite1, hashA, 64); //3 + sph_shavite512_close(&h_ctx.shavite1, hashB); //4 - 
update_sd( &ctx.simd1, (BitSequence *)hashB, 512 ); - final_sd( &ctx.simd1, (BitSequence *)hashA ); + update_final_sd( &h_ctx.simd1, (BitSequence *)hashA, + (const BitSequence *)hashB, 512 ); if ( hashA[0] & mask ) //4 { - sph_whirlpool (&ctx.whirlpool2, hashA, 64); // - sph_whirlpool_close(&ctx.whirlpool2, hashB); //5 + sph_whirlpool (&h_ctx.whirlpool2, hashA, 64); // + sph_whirlpool_close(&h_ctx.whirlpool2, hashB); //5 } else { - sph_haval256_5 (&ctx.haval1, hashA, 64); //4 - sph_haval256_5_close(&ctx.haval1, hashB); //5 + sph_haval256_5 (&h_ctx.haval1, hashA, 64); //4 + sph_haval256_5_close(&h_ctx.haval1, hashB); //5 memset(&hashB[8], 0, 32); } #ifdef NO_AES_NI - sph_echo512 (&ctx.echo1, hashB, 64); //5 - sph_echo512_close(&ctx.echo1, hashA); //6 + sph_echo512 (&h_ctx.echo1, hashB, 64); //5 + sph_echo512_close(&h_ctx.echo1, hashA); //6 #else - update_echo ( &ctx.echo1, (BitSequence *)hashB, 512 ); - final_echo( &ctx.echo1, (BitSequence *)hashA ); + update_final_echo ( &h_ctx.echo1, (BitSequence *)hashA, + (const BitSequence *)hashB, 512 ); #endif - sph_blake512 (&ctx.blake2, hashA, 64); //6 - sph_blake512_close(&ctx.blake2, hashB); //7 + sph_blake512 (&h_ctx.blake2, hashA, 64); //6 + sph_blake512_close(&h_ctx.blake2, hashB); //7 if ( hashB[0] & mask ) //7 { - sph_shavite512 (&ctx.shavite2, hashB, 64); // - sph_shavite512_close(&ctx.shavite2, hashA); //8 + sph_shavite512 (&h_ctx.shavite2, hashB, 64); // + sph_shavite512_close(&h_ctx.shavite2, hashA); //8 } else { - update_luffa( &ctx.luffa2, (BitSequence *)hashB, 64 ); - final_luffa( &ctx.luffa2, (BitSequence *)hashA ); + update_and_final_luffa( &h_ctx.luffa2, (BitSequence *)hashA, + (const BitSequence *)hashB, 64 ); } - sph_hamsi512 (&ctx.hamsi1, hashA, 64); //3 - sph_hamsi512_close(&ctx.hamsi1, hashB); //4 + sph_hamsi512 (&h_ctx.hamsi1, hashA, 64); //3 + sph_hamsi512_close(&h_ctx.hamsi1, hashB); //4 - sph_fugue512 (&ctx.fugue1, hashB, 64); //2 //// - sph_fugue512_close(&ctx.fugue1, hashA); //3 + sph_fugue512 (&h_ctx.fugue1, hashB, 64); //2 //// + sph_fugue512_close(&h_ctx.fugue1, hashA); //3 if ( hashA[0] & mask ) //4 { #ifdef NO_AES_NI - sph_echo512 (&ctx.echo2, hashA, 64); // - sph_echo512_close(&ctx.echo2, hashB); //5 + sph_echo512 (&h_ctx.echo2, hashA, 64); // + sph_echo512_close(&h_ctx.echo2, hashB); //5 #else - update_echo ( &ctx.echo2, (BitSequence *)hashA, 512 ); - final_echo( &ctx.echo2, (BitSequence *)hashB ); + update_final_echo ( &h_ctx.echo2, (BitSequence *)hashB, + (const BitSequence *)hashA, 512 ); #endif } else { - update_sd( &ctx.simd2, (BitSequence *)hashA, 512 ); - final_sd( &ctx.simd2, (BitSequence *)hashB ); + update_final_sd( &h_ctx.simd2, (BitSequence *)hashB, + (const BitSequence *)hashA, 512 ); } - sph_shabal512 (&ctx.shabal1, hashB, 64); //5 - sph_shabal512_close(&ctx.shabal1, hashA); //6 + sph_shabal512 (&h_ctx.shabal1, hashB, 64); //5 + sph_shabal512_close(&h_ctx.shabal1, hashA); //6 - sph_whirlpool (&ctx.whirlpool3, hashA, 64); //6 - sph_whirlpool_close(&ctx.whirlpool3, hashB); //7 + sph_whirlpool (&h_ctx.whirlpool3, hashA, 64); //6 + sph_whirlpool_close(&h_ctx.whirlpool3, hashB); //7 if ( hashB[0] & mask ) //7 { - sph_fugue512 (&ctx.fugue2, hashB, 64); // - sph_fugue512_close(&ctx.fugue2, hashA); //8 + sph_fugue512 (&h_ctx.fugue2, hashB, 64); // + sph_fugue512_close(&h_ctx.fugue2, hashA); //8 } else { - sph_sha512 (&ctx.sha1, hashB, 64); //7 - sph_sha512_close(&ctx.sha1, hashA); //8 + sph_sha512 (&h_ctx.sha1, hashB, 64); //7 + sph_sha512_close(&h_ctx.sha1, hashA); //8 } #ifdef NO_AES_NI - sph_groestl512 
(&ctx.groestl2, hashA, 64); //3 - sph_groestl512_close(&ctx.groestl2, hashB); //4 + sph_groestl512 (&h_ctx.groestl2, hashA, 64); //3 + sph_groestl512_close(&h_ctx.groestl2, hashB); //4 #else - update_groestl( &ctx.groestl2, (char*)hashA, 512 ); - final_groestl( &ctx.groestl2, (char*)hashB ); + update_and_final_groestl( &h_ctx.groestl2, (char*)hashB, + (const char*)hashA, 512 ); #endif - sph_sha512 (&ctx.sha2, hashB, 64); //2 - sph_sha512_close(&ctx.sha2, hashA); //3 + sph_sha512 (&h_ctx.sha2, hashB, 64); //2 + sph_sha512_close(&h_ctx.sha2, hashA); //3 if ( hashA[0] & mask ) //4 { - sph_haval256_5 (&ctx.haval2, hashA, 64); // - sph_haval256_5_close(&ctx.haval2, hashB); //5 + sph_haval256_5 (&h_ctx.haval2, hashA, 64); // + sph_haval256_5_close(&h_ctx.haval2, hashB); //5 memset(&hashB[8], 0, 32); } else { - sph_whirlpool (&ctx.whirlpool4, hashA, 64); //4 - sph_whirlpool_close(&ctx.whirlpool4, hashB); //5 + sph_whirlpool (&h_ctx.whirlpool4, hashA, 64); //4 + sph_whirlpool_close(&h_ctx.whirlpool4, hashB); //5 } - sph_bmw512 (&ctx.bmw3, hashB, 64); //5 - sph_bmw512_close(&ctx.bmw3, hashA); //6 + sph_bmw512 (&h_ctx.bmw3, hashB, 64); //5 + sph_bmw512_close(&h_ctx.bmw3, hashA); //6 memcpy(state, hashA, 32); } @@ -309,6 +319,8 @@ int scanhash_hmq1725( int thr_id, struct work *work, int32_t max_nonce, for (int k = 0; k < 32; k++) be32enc(&endiandata[k], pdata[k]); + hmq_bmw512_midstate( endiandata ); + // if (opt_debug) // { // applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce); diff --git a/algo/hodl/hodl-gate.c b/algo/hodl/hodl-gate.c index a9fe168..1b53247 100644 --- a/algo/hodl/hodl-gate.c +++ b/algo/hodl/hodl-gate.c @@ -47,6 +47,21 @@ void hodl_le_build_stratum_request( char* req, struct work* work, void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx ) { + uchar merkle_root[64] = { 0 }; + size_t t; + int i; + + algo_gate.gen_merkle_root( merkle_root, sctx ); + // Increment extranonce2 + for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); + // Assemble block header + memset( g_work->data, 0, sizeof(g_work->data) ); + g_work->data[0] = le32dec( sctx->job.version ); + for ( i = 0; i < 8; i++ ) + g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); + for ( i = 0; i < 8; i++ ) + g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); + g_work->data[ algo_gate.ntime_index ] = le32dec( sctx->job.ntime ); g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits ); g_work->data[22] = 0x80000000; diff --git a/algo/jh/sse2/jh_sse2_opt64.h b/algo/jh/sse2/jh_sse2_opt64.h index a49e605..7f8e53f 100644 --- a/algo/jh/sse2/jh_sse2_opt64.h +++ b/algo/jh/sse2/jh_sse2_opt64.h @@ -102,7 +102,7 @@ static DATA_ALIGN16(const unsigned char jhE8_bitslice_roundconstant[42][32])={ {0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; -static void jhF8(jhState *state); /* the compression function F8 */ +//static void jhF8(jhState *state); /* the compression function F8 */ /*The API functions*/ @@ -236,7 +236,8 @@ static void jhF8(jhState *state); /* the compression function F8 */ jhSS(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7, jhLOAD(jhE8_bitslice_roundconstant[r]), jhLOAD(jhE8_bitslice_roundconstant[r]+16) ); \ jhlineartransform_R##nn(jhSx0,jhSx2,jhSx4,jhSx6,jhSx1,jhSx3,jhSx5,jhSx7); -/*the compression function F8 */ +/* +//the compression function F8 static void 
jhF8(jhState *state) { return; @@ -257,9 +258,9 @@ static void jhF8(jhState *state) y6 = state->x6; y7 = state->x7; - /*xor the 512-bit message with the fist half of the 1024-bit hash state*/ + //xor the 512-bit message with the fist half of the 1024-bit hash state - /*perform 42 rounds*/ + //perform 42 rounds for (i = 0; i < 42; i = i+7) { jhround_function(00,i); jhround_function(01,i+1); @@ -270,7 +271,7 @@ static void jhF8(jhState *state) jhround_function(06,i+6); } - /*xor the 512-bit message with the second half of the 1024-bit hash state*/ + //xor the 512-bit message with the second half of the 1024-bit hash state state->x0 = y0; state->x1 = y1; @@ -285,6 +286,7 @@ static void jhF8(jhState *state) y7 = jhXOR(y7, jhLOAD(state->buffer+48)), state->x7 = y7; } +*/ #define jhF8I \ do { \ diff --git a/algo/lbry.c b/algo/lbry.c index 16d8570..21d1987 100644 --- a/algo/lbry.c +++ b/algo/lbry.c @@ -146,12 +146,10 @@ double lbry_calc_network_diff( struct work *work ) { // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... -// uint32_t nbits = have_longpoll ? work->data[18] : swab32(work->data[18]); uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] ); uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 -// uint64_t diffone = 0x0000FFFF00000000ull; double d = (double)0x0000ffff / (double)bits; for (int m=shift; m < 29; m++) d *= 256.0; @@ -181,13 +179,27 @@ void lbry_le_build_stratum_request( char *req, struct work *work, free(xnonce2str); } -void lbry_build_extraheader( struct work* work, struct stratum_ctx* sctx ) +void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { + unsigned char merkle_root[64] = { 0 }; + size_t t; + int i; + + algo_gate.gen_merkle_root( merkle_root, sctx ); + // Increment extranonce2 + for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); + // Assemble block header + memset( g_work->data, 0, sizeof(g_work->data) ); + g_work->data[0] = le32dec( sctx->job.version ); + for ( i = 0; i < 8; i++ ) + g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i ); + for ( i = 0; i < 8; i++ ) + g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); for ( int i = 0; i < 8; i++ ) - work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i]; - work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime); - work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits); - work->data[28] = 0x80000000; + g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i]; + g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime); + g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits); + g_work->data[28] = 0x80000000; } void lbry_set_target( struct work* work, double job_diff ) diff --git a/algo/luffa/sse2/luffa_for_sse2.c b/algo/luffa/sse2/luffa_for_sse2.c index db4d31c..368d8ad 100644 --- a/algo/luffa/sse2/luffa_for_sse2.c +++ b/algo/luffa/sse2/luffa_for_sse2.c @@ -20,20 +20,9 @@ #include #include +#include "avxdefs.h" #include "luffa_for_sse2.h" -#ifdef HASH_BIG_ENDIAN -# define BYTES_SWAP32(x) x -#else -# define BYTES_SWAP32(x) \ - ((x << 24) | ((x & 0x0000ff00) << 8) | ((x & 0x00ff0000) >> 8) | (x >> 24)) -#endif /* HASH_BIG_ENDIAN */ - -/* BYTES_SWAP256(x) stores each 32-bit word of 256 bits data in little-endian convention */ -#define BYTES_SWAP256(x) { \ - int _i = 8; while(_i--){x[_i] = BYTES_SWAP32(x[_i]);} \ -} - #define MULT2(a0,a1) do \ { \ __m128i b; \ @@ -200,11 +189,10 @@ NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3); -static 
void Update512(hashState_luffa *state, const BitSequence *data, DataLength databitlen); +static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ); +//static void rnd512( hashState_luffa *state ); -static void rnd512(hashState_luffa *state); - -static void finalization512(hashState_luffa *state, uint32 *b); +static void finalization512( hashState_luffa *state, uint32 *b ); /* initial values of chaining variables */ @@ -282,45 +270,44 @@ HashReturn init_luffa(hashState_luffa *state, int hashbitlen) HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, size_t len ) { - HashReturn ret=SUCCESS; - int i, j; - int rem = len % 32; - int blocks = (int)( len / 32 ); - uint8 *p = (uint8*)state->buffer; + int i; + int blocks = (int)len / 32; + state-> rembytes = (int)len % 32; // full blocks - for ( j = 0; j < blocks; j++ ) + for ( i = 0; i < blocks; i++ ) { - state->buffer[0] = BYTES_SWAP32( ((uint32*)data)[0] ); - state->buffer[1] = BYTES_SWAP32( ((uint32*)data)[1] ); - state->buffer[2] = BYTES_SWAP32( ((uint32*)data)[2] ); - state->buffer[3] = BYTES_SWAP32( ((uint32*)data)[3] ); - state->buffer[4] = BYTES_SWAP32( ((uint32*)data)[4] ); - state->buffer[5] = BYTES_SWAP32( ((uint32*)data)[5] ); - state->buffer[6] = BYTES_SWAP32( ((uint32*)data)[6] ); - state->buffer[7] = BYTES_SWAP32( ((uint32*)data)[7] ); - - rnd512( state ); + rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ), + mm_byteswap_epi32( casti_m128i( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } - // remaining partial block, if any - for ( i = 0; i < rem/4; i++ ) - state->buffer[i] = BYTES_SWAP32( ((uint32*)data)[i] ); + // 16 byte partial block exists for 80 byte len + // store in buffer for transform in final for midstate to work + if ( state->rembytes ) + { + // remaining data bytes + casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) ); + // padding of partial block + casti_m128i( state->buffer, 1 ) = + _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); + } - // padding of partial block - memset( p+rem+1, 0, (31-rem)*sizeof(uint8) ); - p[rem] = 0x80; - for ( i = rem/4; i < 8; i++ ) - state->buffer[i] = BYTES_SWAP32(state->buffer[i]); - - rnd512(state); - - return ret; + return SUCCESS; } HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval) { + // transform pad block + if ( state->rembytes ) + // not empty, data is in buffer + rnd512( state, casti_m128i( state->buffer, 1 ), + casti_m128i( state->buffer, 0 ) ); + else + // empty pad block, constant data + rnd512( state, _mm_setzero_si128(), + _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); + finalization512(state, (uint32*) hashval); if ( state->hashbitlen > 512 ) finalization512( state, (uint32*)( hashval+128 ) ); @@ -330,79 +317,66 @@ HashReturn final_luffa(hashState_luffa *state, BitSequence *hashval) HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, const BitSequence* data, size_t inlen ) { - HashReturn ret=SUCCESS; - int i, j; - int rem = inlen % 32; +// Optimized for integrals of 16 bytes, good for 64 and 80 byte len + int i; int blocks = (int)( inlen / 32 ); - uint8 *p = (uint8*)state->buffer; + state->rembytes = inlen % 32; // full blocks - for ( j = 0; j < blocks; j++ ) + for ( i = 0; i < blocks; i++ ) { - state->buffer[0] = BYTES_SWAP32( ((uint32*)data)[0] ); - state->buffer[1] = BYTES_SWAP32( ((uint32*)data)[1] ); - state->buffer[2] = BYTES_SWAP32( ((uint32*)data)[2] ); - state->buffer[3] = BYTES_SWAP32( ((uint32*)data)[3] ); - state->buffer[4] = 
BYTES_SWAP32( ((uint32*)data)[4] ); - state->buffer[5] = BYTES_SWAP32( ((uint32*)data)[5] ); - state->buffer[6] = BYTES_SWAP32( ((uint32*)data)[6] ); - state->buffer[7] = BYTES_SWAP32( ((uint32*)data)[7] ); - - rnd512( state ); + rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ), + mm_byteswap_epi32( casti_m128i( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } - // remaining partial block, if any - for ( i = 0; i < rem/4; i++ ) - state->buffer[i] = BYTES_SWAP32( ((uint32*)data)[i] ); - - // padding of partial block - memset( p+rem+1, 0, (31-rem)*sizeof(uint8) ); - p[rem] = 0x80; - for ( i = rem/4; i < 8; i++ ) - state->buffer[i] = BYTES_SWAP32(state->buffer[i]); - - rnd512( state ); + // 16 byte partial block exists for 80 byte len + if ( state->rembytes ) + // remaining 16 data bytes + 16 bytes padding + rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), + mm_byteswap_epi32( cast_m128i( data ) ) ); + else + // empty pad block + rnd512( state, _mm_setzero_si128(), + _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); finalization512( state, (uint32*) output ); if ( state->hashbitlen > 512 ) finalization512( state, (uint32*)( output+128 ) ); - return SUCCESS; + return SUCCESS; } /***************************************************/ /* Round function */ /* state: hash context */ - -static void rnd512(hashState_luffa *state) +static void rnd512( hashState_luffa *state, __m128i msg1, __m128i msg0 ) { __m128i t[2]; __m128i *chainv = state->chainv; - __m128i msg[2]; __m128i tmp[2]; __m128i x[8]; - int i; + +// _mm_prefetch( chainv, _MM_HINT_T0 ); +// _mm_prefetch( chainv + 4, _MM_HINT_T0 ); t[0] = chainv[0]; t[1] = chainv[1]; - t[0] = _mm_xor_si128(t[0], chainv[2]); - t[1] = _mm_xor_si128(t[1], chainv[3]); - t[0] = _mm_xor_si128(t[0], chainv[4]); - t[1] = _mm_xor_si128(t[1], chainv[5]); - t[0] = _mm_xor_si128(t[0], chainv[6]); - t[1] = _mm_xor_si128(t[1], chainv[7]); - t[0] = _mm_xor_si128(t[0], chainv[8]); - t[1] = _mm_xor_si128(t[1], chainv[9]); + t[0] = _mm_xor_si128( t[0], chainv[2] ); + t[1] = _mm_xor_si128( t[1], chainv[3] ); + t[0] = _mm_xor_si128( t[0], chainv[4] ); + t[1] = _mm_xor_si128( t[1], chainv[5] ); + t[0] = _mm_xor_si128( t[0], chainv[6] ); + t[1] = _mm_xor_si128( t[1], chainv[7] ); + t[0] = _mm_xor_si128( t[0], chainv[8] ); + t[1] = _mm_xor_si128( t[1], chainv[9] ); - MULT2( t[0], t[1]); + MULT2( t[0], t[1] ); - msg[0] = _mm_load_si128 ( (__m128i*)&state->buffer[0] ); - msg[1] = _mm_load_si128 ( (__m128i*)&state->buffer[4] ); - msg[0] = _mm_shuffle_epi32( msg[0], 27 ); - msg[1] = _mm_shuffle_epi32( msg[1], 27 ); + msg0 = _mm_shuffle_epi32( msg0, 27 ); + msg1 = _mm_shuffle_epi32( msg1, 27 ); chainv[0] = _mm_xor_si128( chainv[0], t[0] ); chainv[1] = _mm_xor_si128( chainv[1], t[1] ); @@ -468,30 +442,30 @@ static void rnd512(hashState_luffa *state) MULT2( chainv[0], chainv[1] ); - chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg[0] ); - chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg[1] ); + chainv[0] = _mm_xor_si128( _mm_xor_si128( chainv[0], t[0] ), msg0 ); + chainv[1] = _mm_xor_si128( _mm_xor_si128( chainv[1], t[1] ), msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); - chainv[2] = _mm_xor_si128( chainv[2], msg[0] ); - chainv[3] = _mm_xor_si128( chainv[3], msg[1] ); + chainv[2] = _mm_xor_si128( chainv[2], msg0 ); + chainv[3] = _mm_xor_si128( chainv[3], msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); - chainv[4] = _mm_xor_si128( chainv[4], msg[0] ); - chainv[5] = _mm_xor_si128( chainv[5], msg[1] ); + 
chainv[4] = _mm_xor_si128( chainv[4], msg0 ); + chainv[5] = _mm_xor_si128( chainv[5], msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); - chainv[6] = _mm_xor_si128( chainv[6], msg[0] ); - chainv[7] = _mm_xor_si128( chainv[7], msg[1] ); + chainv[6] = _mm_xor_si128( chainv[6], msg0 ); + chainv[7] = _mm_xor_si128( chainv[7], msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); - chainv[8] = _mm_xor_si128( chainv[8], msg[0] ); - chainv[9] = _mm_xor_si128( chainv[9], msg[1] ); + chainv[8] = _mm_xor_si128( chainv[8], msg0 ); + chainv[9] = _mm_xor_si128( chainv[9], msg1 ); - MULT2( msg[0], msg[1]); + MULT2( msg0, msg1); chainv[3] = _mm_or_si128( _mm_slli_epi32(chainv[3], 1), _mm_srli_epi32(chainv[3], 31) ); @@ -548,16 +522,16 @@ static void rnd512(hashState_luffa *state) /* state: hash context */ /* b[8]: hash values */ -static void finalization512(hashState_luffa *state, uint32 *b) +static void finalization512( hashState_luffa *state, uint32 *b ) { + uint32 hash[8] __attribute((aligned(64))); __m128i* chainv = state->chainv; __m128i t[2]; - uint32 hash[8] __attribute((aligned(16))); - int i; /*---- blank round with m=0 ----*/ - memset(state->buffer, 0, sizeof state->buffer ); - rnd512(state); + rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() ); + +// _mm_prefetch( b, _MM_HINT_T0 ); t[0] = chainv[0]; t[1] = chainv[1]; @@ -576,17 +550,10 @@ static void finalization512(hashState_luffa *state, uint32 *b) _mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[4], t[1]); - b[0] = BYTES_SWAP32(hash[0]); - b[1] = BYTES_SWAP32(hash[1]); - b[2] = BYTES_SWAP32(hash[2]); - b[3] = BYTES_SWAP32(hash[3]); - b[4] = BYTES_SWAP32(hash[4]); - b[5] = BYTES_SWAP32(hash[5]); - b[6] = BYTES_SWAP32(hash[6]); - b[7] = BYTES_SWAP32(hash[7]); + casti_m128i( b, 0 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) ); + casti_m128i( b, 1 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) ); - memset(state->buffer, 0, sizeof state->buffer ); - rnd512(state); + rnd512( state, _mm_setzero_si128(), _mm_setzero_si128() ); t[0] = chainv[0]; t[1] = chainv[1]; @@ -605,14 +572,8 @@ static void finalization512(hashState_luffa *state, uint32 *b) _mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[4], t[1]); - b[ 8] = BYTES_SWAP32(hash[0]); - b[ 9] = BYTES_SWAP32(hash[1]); - b[10] = BYTES_SWAP32(hash[2]); - b[11] = BYTES_SWAP32(hash[3]); - b[12] = BYTES_SWAP32(hash[4]); - b[13] = BYTES_SWAP32(hash[5]); - b[14] = BYTES_SWAP32(hash[6]); - b[15] = BYTES_SWAP32(hash[7]); + casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) ); + casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) ); return; } diff --git a/algo/luffa/sse2/luffa_for_sse2.h b/algo/luffa/sse2/luffa_for_sse2.h index 1c9b618..6bd15a1 100644 --- a/algo/luffa/sse2/luffa_for_sse2.h +++ b/algo/luffa/sse2/luffa_for_sse2.h @@ -51,6 +51,7 @@ typedef struct { // uint64 bitlen[2]; /* Message length in bits */ // uint32 rembitlen; /* Length of buffer data to be hashed */ int hashbitlen; + int rembytes; } hashState_luffa; HashReturn init_luffa( hashState_luffa *state, int hashbitlen ); diff --git a/algo/lyra2/lyra2.c b/algo/lyra2/lyra2.c index f987873..0d13d95 100644 --- a/algo/lyra2/lyra2.c +++ b/algo/lyra2/lyra2.c @@ -67,7 +67,7 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, //Tries to allocate enough space for the whole memory matrix const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; +// const int64_t ROW_LEN_BYTES = 
ROW_LEN_INT64 * 8; // for Lyra2REv2, nCols = 4, v1 was using 8 const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; @@ -228,7 +228,7 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd, //Tries to allocate enough space for the whole memory matrix const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; - const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; +// const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; //==== Getting the password + salt + basil padded with 10*1 ============// //OBS.:The memory matrix will temporarily hold the password: not for saving memory, diff --git a/algo/lyra2/lyra2re.c b/algo/lyra2/lyra2re.c index 7feffd5..b2aab8e 100644 --- a/algo/lyra2/lyra2re.c +++ b/algo/lyra2/lyra2re.c @@ -53,7 +53,7 @@ void lyra2re_hash(void *state, const void *input) memcpy(&ctx, &lyra2re_ctx, sizeof(lyra2re_ctx)); // uint32_t _ALIGN(128) hashA[8], hashB[8]; - uint32_t _ALIGN(128) hash[32]; + uint8_t _ALIGN(128) hash[32*8]; #define hashA hash #define hashB hash+16 @@ -61,7 +61,7 @@ void lyra2re_hash(void *state, const void *input) const int tail = 80 - midlen; // 16 memcpy( &ctx.blake, &lyra2_blake_mid, sizeof lyra2_blake_mid ); - sph_blake256( &ctx.blake, input + 64, 16 ); + sph_blake256( &ctx.blake, input + midlen, tail ); // sph_blake256(&ctx.blake, input, 80); sph_blake256_close(&ctx.blake, hashA); diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c new file mode 100644 index 0000000..e932af9 --- /dev/null +++ b/algo/qubit/deep.c @@ -0,0 +1,162 @@ +#include "miner.h" +#include "algo-gate-api.h" + +#include +#include +#include +#include + +#include "algo/luffa/sph_luffa.h" +#include "algo/cubehash/sph_cubehash.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/simd/sph_simd.h" +#include "algo/echo/sph_echo.h" + +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/simd/sse2/nist.h" +#include "algo/shavite/sph_shavite.h" + +#ifndef NO_AES_NI +#include "algo/echo/aes_ni/hash_api.h" +#endif + +typedef struct +{ + hashState_luffa luffa; + cubehashParam cubehash; + sph_shavite512_context shavite; + hashState_sd simd; +#ifdef NO_AES_NI + sph_echo512_context echo; +#else + hashState_echo echo; +#endif +} deep_ctx_holder; + +deep_ctx_holder deep_ctx; +static __thread hashState_luffa deep_luffa_mid; + +void init_deep_ctx() +{ + init_luffa( &deep_ctx.luffa, 512 ); + cubehashInit( &deep_ctx.cubehash, 512, 16, 32 ); +#ifdef NO_AES_NI + sph_echo512_init( &deep_ctx.echo ); +#else + init_echo( &deep_ctx.echo, 512 ); +#endif +}; + +void deep_luffa_midstate( const void* input ) +{ + memcpy( &deep_luffa_mid, &deep_ctx.luffa, sizeof deep_luffa_mid ); + update_luffa( &deep_luffa_mid, input, 64 ); +} + +void deep_hash(void *output, const void *input) +{ + unsigned char hash[128] __attribute((aligned(64))); + #define hashB hash+64 + + deep_ctx_holder ctx; + memcpy( &ctx, &deep_ctx, sizeof(deep_ctx) ); + + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + memcpy( &ctx.luffa, &deep_luffa_mid, sizeof deep_luffa_mid ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)input + midlen, tail ); + + cubehashUpdateDigest( &ctx.cubehash, (byte*)hash, + (const byte*) hash,64); + +#ifdef NO_AES_NI + sph_echo512 (&ctx.echo, (const void*) hash, 64); + sph_echo512_close(&ctx.echo, (void*) hash); +#else + update_final_echo ( &ctx.echo, (BitSequence *) hash, + (const BitSequence *) hash, 512); +#endif + + asm volatile ("emms"); + memcpy(output, 
hash, 32); +} + +int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done) +{ + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t hash64[8] __attribute__((aligned(32))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + + uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = + { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0 }; + + // we need bigendian data... + swab32_array( endiandata, pdata, 20 ); + + deep_luffa_midstate( endiandata ); + +#ifdef DEBUG_ALGO + printf("[%d] Htarg=%X\n", thr_id, Htarg); +#endif + for ( int m=0; m < 6; m++ ) + { + if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + pdata[19] = ++n; + be32enc( &endiandata[19], n ); + deep_hash( hash64, endiandata ); +#ifndef DEBUG_ALGO + if (!(hash64[7] & mask)) + { + if ( fulltest(hash64, ptarget) ) + { + *hashes_done = n - first_nonce + 1; + return true; + } +// else +// { +// applog(LOG_INFO, "Result does not validate on CPU!"); +// } + } +#else + if (!(n % 0x1000) && !thr_id) printf("."); + if (!(hash64[7] & mask)) { + printf("[%d]",thr_id); + if (fulltest(hash64, ptarget)) { + *hashes_done = n - first_nonce + 1; + return true; + } + } +#endif + } while ( n < max_nonce && !work_restart[thr_id].restart ); + // see blake.c if else to understand the loop on htmax => mask + break; + } + } + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +bool register_deep_algo( algo_gate_t* gate ) +{ + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + init_deep_ctx(); + gate->scanhash = (void*)&scanhash_deep; + gate->hash = (void*)&deep_hash; +// gate->hash_alt = (void*)&qubithash_alt; + return true; +}; + diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c index f66320e..f7b8a30 100644 --- a/algo/qubit/qubit.c +++ b/algo/qubit/qubit.c @@ -35,6 +35,7 @@ typedef struct } qubit_ctx_holder; qubit_ctx_holder qubit_ctx; +static __thread hashState_luffa qubit_luffa_mid; void init_qubit_ctx() { @@ -49,16 +50,25 @@ void init_qubit_ctx() #endif }; +void qubit_luffa_midstate( const void* input ) +{ + memcpy( &qubit_luffa_mid, &qubit_ctx.luffa, sizeof qubit_luffa_mid ); + update_luffa( &qubit_luffa_mid, input, 64 ); +} + void qubithash(void *output, const void *input) { - unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; + unsigned char hash[128] __attribute((aligned(64))); #define hashB hash+64 qubit_ctx_holder ctx; memcpy( &ctx, &qubit_ctx, sizeof(qubit_ctx) ); - update_luffa( &ctx.luffa, (const BitSequence*)input, 80 ); - final_luffa( &ctx.luffa, (BitSequence*)hash); + const int midlen = 64; // bytes + const int tail = 80 - midlen; // 16 + memcpy( &ctx.luffa, &qubit_luffa_mid, sizeof qubit_luffa_mid ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash, + (const BitSequence*)input + midlen, tail ); cubehashUpdate( &ctx.cubehash, (const byte*) hash,64); cubehashDigest( &ctx.cubehash, (byte*)hash); @@ -114,7 +124,6 @@ void qubithash_alt(void *output, const void *input) memcpy(output, hash, 32); } - int scanhash_qubit(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) { @@ -133,6 +142,8 @@ int scanhash_qubit(int thr_id, struct work *work, // we need bigendian data... 
swab32_array( endiandata, pdata, 20 ); + qubit_luffa_midstate( endiandata ); + #ifdef DEBUG_ALGO printf("[%d] Htarg=%X\n", thr_id, Htarg); #endif diff --git a/algo/simd/sse2/nist.c b/algo/simd/sse2/nist.c index b75776c..73b5131 100644 --- a/algo/simd/sse2/nist.c +++ b/algo/simd/sse2/nist.c @@ -294,7 +294,7 @@ HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, { memcpy( state->buffer+current/8, data, (databitlen+7)/8 ); IncreaseCounter( state, databitlen ); - return SUCCESS; + break; } else { @@ -341,13 +341,13 @@ HashReturn update_final_sd( hashState_sd *state, BitSequence *hashval, for ( i=0; i < 2*state->n_feistels; i++ ) { u32 x = state->A[i]; - out[4*i ] = x&0xff; + out[4*i ] = x & 0xff; x >>= 8; - out[4*i+1] = x&0xff; + out[4*i+1] = x & 0xff; x >>= 8; - out[4*i+2] = x&0xff; + out[4*i+2] = x & 0xff; x >>= 8; - out[4*i+3] = x&0xff; + out[4*i+3] = x & 0xff; } memcpy( hashval, out, state->hashbitlen / 8 ); diff --git a/algo/timetravel.c b/algo/timetravel.c index 442bd70..d948f21 100644 --- a/algo/timetravel.c +++ b/algo/timetravel.c @@ -147,17 +147,17 @@ void timetravel_hash(void *output, const void *input) switch ( permutation[i] ) { case 0: -// if ( i == 0 ) -// { -// memcpy( &ctx.blake, &tt_mid.blake, sizeof tt_mid.blake ); -// sph_blake256( &ctx.blake, input + midlen, tail ); -// sph_blake256_close( &ctx.blake, hashB ); -// } -// else -// { + if ( i == 0 ) + { + memcpy( &ctx.blake, &tt_mid.blake, sizeof tt_mid.blake ); + sph_blake512( &ctx.blake, input + midlen, tail ); + sph_blake512_close( &ctx.blake, hashB ); + } + else + { sph_blake512( &ctx.blake, hashA, dataLen ); sph_blake512_close( &ctx.blake, hashB ); -// } + } break; case 1: if ( i == 0 ) @@ -239,28 +239,29 @@ void timetravel_hash(void *output, const void *input) } break; case 6: -// if ( i == 0 ) -// { -// memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); -// update_and_final_luffa( &ctx.luffa, hashB, -// input + 64, 16 ); -// } -// else -// { + if ( i == 0 ) + { + memcpy( &ctx.luffa, &tt_mid.luffa, sizeof tt_mid.luffa ); + update_and_final_luffa( &ctx.luffa, hashB, + input + 64, 16 ); + } + else + { update_and_final_luffa( &ctx.luffa, (BitSequence*)hashB, - hashA, dataLen ); -// } + (const BitSequence *)hashA, dataLen ); + } break; case 7: if ( i == 0 ) { memcpy( &ctx.cube, &tt_mid.cube, sizeof tt_mid.cube ); - cubehashUpdateDigest( &ctx.cube, hashB, - input + midlen, tail ); + cubehashUpdateDigest( &ctx.cube, (byte*)hashB, + (const byte*)input + midlen, tail ); } else { - cubehashUpdateDigest( &ctx.cube, hashB, hashA, dataLen ); + cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)hashA, + dataLen ); } break; default: @@ -306,8 +307,8 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce, switch ( permutation[0] ) { case 0: -// memcpy( &tt_mid.blake, &tt_ctx.blake, sizeof(tt_mid.blake) ); -// sph_blake256( &tt_mid.blake, endiandata, 64 ); + memcpy( &tt_mid.blake, &tt_ctx.blake, sizeof(tt_mid.blake) ); + sph_blake512( &tt_mid.blake, endiandata, 64 ); break; case 1: memcpy( &tt_mid.bmw, &tt_ctx.bmw, sizeof(tt_mid.bmw) ); @@ -335,13 +336,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce, sph_keccak512( &tt_mid.keccak, endiandata, 64 ); break; case 6: -// init_luffa( &tt_mid.luffa, 512 ); -// memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) ); -// update_luffa( &tt_mid.luffa, endiandata, 64 ); + memcpy( &tt_mid.luffa, &tt_ctx.luffa, sizeof(tt_mid.luffa ) ); + update_luffa( &tt_mid.luffa, endiandata, 64 ); break; case 7: 
memcpy( &tt_mid.cube, &tt_ctx.cube, sizeof(tt_mid.cube ) ); - cubehashUpdate( &tt_mid.cube, endiandata, 64 ); + cubehashUpdate( &tt_mid.cube, (const byte*)endiandata, 64 ); break; default: break; @@ -373,23 +373,6 @@ void timetravel_set_target( struct work* work, double job_diff ) work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); } -// set_data_endian is a reasonable gate to use, it's called upon receipt -// of new work (new ntime) and has the right arg to access it. -void timetravel_calc_perm( struct work *work ) -{ - // We want to permute algorithms. To get started we - // initialize an array with a sorted sequence of unique - // integers where every integer represents its own algorithm. - int ntime, steps, i; - be32enc( &ntime, work->data[ STD_NTIME_INDEX ] ); - steps = ( ntime - HASH_FUNC_BASE_TIMESTAMP ) - % HASH_FUNC_COUNT_PERMUTATIONS; - for ( i = 0; i < HASH_FUNC_COUNT; i++ ) - permutation[i] = i; - for ( i = 0; i < steps; i++ ) - next_permutation( permutation, permutation + HASH_FUNC_COUNT ); -} - bool register_timetravel_algo( algo_gate_t* gate ) { gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; @@ -398,7 +381,6 @@ bool register_timetravel_algo( algo_gate_t* gate ) gate->hash = (void*)&timetravel_hash; gate->set_target = (void*)&timetravel_set_target; gate->get_max64 = (void*)&get_max64_0xffffLL; -// gate->set_work_data_endian = (void*)&timetravel_calc_perm; return true; }; diff --git a/avxdefs.h b/avxdefs.h index a54e87f..3be6771 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -1,4 +1,5 @@ -// Some tools to help using AVX and AVX2 +// Some tools to help using AVX and AVX2 +// AVX support is required to include this header file, AVX2 optional. #include #include @@ -15,7 +16,7 @@ typedef union { -#if defined __AVX2__ +#if defined (__AVX2__) __m256i v256; #endif __m128i v128[ 2]; @@ -36,7 +37,8 @@ uint8_t v8 [16]; #if defined (__AVX2__) -// Replacements for vectorized data +// AVX2 replacements for vectorized data + // n = number of __m256i (32 bytes) inline void memset_zero_m256i( __m256i *dst, int n ) { @@ -48,68 +50,65 @@ inline void memset_m256i( __m256i *dst, const __m256i a, int n ) { for ( int i = 0; i < n; i++ ) dst[i] = a; } - -// optimized copying, first fit is usually best. If none of these works there -// are __m128i versions or plain memcpy. +// Optimized copying using vectors. For misaligned data or more granularity +// use __m128i versions or plain memcpy as appropriate. -// Fixed size +// Copying fixed size -// multi buffered copy for 64 bytes, the size of a cache line. -// minimum alignment is 32 bytes, optimum for cache is 64. +// Multi buffered copy using __m256i. +// minimum alignment is 32 bytes (__m256i), optimum 64 (cache line). 
// src & dst are __m256i* -inline void mcpy64_m256i( __m256i* dst, const __m256i* src ) + +// Copy 64 bytes (2x__m256i, one cache line), double buffered +inline void mcpy64_m256i( __m256i* dest, const __m256i* srce ) { - const __m256i* dest = dst; - const __m256i* srce = src; - __m256i a = _mm256_load_si256( srce ); + __m256i a = _mm256_load_si256( srce ); __m256i b = _mm256_load_si256( srce + 1 ); - _mm256_store_si256( dest, a ); - _mm256_store_si256( dest + 1, b ); + _mm256_store_si256( dest, a ); + _mm256_store_si256( dest + 1, b ); } -inline void mcpy96_m256i( __m256i* dst, const __m256i* src ) +// Copy 96 bytes (3x__m256i), triple buffered +inline void mcpy96_m256i( __m256i* dest, const __m256i* srce ) { - const __m256i* dest = dst; - const __m256i* srce = src; - __m256i a = _mm256_load_si256( srce ); + __m256i a = _mm256_load_si256( srce ); __m256i b = _mm256_load_si256( srce + 1 ); - _mm256_store_si256( dest, a ); __m256i c = _mm256_load_si256( srce + 2 ); - _mm256_store_si256( dest + 1, b ); - _mm256_store_si256( dest + 2, c ); + _mm256_store_si256( dest, a ); + _mm256_store_si256( dest + 1, b ); + _mm256_store_si256( dest + 2, c ); } -inline void mcpy128_m256i( __m256i* dst, const __m256i* src ) +// Copy 128 bytes (4x__m256i), quad buffered +inline void mcpy128_m256i( __m256i* dest, const __m256i* srce ) { - const __m256i* dest = dst; - const __m256i* srce = src; __m256i a = _mm256_load_si256( srce ); __m256i b = _mm256_load_si256( srce + 1 ); __m256i c = _mm256_load_si256( srce + 2 ); - _mm256_store_si256( dest , a ); __m256i d = _mm256_load_si256( srce + 3 ); - _mm256_store_si256( dest + 1, b ); - a = _mm256_load_si256( srce + 4 ); - _mm256_store_si256( dest + 2, c ); - b = _mm256_load_si256( srce + 5 ); - _mm256_store_si256( dest + 3, d ); - c = _mm256_load_si256( srce + 6 ); - _mm256_store_si256( dest + 4, a ); - d = _mm256_load_si256( srce + 7 ); - _mm256_store_si256( dest + 5, b ); - _mm256_store_si256( dest + 6, c ); - _mm256_store_si256( dest + 7, d ); + _mm256_store_si256( dest , a ); + a = _mm256_load_si256( srce + 4 ); + _mm256_store_si256( dest + 1, b ); + b = _mm256_load_si256( srce + 5 ); + _mm256_store_si256( dest + 2, c ); + c = _mm256_load_si256( srce + 6 ); + _mm256_store_si256( dest + 3, d ); + d = _mm256_load_si256( srce + 7 ); + _mm256_store_si256( dest + 4, a ); + _mm256_store_si256( dest + 5, b ); + _mm256_store_si256( dest + 6, c ); + _mm256_store_si256( dest + 7, d ); } -// Variable size - +// Copy variable size +// // copy multiples of 64 bytes using quad buffering with interleave // of first read of next line with last write of current line. // n is a multiple of 32 bytes (_m256i size) // minimum alignment: 32 bytes // optimum alignment: 64 bytes (cache line size) // minimum size.....: 128 bytes (4*n) -// recommended size.: 256+ bytes +// recommended size.: 256+ bytes (8*n) // minimum increment: 128 bytes // Only the first load or store in a cache line triggers a memory access. 
// the subsequent actions are trivial because they benefit from data @@ -120,17 +119,16 @@ inline void mcpy128_m256i( __m256i* dst, const __m256i* src ) inline void mcpy_m256i_x4( __m256i *dst, const __m256i *src, const int n ) { - const __m256i* dest = dst; - const __m256i* srce = src; + __m256i* end = dst + n; // preload 1 cache line to absorb startup latency - __m256i a = _mm256_load_si256( srce ); - __m256i b = _mm256_load_si256( srce + 1 ); - // start loading second line, queue while waiting - __m256i c = _mm256_load_si256( srce + 2 ); + __m256i a = _mm256_load_si256( src ); + __m256i b = _mm256_load_si256( src + 1 ); + // start loading second line, queued while waiting for 1st line. + __m256i c = _mm256_load_si256( src + 2 ); // start writing first line, as soon as data available, // second line read will have priority on the bus - _mm256_store_si256( dest, a ); + _mm256_store_si256( dst, a ); __m256i d; int i; @@ -139,39 +137,37 @@ inline void mcpy_m256i_x4( __m256i *dst, const __m256i *src, const int n ) for ( i = 0; i < loops; i++ ) { const int i4 = i*4; - const __m256i* si4 = (__m256i*)(srce + i4); - const __m256i* di4 = (__m256i*)(dest + i4); + const __m256i* si4 = src + i4; + __m256i* di4 = dst + i4; d = _mm256_load_si256( si4 + 3 ); - _mm256_store_si256( di4 + 1, b ); + _mm256_store_si256( di4 + 1, b ); // start loading next line a = _mm256_load_si256( si4 + 4 ); - _mm256_store_si256( di4 + 2, c ); + _mm256_store_si256( di4 + 2, c ); b = _mm256_load_si256( si4 + 5 ); - _mm256_store_si256( di4 + 3, d ); + _mm256_store_si256( di4 + 3, d ); c = _mm256_load_si256( si4 + 6 ); // start writing next line - _mm256_store_si256( di4 + 4, a ); + _mm256_store_si256( di4 + 4, a ); } // finish last line - d = _mm256_load_si256( srce + n - 4 ); - _mm256_store_si256( dest + n - 3, b ); - _mm256_store_si256( dest + n - 2, c ); - _mm256_store_si256( dest + n - 1, d ); + d = _mm256_load_si256( end - 4 ); + _mm256_store_si256( end - 3, b ); + _mm256_store_si256( end - 2, c ); + _mm256_store_si256( end - 1, d ); } -// basic __m256i memcpy - +// basic aligned __m256i memcpy inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } - // For cheating with pointer types // p = any aligned pointer -// returns p as pointer to vector type +// returns p as pointer to vector type, not very useful #define castp_m256i(p) ((__m256i*)(p)) #define castp_m128i(p) ((__m128i*)(p)) @@ -198,7 +194,7 @@ inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) //__m256i _mm256_inserti128_si256(__m256i a, __m128i b, const int mask); // Rotate bits in 4 uint64 (3 instructions) -// __m256i mm256_rotr_64( __256i, int ) +// w = packed 64 bit data, n= number of bits to rotate #define mm256_rotr_64( w, c ) \ _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64 - c) ) @@ -219,6 +215,7 @@ inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) _mm256_permute4x64_epi64( w, 0x93 ) // shift 256 bits by n*64 bits (4 uint64 by n uint64) +// mm256_slli256_nx64( w ) #define mm256_slli256_1x64( w ) \ _mm256_and_si256( mm256_rotl256_1x64( w ), \ _mm256_set_epi64x( 0, \ @@ -231,6 +228,7 @@ inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) 0 ) ) */ +// these ones probably are backward #define mm256_slli256_2x64( w ) \ _mm256_and_si256( mm256_swap128( w ), \ _mm256_set_epi64x( 0xffffffffffffffffull, \ @@ -271,9 +269,26 @@ inline void memcpy_m256i( __m256i *dst, const __m256i *src, int n ) 0xffffffffffffffffull ) ) */ +// 
vectored version of BYTES_SWAP32 +inline __m256i mm256_byteswap_epi32( __m256i x ) +{ + __m256i x1 = _mm256_and_si256( x, + _mm256_set_epi32( 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00, + 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 ) ); + __m256i x2 = _mm256_and_si256( x, + _mm256_set_epi32( 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000, + 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000 ) ); + __m256i x0 = _mm256_slli_epi32( x, 24 ); // x0 = x << 24 + x1 = _mm256_slli_epi32( x1, 8 ); // x1 = mask(x) << 8 + x2 = _mm256_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8 + __m256i x3 = _mm256_srli_epi32( x, 24 ); // x3 = x >> 24 + return _mm256_or_si256( _mm256_or_si256( x0, x1 ), + _mm256_or_si256( x2, x3 ) ); +} + #endif // AVX2 -// Replacements for vectorized data +// AVX replacements for vectorized data inline void memset_zero_m128i( __m128i *dst, int n ) { @@ -287,52 +302,80 @@ inline void memset_m128i( __m128i *dst, const __m128i a, int n ) // __m128i versions of optimized copying -inline void mcpy32_m128i( __m128i* dst, const __m128i* src ) +// Copy 32 bytes (2x__m128i), double buffered +inline void mcpy32_m128i( __m128i* dest, const __m128i* srce ) { - const __m128i* dest = dst; - const __m128i* srce = src; // 4 loads fills cache line __m128i a = _mm_load_si128( srce ); __m128i b = _mm_load_si128( srce + 1 ); - _mm_store_si128( dest, a ); - _mm_store_si128( dest + 1, b ); + _mm_store_si128( dest, a ); + _mm_store_si128( dest + 1, b ); } -inline void mcpy64_m128i( __m128i* dst, const __m128i* src ) +// Copy 64 Bytes (4x__m128i), quad buffered +inline void mcpy64_m128i( __m128i* dest, const __m128i* srce ) { - const __m128i* dest = dst; - const __m128i* srce = src; // 4 loads fills cache line __m128i a = _mm_load_si128( srce ); __m128i b = _mm_load_si128( srce + 1 ); __m128i c = _mm_load_si128( srce + 2 ); __m128i d = _mm_load_si128( srce + 3 ); // need to store a before overwriting it - _mm_store_si128( dest, a ); - a = _mm_load_si128( srce + 4 ); - _mm_store_si128( dest + 1, b ); - b = _mm_load_si128( srce + 5 ); - _mm_store_si128( dest + 2, c ); - c = _mm_load_si128( srce + 6 ); - _mm_store_si128( dest + 3, d ); - d = _mm_load_si128( srce + 7 ); - _mm_store_si128( dest + 4, a ); - d = _mm_load_si128( srce + 7 ); - _mm_store_si128( dest + 5, b ); - _mm_store_si128( dest + 6, c ); - _mm_store_si128( dest + 7, d ); + _mm_store_si128( dest, a ); + a = _mm_load_si128( srce + 4 ); + _mm_store_si128( dest + 1, b ); + b = _mm_load_si128( srce + 5 ); + _mm_store_si128( dest + 2, c ); + c = _mm_load_si128( srce + 6 ); + _mm_store_si128( dest + 3, d ); + d = _mm_load_si128( srce + 7 ); + _mm_store_si128( dest + 4, a ); + _mm_store_si128( dest + 5, b ); + _mm_store_si128( dest + 6, c ); + _mm_store_si128( dest + 7, d ); +} + +// Copy 96 Bytes (6x__m128i), quad buffered +inline void mcpy96_m128i( __m128i* dest, const __m128i* srce ) +{ + // 4 loads fills cache line + __m128i a = _mm_load_si128( srce ); + __m128i b = _mm_load_si128( srce + 1 ); + __m128i c = _mm_load_si128( srce + 2 ); + __m128i d = _mm_load_si128( srce + 3 ); + // need to store a before overwriting it + _mm_store_si128( dest, a ); + a = _mm_load_si128( srce + 4 ); + _mm_store_si128( dest + 1, b ); + b = _mm_load_si128( srce + 5 ); + _mm_store_si128( dest + 2, c ); + c = _mm_load_si128( srce + 6 ); + _mm_store_si128( dest + 3, d ); + d = _mm_load_si128( srce + 7 ); + _mm_store_si128( dest + 4, a ); + a = _mm_load_si128( srce + 8 ); + _mm_store_si128( dest + 5, b ); + b = _mm_load_si128( srce + 9 ); + _mm_store_si128( dest + 6, c ); 
+ c = _mm_load_si128( srce + 10 ); + _mm_store_si128( dest + 7, d ); + d = _mm_load_si128( srce + 11 ); + _mm_store_si128( dest + 8, a ); + _mm_store_si128( dest + 9, b ); + _mm_store_si128( dest + 10, c ); + _mm_store_si128( dest + 11, d ); +} // Variable length - -// copy multiples of 16 bytes using quad buffering. +// +// Copy multiples of 16 bytes (__m128i) using quad buffering. // n is a multiple of 16 bytes (__m128i size) // minimum alignment: 16 bytes // optimum alignment: 64 bytes (cache line size) -// minimum size.....: 32 bytes (4*n) -// recommended size.: 96+ bytes -// minimum increment: 32 bytes -inline void memcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n ) +// minimum size.....: 64 bytes (4*n) +// recommended size.: 128+ bytes (8*n) +// minimum increment: 64 bytes +inline void mcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n ) { // preload 1 cache line to absorb startup latency __m128i a = _mm_load_si128( src ); @@ -342,36 +385,92 @@ inline void memcpy_m128i_x4( __m128i *dst, const __m128i *src, const int n ) int i; const int loops = n/4 - 1; - const __m128i* dst_n = (__m128i*)(dst + n); + __m128i* end = dst + n; for ( i = 0; i < loops; i++ ) { const int i4 = i*4; - const __m128i* si4 = (__m128i*)(src + i4); - const __m128i* di4 = (__m128i*)(dst + i4); + const __m128i* si4 = src + i4; + __m128i* di4 = dst + i4; // need to free a before overwriting it - _mm_store_si128( di4, a ); - a = _mm_load_si128( di4 + 4 ); - _mm_store_si128( di4 + 1, b ); - b = _mm_load_si128( di4 + 5 ); - _mm_store_si128( di4 + 2, c ); - c = _mm_load_si128( di4 + 6 ); - _mm_store_si128( di4 + 3, d ); - d = _mm_load_si128( di4 + 7 ); + _mm_store_si128( di4, a ); + a = _mm_load_si128( si4 + 4 ); + _mm_store_si128( di4 + 1, b ); + b = _mm_load_si128( si4 + 5 ); + _mm_store_si128( di4 + 2, c ); + c = _mm_load_si128( si4 + 6 ); + _mm_store_si128( di4 + 3, d ); + d = _mm_load_si128( si4 + 7 ); } - _mm_store_si128( dst_n - 4, a ); - _mm_store_si128( dst_n - 3, b ); - _mm_store_si128( dst_n - 2, c ); - _mm_store_si128( dst_n - 1, d ); + _mm_store_si128( end - 4, a ); + _mm_store_si128( end - 3, b ); + _mm_store_si128( end - 2, c ); + _mm_store_si128( end - 1, d ); } -// basic __m128i copy +// basic aligned __m128i copy inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n ) { for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } +inline void memcpy_64( uint64_t* dst, const uint64_t* src, int n ) +{ + for ( int i = 0; i < n; i++ ) + dst[i] = src[i]; +} + +// Smart generic mem copy optimized for copying large data, n = bytes. +// Most efficient with 256 bit aligned data and size a multiple of 4*256, +// but flexible enough to handle any alignment, any size with performance +// considerations. For common fixed sizes use the appropriate functions above. 
+inline void mcpy( void* dst, const void* src, int n ) +{ +// enforce alignment and minimum size for quad buffered vector copy +#if defined (__AVX2__) + // Try 256 bit copy + if ( ( (uint64_t)dst % 32 == 0 ) && ( (const uint64_t)src % 32 == 0 ) ) + { + if ( n % 128 == 0 ) + { + mcpy_m256i_x4( (__m256i*)dst, (const __m256i*)src, n/32 ); + return; + } + else + { + memcpy_m256i( (__m256i*)dst, (const __m256i*)src, n/32 ); + return; + } + } + else +#endif + // Try 128 bit copy + if ( ( (uint64_t)dst % 16 == 0 ) && ( (const uint64_t)src % 16 == 0 ) ) + { + if ( n % 64 == 0 ) + { + mcpy_m128i_x4( (__m128i*)dst, (const __m128i*)src, n/16 ); + return; + } + else + { + memcpy_m128i( (__m128i*)dst, (const __m128i*)src, n/16 ); + return; + } + } + // Try 64 bit copy + else if ( ( (uint64_t)dst % 8 == 0 ) && ( (const uint64_t)src % 8 == 0 ) + && ( n % 8 == 0 ) ) + { + memcpy_64( (uint64_t*)dst, (const uint64_t*)src, n/8 ); + return; + } + // slow copy + memcpy( dst, src, n ); +} + + // For cheating with pointer types // p = any aligned pointer @@ -408,14 +507,16 @@ inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n ) // mm256_rotl256_1x64 when avx2 is not available or data is already in __m128i // format. uses one local //void mm128_rotl256_1x64( __m128i, __m128i ) -#define mm128_rotl256_1x64(s0, s1) do { \ +#define mm128_rotl256_1x64(s0,s1) do { \ __m128i t; \ - s0 = mm128_swap64( s0); \ - s1 = mm128_swap64( s1); \ - t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ - s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ + s0 = mm128_swap64(s0); \ + s1 = mm128_swap64(s1); \ + t = _mm_or_si128( \ + _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ + s1 = _mm_or_si128( \ + _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ s0 = t; \ } while(0) @@ -423,10 +524,26 @@ inline void memcpy_m128i( __m128i *dst, const __m128i *src, int n ) __m128i t; \ s0 = mm128_swap64( s0); \ s1 = mm128_swap64( s1); \ - t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ - s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ - _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ + t = _mm_or_si128( \ + _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \ + s1 = _mm_or_si128( \ + _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \ + _mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \ s0 = t; \ } while(0) +// vectored version of BYTES_SWAP32 +inline __m128i mm_byteswap_epi32( __m128i x ) +{ + __m128i x1 = _mm_and_si128( x, _mm_set_epi32( 0x0000ff00, 0x0000ff00, + 0x0000ff00, 0x0000ff00 ) ); + __m128i x2 = _mm_and_si128( x, _mm_set_epi32( 0x00ff0000, 0x00ff0000, + 0x00ff0000, 0x00ff0000 ) ); + __m128i x0 = _mm_slli_epi32( x, 24 ); // x0 = x << 24 + x1 = _mm_slli_epi32( x1, 8 ); // x1 = mask(x) << 8 + x2 = _mm_srli_epi32( x2, 8 ); // x2 = mask(x) >> 8 + __m128i x3 = _mm_srli_epi32( x, 24 ); // x3 = x >> 24 + return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 
) ); +} + diff --git a/configure.ac b/configure.ac index 4b0df71..02b0004 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.5.8]) +AC_INIT([cpuminer-opt], [3.5.9]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index e1276a6..699c3be 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -227,7 +227,7 @@ static void affine_to_cpu_mask(int id, unsigned long mask) { } // not very useful, just index the arrray directly. // but declaring this fuinction in miner.h eliminates // an annoying compiler warning for not using a static. -char* algo_name( enum algos a ) {return algo_names[a];} +const char* algo_name( enum algos a ) {return algo_names[a];} void get_currentalgo(char* buf, int sz) { @@ -1432,7 +1432,7 @@ int64_t get_max64_0x3fffffLL() { return 0x3fffffLL; } int64_t get_max64_0x1ffff() { return 0x1ffff; } int64_t get_max64_0xffffLL() { return 0xffffLL; }; - +// default void sha256d_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx ) { sha256d(merkle_root, sctx->job.coinbase, (int) sctx->job.coinbase_size); @@ -1470,14 +1470,6 @@ void swab_work_data( struct work *work ) work->data[i] = swab32( work->data[i] ); } -void std_build_extraheader( struct work* work, struct stratum_ctx* sctx ) -{ - work->data[ algo_gate.ntime_index ] = le32dec(sctx->job.ntime); - work->data[ algo_gate.nbits_index ] = le32dec(sctx->job.nbits); - work->data[20] = 0x80000000; - work->data[31] = 0x00000280; -} - double std_calc_network_diff( struct work* work ) { // sample for diff 43.281 : 1c05ea29 @@ -1551,6 +1543,17 @@ void jr2_get_new_work( struct work* work, struct work* g_work, int thr_id, ++(*nonceptr); } +bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum, + int thr_id ) +{ + if ( have_stratum && !work->data[0] && !opt_benchmark ) + { + sleep(1); + return false; + } + return true; +} + static void *miner_thread( void *userdata ) { struct thr_info *mythr = (struct thr_info *) userdata; @@ -1676,8 +1679,9 @@ static void *miner_thread( void *userdata ) } // do_this_thread algo_gate.resync_threads( &work ); - // prevent dupes is called on every loop and has useful args so it - // is being used by zcoin to pass along the work height. 
+ if ( !algo_gate.ready_to_mine( &work, &stratum, thr_id ) ) + continue; +/* if ( algo_gate.prevent_dupes( &work, &stratum, thr_id ) ) continue; // prevent scans before a job is received @@ -1686,6 +1690,7 @@ static void *miner_thread( void *userdata ) sleep(1); continue; } +*/ // conditional mining if (!wanna_mine(thr_id)) { @@ -2068,23 +2073,16 @@ out: return ret; } -void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) +void std_build_extraheader( struct work* g_work, struct stratum_ctx* sctx ) { - unsigned char merkle_root[64] = { 0 }; - int i; + uchar merkle_root[64] = { 0 }; size_t t; - - pthread_mutex_lock( &sctx->work_lock ); - free( g_work->job_id ); - g_work->job_id = strdup( sctx->job.job_id ); - g_work->xnonce2_len = sctx->xnonce2_size; - g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size ); - memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size ); + int i; algo_gate.gen_merkle_root( merkle_root, sctx ); - /* Increment extranonce2 */ + // Increment extranonce2 for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ ); - /* Assemble block header */ + // Assemble block header memset( g_work->data, 0, sizeof(g_work->data) ); g_work->data[0] = le32dec( sctx->job.version ); for ( i = 0; i < 8; i++ ) @@ -2092,7 +2090,23 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) for ( i = 0; i < 8; i++ ) g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i ); + g_work->data[ algo_gate.ntime_index ] = le32dec(sctx->job.ntime); + g_work->data[ algo_gate.nbits_index ] = le32dec(sctx->job.nbits); + g_work->data[20] = 0x80000000; + g_work->data[31] = 0x00000280; +} + +void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) +{ + pthread_mutex_lock( &sctx->work_lock ); + free( g_work->job_id ); + g_work->job_id = strdup( sctx->job.job_id ); + g_work->xnonce2_len = sctx->xnonce2_size; + g_work->xnonce2 = (uchar*) realloc( g_work->xnonce2, sctx->xnonce2_size ); + memcpy( g_work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size ); + algo_gate.build_extraheader( g_work, sctx ); + net_diff = algo_gate.calc_network_diff( g_work ); algo_gate.set_work_data_endian( g_work ); pthread_mutex_unlock( &sctx->work_lock ); @@ -2105,7 +2119,7 @@ void std_stratum_gen_work( struct stratum_ctx *sctx, struct work *g_work ) g_work->job_id, xnonce2str, swab32( g_work->data[17] ) ); free( xnonce2str ); } - /* set target */ + algo_gate.set_target( g_work, sctx->job.diff ); if ( stratum_diff != sctx->job.diff ) @@ -2144,98 +2158,100 @@ static void *stratum_thread(void *userdata ) { int failures = 0; - if (stratum_need_reset) { + if ( stratum_need_reset ) + { stratum_need_reset = false; - stratum_disconnect(&stratum); - if (strcmp(stratum.url, rpc_url)) { - free(stratum.url); - stratum.url = strdup(rpc_url); + stratum_disconnect( &stratum ); + if ( strcmp( stratum.url, rpc_url ) ) + { + free( stratum.url ); + stratum.url = strdup( rpc_url ); applog(LOG_BLUE, "Connection changed to %s", short_url); - } else if (!opt_quiet) { - applog(LOG_DEBUG, "Stratum connection reset"); } + else if ( !opt_quiet ) + applog(LOG_DEBUG, "Stratum connection reset"); } - while ( !stratum.curl ) - { - pthread_mutex_lock(&g_work_lock); - g_work_time = 0; - pthread_mutex_unlock(&g_work_lock); - restart_threads(); - if (!stratum_connect(&stratum, stratum.url) - || !stratum_subscribe(&stratum) - || !stratum_authorize(&stratum, rpc_user, rpc_pass)) - { - stratum_disconnect(&stratum); - if (opt_retries >= 0 && ++failures > opt_retries) 
- { - applog(LOG_ERR, "...terminating workio thread"); - tq_push(thr_info[work_thr_id].q, NULL); - goto out; - } - if (!opt_benchmark) - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } + while ( !stratum.curl ) + { + pthread_mutex_lock( &g_work_lock ); + g_work_time = 0; + pthread_mutex_unlock( &g_work_lock ); + restart_threads(); + if ( !stratum_connect( &stratum, stratum.url ) + || !stratum_subscribe( &stratum ) + || !stratum_authorize( &stratum, rpc_user, rpc_pass ) ) + { + stratum_disconnect( &stratum ); + if (opt_retries >= 0 && ++failures > opt_retries) + { + applog(LOG_ERR, "...terminating workio thread"); + tq_push(thr_info[work_thr_id].q, NULL); + goto out; + } + if (!opt_benchmark) + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } - if (jsonrpc_2) - { - work_free(&g_work); - work_copy(&g_work, &stratum.work); - } - } + if (jsonrpc_2) + { + work_free(&g_work); + work_copy(&g_work, &stratum.work); + } + } - if (stratum.job.job_id && - (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id)) ) - { - pthread_mutex_lock(&g_work_lock); - algo_gate.stratum_gen_work( &stratum, &g_work ); - time(&g_work_time); - pthread_mutex_unlock(&g_work_lock); - restart_threads(); + if ( stratum.job.job_id && + ( !g_work_time || strcmp( stratum.job.job_id, g_work.job_id ) ) ) + { + pthread_mutex_lock(&g_work_lock); + algo_gate.stratum_gen_work( &stratum, &g_work ); + time(&g_work_time); + pthread_mutex_unlock(&g_work_lock); +// restart_threads(); - if (stratum.job.clean || jsonrpc_2) - { - static uint32_t last_bloc_height; - if ( last_bloc_height != stratum.bloc_height ) - { - last_bloc_height = stratum.bloc_height; - if ( !opt_quiet ) - { - if (net_diff > 0.) - applog(LOG_BLUE, "%s block %d, diff %.3f", - algo_names[opt_algo], stratum.bloc_height, net_diff); - else - applog(LOG_BLUE, "%s %s block %d", short_url, - algo_names[opt_algo], stratum.bloc_height); - } - } - restart_threads(); - } - else if (opt_debug && !opt_quiet) - { + if (stratum.job.clean || jsonrpc_2) + { + static uint32_t last_bloc_height; + if ( last_bloc_height != stratum.bloc_height ) + { + last_bloc_height = stratum.bloc_height; + if ( !opt_quiet ) + { + if (net_diff > 0.) 
+ applog(LOG_BLUE, "%s block %d, diff %.3f", + algo_names[opt_algo], stratum.bloc_height, net_diff); + else + applog(LOG_BLUE, "%s %s block %d", short_url, + algo_names[opt_algo], stratum.bloc_height); + } + } + restart_threads(); + } + else if (opt_debug && !opt_quiet) + { applog(LOG_BLUE, "%s asks job %d for block %d", short_url, strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height); - } - } + } + } // stratum.job.job_id - if ( !stratum_socket_full( &stratum, opt_timeout ) ) - { - applog(LOG_ERR, "Stratum connection timeout"); - s = NULL; - } - else - s = stratum_recv_line(&stratum); - if ( !s ) - { - stratum_disconnect(&stratum); - applog(LOG_ERR, "Stratum connection interrupted"); - continue; - } - if (!stratum_handle_method(&stratum, s)) - stratum_handle_response(s); - free(s); - } + if ( !stratum_socket_full( &stratum, opt_timeout ) ) + { + applog(LOG_ERR, "Stratum connection timeout"); + s = NULL; + } + else + s = stratum_recv_line(&stratum); + if ( !s ) + { + stratum_disconnect(&stratum); + applog(LOG_ERR, "Stratum connection interrupted"); + continue; + } + if (!stratum_handle_method(&stratum, s)) + stratum_handle_response(s); + free(s); + } // loop out: return NULL; } diff --git a/miner.h b/miner.h index 6204b8b..7d559e6 100644 --- a/miner.h +++ b/miner.h @@ -47,14 +47,14 @@ # endif #endif -/* + #ifndef min -#define min(a,b) (a>b ? b : a) +#define min(a,b) (a>b ? (b) :(a)) #endif #ifndef max -#define max(a,b) (a @@ -479,12 +479,14 @@ enum algos { ALGO_BASTION, ALGO_BLAKE, ALGO_BLAKECOIN, +// ALGO_BLAKE2B, ALGO_BLAKE2S, ALGO_BMW, ALGO_C11, ALGO_CRYPTOLIGHT, ALGO_CRYPTONIGHT, ALGO_DECRED, + ALGO_DEEP, ALGO_DROP, ALGO_FRESH, ALGO_GROESTL, @@ -537,12 +539,14 @@ static const char* const algo_names[] = { "bastion", "blake", "blakecoin", +// "blake2b", "blake2s", "bmw", "c11", "cryptolight", "cryptonight", "decred", + "deep", "drop", "fresh", "groestl", @@ -589,7 +593,7 @@ static const char* const algo_names[] = { "\0" }; -char* algo_name( enum algos a ); +const char* algo_name( enum algos a ); extern enum algos opt_algo; extern bool opt_debug; @@ -650,12 +654,14 @@ Options:\n\ bastion\n\ blake Blake-256 (SFR)\n\ blakecoin blake256r8\n\ +"/* blake2b Sia\n*/"\ blake2s Blake-2 S\n\ bmw BMW 256\n\ c11 Flax\n\ cryptolight Cryptonight-light\n\ cryptonight cryptonote, Monero (XMR)\n\ decred\n\ + deep Deepcoin (DCN)\n\ drop Dropcoin\n\ fresh Fresh\n\ groestl groestl\n\ diff --git a/util.c b/util.c index 07cab73..d5dbe02 100644 --- a/util.c +++ b/util.c @@ -1797,10 +1797,10 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) sctx->next_diff = diff; pthread_mutex_unlock(&sctx->work_lock); -// /* store for api stats */ -// stratum_diff = diff; -// -// applog(LOG_WARNING, "Stratum difficulty set to %g", diff); + /* store for api stats */ + stratum_diff = diff; + + applog(LOG_WARNING, "Stratum difficulty set to %g", diff); return true; }