v3.9.8.1

v3.9.8
v3.9.7
2025-09-17 23:44:27 +00:00 · 2019-10-01 14:17:36 -04:00 · 2019-09-26 22:37:26 -04:00 · 2019-08-03 10:39:54 -04:00 · 2019-07-30 10:16:43 -04:00 · 2019-07-18 19:46:57 -04:00
108 changed files with 4459 additions and 3199 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,7 +18,6 @@ dist_man_MANS	= cpuminer.1
 cpuminer_SOURCES = \
  cpu-miner.c \
  util.c \
-  uint256.cpp \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
@@ -51,12 +50,15 @@ cpuminer_SOURCES = \
  algo/blake/blake.c \
  algo/blake/blake-4way.c \
  algo/blake/sph_blake2b.c \
-  algo/blake/blake2b.c \
  algo/blake/sph-blake2s.c \
  algo/blake/blake2s-hash-4way.c \
  algo/blake/blake2s.c \
  algo/blake/blake2s-gate.c \
  algo/blake/blake2s-4way.c \
+  algo/blake/blake2b-hash-4way.c \
+  algo/blake/blake2b.c \
+  algo/blake/blake2b-gate.c \
+  algo/blake/blake2b-4way.c \
  algo/blake/blakecoin-gate.c \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
@@ -169,7 +171,8 @@ cpuminer_SOURCES = \
  algo/scryptjane/scrypt-jane.c \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
-  algo/sha/sha2-hash-4way.c \
+  algo/sha/sha256-hash-4way.c \
+  algo/sha/sha512-hash-4way.c \
  algo/sha/sha256_hash_11way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t-gate.c \
@@ -259,8 +262,13 @@ cpuminer_SOURCES = \
  algo/x16/x16r-gate.c \
  algo/x16/x16r.c \
  algo/x16/x16r-4way.c \
+  algo/x16/x16rv2.c \
+  algo/x16/x16rv2-4way.c \
  algo/x16/x16rt.c \
  algo/x16/x16rt-4way.c \
+  algo/x16/hex.c \
+  algo/x16/x21s-4way.c \
+  algo/x16/x21s.c \
  algo/x17/x17-gate.c \
  algo/x17/x17.c \
  algo/x17/x17-4way.c \
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Requirements

 1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
 Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
-optimizations a CPU with AES_NI is required. This includes Intel Westbridge
+optimizations a CPU with AES_NI is required. This includes Intel Westmere
 and newer and AMD equivalents. Further optimizations are available on some
 algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.

@@ -55,8 +55,9 @@ Supported Algorithms
                          axiom         Shabal-256 MemoHash
                          bastion
                          blake         Blake-256 (SFR)
-                          blakecoin     blake256r8
+                          blake2b       Blake2b 256
                          blake2s       Blake-2 S
+                          blakecoin     blake256r8
                          bmw           BMW 256
                          bmw512        BMW 512
                          c11           Chaincoin
@@ -67,6 +68,7 @@ Supported Algorithms
                          fresh         Fresh
                          groestl       Groestl coin
                          heavy         Heavy
+                          hex           x16r-hex
                          hmq1725       Espers
                          hodl          Hodlcoin
                          jha           Jackpotcoin
@@ -85,8 +87,9 @@ Supported Algorithms
                          neoscrypt     NeoScrypt(128, 2, 1)
                          nist5         Nist5
                          pentablake    Pentablake
-                          phi1612       phi, LUX coin (original algo)
-                          phi2          LUX coin (new algo)
+                          phi1612       phi
+                          phi2          Luxcoin (LUX)
+                          phi2-lux      identical to phi2
                          pluck         Pluck:128 (Supcoin)
                          polytimos     Ninja
                          quark         Quark
@@ -118,11 +121,13 @@ Supported Algorithms
                          x13sm3        hsr (Hshare)
                          x14           X14
                          x15           X15
-                          x16r          Ravencoin (RVN)
+                          x16r          Ravencoin (RVN) (original algo)
+                          x16rv2        Ravencoin (RVN) (new algo)
                          x16rt         Gincoin (GIN)
                          x16rt_veil    Veil (VEIL)
                          x16s          Pigeoncoin (PGN)
                          x17
+                          x21s
                          xevan         Bitsend (BSD)
                          yescrypt      Globalboost-Y (BSTY)
                          yescryptr8    BitZeny (ZNY)
@@ -152,14 +157,15 @@ Benchmark testing does not work for x11evo.
 Bugs
 ----

-Users are encouraged to post their bug reports on the Bitcoin Talk
-forum at:
+Users are encouraged to post their bug reports using git issues or on the
+Bitcoin Talk forum at:

 https://bitcointalk.org/index.php?topic=1326803.0

-All problem reports must be accompanied by a proper definition.
+All problem reports must be accompanied by a proper problem definition.
 This should include how the problem occurred, the command line and
-output from the miner showing the startup and any errors.
+output from the miner showing the startup messages and any errors.
+A history is also useful, ie did it work before.

 Donations
 ---------
--- a/51
+++ b/51
@@ -38,6 +38,57 @@ supported.
 Change Log
 ----------

+v3.9.8.1
+
+Summary log report will be generated on stratum diff change or after 5 minutes,
+whichever comes first, to prevent incorrect data in the report.
+
+Removed phi2-lux alias (introduced in v3.9.8) due to Luxcoin's planned fork
+to a new algo. The new Luxcoin algo is not supported by cpuminer-opt.
+Until the fork Luxcoin can be mined using phi2 algo.
+
+--hide-diff option is deprecated and has no effect. It will be removed in a
+future release.
+
+v3.9.8
+
+Changes to log output to provide data more relevant to actual mining
+performance.
+phi2 can now handle pools with a mix of coins that use and don't use roots.
+phi2-lux added as an alias for phi2 as they are identical except for roots.
+Add x16rv2 algo for Ravencoin fork.
+
+v3.9.7
+
+Command line option changes:
+
+"-R" is no longer used as a shortcut for "--retry-pause", users must
+use the long option.
+
+New options:
+
+-N, --param-n: set the N parameter for yescrypt, yespower or scrypt algos
+-R, --param-r: set the R parameter for yescrypt or yespower algos, scrypt is
+     hardcoded with R=1
+-K, --param-key: set the client key/pers parameter for yescrypt/yespower algos.
+
+These options can be used to mine yescrypt or yespower variations using
+the generic yescrypt or yespower algo name and specifying the parameters
+manually. They can even be used to mine variations that aren't formally
+supported by a unique algo name. Existing algos can continue to to be mined
+using their original name without parameters.
+
+v3.9.6.2
+
+New algo blake2b.
+Faster myr-gr on Ryzen using SHA.
+Faster blake2s SSE2.
+Small speedup of around 1% for several other algos.
+
+v3.9.6.1
+
+New algos: x21s, hex (alias x16r-hex).
+
 v3.9.6

 New algos: bmw512, x16rt, x16rt-veil (alias veil), x13bcd (alias bcd).
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -122,7 +122,6 @@ void init_algo_gate( algo_gate_t* gate )
   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
   gate->malloc_txs_request      = (void*)&std_malloc_txs_request;
-   gate->set_target              = (void*)&std_set_target;
   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
@@ -167,9 +166,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_AXIOM:         register_axiom_algo         ( gate ); break;
    case ALGO_BASTION:       register_bastion_algo       ( gate ); break;
    case ALGO_BLAKE:         register_blake_algo         ( gate ); break;
-    case ALGO_BLAKECOIN:     register_blakecoin_algo     ( gate ); break;
-//    case ALGO_BLAKE2B:      register_blake2b_algo     ( gate ); break;
+    case ALGO_BLAKE2B:       register_blake2b_algo       ( gate ); break;
    case ALGO_BLAKE2S:       register_blake2s_algo       ( gate ); break;
+    case ALGO_BLAKECOIN:     register_blakecoin_algo     ( gate ); break;
    case ALGO_BMW512:        register_bmw512_algo        ( gate ); break;
    case ALGO_C11:           register_c11_algo           ( gate ); break;
    case ALGO_CRYPTOLIGHT:   register_cryptolight_algo   ( gate ); break;
@@ -182,6 +181,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_FRESH:         register_fresh_algo         ( gate ); break;
    case ALGO_GROESTL:       register_groestl_algo       ( gate ); break;
    case ALGO_HEAVY:         register_heavy_algo         ( gate ); break;
+    case ALGO_HEX:           register_hex_algo           ( gate ); break;
    case ALGO_HMQ1725:       register_hmq1725_algo       ( gate ); break;
    case ALGO_HODL:          register_hodl_algo          ( gate ); break;
    case ALGO_JHA:           register_jha_algo           ( gate ); break;
@@ -233,10 +233,12 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
    case ALGO_X14:           register_x14_algo           ( gate ); break;
    case ALGO_X15:           register_x15_algo           ( gate ); break;
    case ALGO_X16R:          register_x16r_algo          ( gate ); break;
+    case ALGO_X16RV2:        register_x16rv2_algo        ( gate ); break;
    case ALGO_X16RT:         register_x16rt_algo         ( gate ); break;
    case ALGO_X16RT_VEIL:    register_x16rt_veil_algo    ( gate ); break;
    case ALGO_X16S:          register_x16s_algo          ( gate ); break;
    case ALGO_X17:           register_x17_algo           ( gate ); break;
+    case ALGO_X21S:          register_x21s_algo          ( gate ); break;
    case ALGO_XEVAN:         register_xevan_algo         ( gate ); break;
 /*    case ALGO_YESCRYPT:     register_yescrypt_05_algo     ( gate ); break;
     case ALGO_YESCRYPTR8:   register_yescryptr8_05_algo   ( gate ); break;
@@ -335,10 +337,10 @@ const char* const algo_alias_map[][2] =
  { "myriad",            "myr-gr"       },
  { "neo",               "neoscrypt"    },
  { "phi",               "phi1612"      },
-//  { "sia",               "blake2b"      },
  { "sib",               "x11gost"      },
  { "timetravel8",       "timetravel"   },
  { "veil",              "x16rt-veil"   },
+  { "x16r-hex",          "hex"          },
  { "yenten",            "yescryptr16"  },
  { "ziftr",             "zr5"          },
  { NULL,                NULL           }   
@@ -362,40 +364,3 @@ void get_algo_alias( char** algo_or_alias )
 #undef ALIAS
 #undef PROPER

-bool submit_solution( struct work *work, void *hash,
-                      struct thr_info *thr )
-{
-     work_set_target_ratio( work, hash );
-     if ( submit_work( thr, work ) )
-     {
-         if ( !opt_quiet )
-            applog( LOG_BLUE, "Share %d submitted by thread %d, job %s.",
-                    accepted_share_count + rejected_share_count + 1,
-                    thr->id, work->job_id );
-         return true;
-     }
-     else
-          applog( LOG_WARNING, "Failed to submit share." );
-     return false;
-}
-
-bool submit_lane_solution( struct work *work, void *hash,
-                           struct thr_info *thr, int lane )
-{
-     work_set_target_ratio( work, hash );
-     if ( submit_work( thr, work ) )
-     {
-         if ( !opt_quiet )
-//            applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d.",
-//                    accepted_share_count + rejected_share_count + 1,
-//                    thr->id, lane );
-            applog( LOG_BLUE, "Share %d submitted by thread %d, lane %d, job %s.",
-                    accepted_share_count + rejected_share_count + 1, thr->id,
-                    lane, work->job_id );
-         return true;
-     }
-     else
-          applog( LOG_WARNING, "Failed to submit share." );
-     return false;
-}
-
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -132,7 +132,6 @@ void ( *decode_extra_data )      ( struct work*, uint64_t* );
 void ( *wait_for_diff )          ( struct stratum_ctx* );
 int64_t ( *get_max64 )           ();
 bool ( *work_decode )            ( const json_t*, struct work* );
-void ( *set_target)              ( struct work*, double );
 bool ( *submit_getwork_result )  ( CURL*, struct work* );
 void ( *gen_merkle_root )        ( char*, struct stratum_ctx* );
 void ( *build_extraheader )      ( struct work*, struct stratum_ctx* );
@@ -193,15 +192,6 @@ void four_way_not_tested();
 // allways returns failure
 int null_scanhash();

-// Allow algos to submit from scanhash loop.
-bool submit_solution( struct work *work, void *hash,
-                      struct thr_info *thr );
-bool submit_lane_solution( struct work *work, void *hash,
-                          struct thr_info *thr, int lane );
-
- 
-bool submit_work( struct thr_info *thr, const struct work *work_in );
-
 // displays warning
 void null_hash    ();
 void null_hash_suw();
@@ -232,10 +222,6 @@ int64_t get_max64_0x3fffffLL();
 int64_t get_max64_0x1ffff();
 int64_t get_max64_0xffffLL();

-void std_set_target(    struct work *work, double job_diff );
-void alt_set_target(    struct work* work, double job_diff );
-void scrypt_set_target( struct work *work, double job_diff );
-
 bool std_le_work_decode( const json_t *val, struct work *work );
 bool std_be_work_decode( const json_t *val, struct work *work );
 bool jr2_work_decode( const json_t *val, struct work *work );
--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -85,8 +85,9 @@ bool register_argon2_algo( algo_gate_t* gate )
  gate->scanhash        = (void*)&scanhash_argon2;
  gate->hash            = (void*)&argon2hash;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&scrypt_set_target;
  gate->get_max64       = (void*)&argon2_get_max64;
+  opt_target_factor = 65536.0;
+
  return true;
 };

--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -67,8 +67,8 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
 {
        gate->scanhash = (void*)&scanhash_argon2d_crds;
        gate->hash = (void*)&argon2d_crds_hash;
-        gate->set_target = (void*)&scrypt_set_target;
        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        opt_target_factor = 65536.0;
        return true;
 }

@@ -135,8 +135,8 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
 {
        gate->scanhash = (void*)&scanhash_argon2d_dyn;
        gate->hash = (void*)&argon2d_dyn_hash;
-        gate->set_target = (void*)&scrypt_set_target;
        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        opt_target_factor = 65536.0;
        return true;
 }

@@ -184,9 +184,9 @@ int64_t get_max64_0x1ff() { return 0x1ff; }
 bool register_argon2d4096_algo( algo_gate_t* gate )
 {
        gate->scanhash = (void*)&scanhash_argon2d4096;
-        gate->set_target = (void*)&scrypt_set_target;
        gate->get_max64  = (void*)&get_max64_0x1ff;
        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        opt_target_factor = 65536.0;
        return true;
 }

--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -28,6 +28,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <mm_malloc.h>

 #include "core.h"
 #include "argon2d_thread.h"
@@ -99,7 +100,8 @@ int allocate_memory(const argon2_context *context, uint8_t **memory,
    if (context->allocate_cbk) {
        (context->allocate_cbk)(memory, memory_size);
    } else {
-        *memory = malloc(memory_size);
+        *memory = _mm_malloc( memory_size, 64 );
+//        *memory = malloc(memory_size);
    }

    if (*memory == NULL) {
@@ -116,7 +118,8 @@ void free_memory(const argon2_context *context, uint8_t *memory,
    if (context->free_cbk) {
        (context->free_cbk)(memory, memory_size);
    } else {
-        free(memory);
+//        free(memory);
+        _mm_free( memory );
    }
 }

--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -96,14 +96,14 @@ static void fill_block(__m256i *state, const block *ref_block,
    if (with_xor) {
        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
            state[i] = _mm256_xor_si256(
-                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+                state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
            block_XY[i] = _mm256_xor_si256(
-                state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
+                state[i], _mm256_load_si256((const __m256i *)next_block->v + i));
        }
    } else {
        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
            block_XY[i] = state[i] = _mm256_xor_si256(
-                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+                state[i], _mm256_load_si256((const __m256i *)ref_block->v + i));
        }
    }

@@ -139,7 +139,7 @@ static void fill_block(__m256i *state, const block *ref_block,

    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
-        _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
+        _mm256_store_si256((__m256i *)next_block->v + i, state[i]);
    }
 }

--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -29,6 +29,8 @@
 #include <x86intrin.h>
 #endif

+#include "simd-utils.h"
+
 #if !defined(__AVX512F__)
 #if !defined(__AVX2__)
 #if !defined(__XOP__)
@@ -182,64 +184,63 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {

 #include <immintrin.h>

-#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
-#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
-#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
-#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+#define  rotr32  mm256_swap32_64
+#define  rotr24  mm256_ror3x8_64
+#define  rotr16  mm256_ror1x16_64
+#define  rotr63( x ) mm256_rol_64( x, 1 )
+
+//#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+//#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+//#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+//#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))

 #define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
-        __m256i ml = _mm256_mul_epu32(A0, B0); \
-        ml = _mm256_add_epi64(ml, ml); \
-        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        __m256i ml0, ml1; \
+        ml0 = _mm256_mul_epu32(A0, B0); \
+        ml1 = _mm256_mul_epu32(A1, B1); \
+        ml0 = _mm256_add_epi64(ml0, ml0); \
+        ml1 = _mm256_add_epi64(ml1, ml1); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
        D0 = _mm256_xor_si256(D0, A0); \
-        D0 = rotr32(D0); \
-        \
-        ml = _mm256_mul_epu32(C0, D0); \
-        ml = _mm256_add_epi64(ml, ml); \
-        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
-        \
-        B0 = _mm256_xor_si256(B0, C0); \
-        B0 = rotr24(B0); \
-        \
-        ml = _mm256_mul_epu32(A1, B1); \
-        ml = _mm256_add_epi64(ml, ml); \
-        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
+        D0 = rotr32(D0); \
        D1 = rotr32(D1); \
-        \
-        ml = _mm256_mul_epu32(C1, D1); \
-        ml = _mm256_add_epi64(ml, ml); \
-        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
-        \
+        ml0 = _mm256_mul_epu32(C0, D0); \
+        ml1 = _mm256_mul_epu32(C1, D1); \
+        ml0 = _mm256_add_epi64(ml0, ml0); \
+        ml1 = _mm256_add_epi64(ml1, ml1); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
+        B0 = _mm256_xor_si256(B0, C0); \
        B1 = _mm256_xor_si256(B1, C1); \
+        B0 = rotr24(B0); \
        B1 = rotr24(B1); \
    } while((void)0, 0);

 #define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
-        __m256i ml = _mm256_mul_epu32(A0, B0); \
-        ml = _mm256_add_epi64(ml, ml); \
-        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        __m256i ml0, ml1; \
+        ml0 = _mm256_mul_epu32(A0, B0); \
+        ml1 = _mm256_mul_epu32(A1, B1); \
+        ml0 = _mm256_add_epi64(ml0, ml0); \
+        ml1 = _mm256_add_epi64(ml1, ml1); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml0)); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml1)); \
        D0 = _mm256_xor_si256(D0, A0); \
-        D0 = rotr16(D0); \
-        \
-        ml = _mm256_mul_epu32(C0, D0); \
-        ml = _mm256_add_epi64(ml, ml); \
-        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
-        B0 = _mm256_xor_si256(B0, C0); \
-        B0 = rotr63(B0); \
-        \
-        ml = _mm256_mul_epu32(A1, B1); \
-        ml = _mm256_add_epi64(ml, ml); \
-        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
+        D0 = rotr16(D0); \
        D1 = rotr16(D1); \
-        \
-        ml = _mm256_mul_epu32(C1, D1); \
-        ml = _mm256_add_epi64(ml, ml); \
-        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        ml0 = _mm256_mul_epu32(C0, D0); \
+        ml1 = _mm256_mul_epu32(C1, D1); \
+        ml0 = _mm256_add_epi64(ml0, ml0); \
+        ml1 = _mm256_add_epi64(ml1, ml1); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml0)); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml1)); \
+        B0 = _mm256_xor_si256(B0, C0); \
        B1 = _mm256_xor_si256(B1, C1); \
+        B0 = rotr63(B0); \
        B1 = rotr63(B1); \
    } while((void)0, 0);

@@ -259,16 +260,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
-        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
-        \
        tmp1 = C0; \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        C0 = C1; \
-        C1 = tmp1; \
-        \
-        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
-        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        C1 = tmp1; \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
    } while(0);

 #define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
@@ -287,16 +286,14 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
-        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
-        \
        tmp1 = C0; \
+        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        C0 = C1; \
-        C1 = tmp1; \
-        \
-        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
-        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        C1 = tmp1; \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
    } while((void)0, 0);

 #define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
--- a/algo/blake/blake256-hash-4way.c
+++ b/algo/blake/blake256-hash-4way.c
@@ -308,12 +308,12 @@ static const sph_u32 CS[16] = {
 #define GS_4WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                 _mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \
+                                   _mm_set1_epi32( c1 ), m0 ), b ), a ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
-                 _mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \
+                                   _mm_set1_epi32( c0 ), m1 ), b ), a ); \
   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
   c = _mm_add_epi32( c, d ); \
   b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
@@ -508,14 +508,18 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm_xor_si128( S0, _mm_set1_epi32( CS0 ) ); \
-   V9 = _mm_xor_si128( S1, _mm_set1_epi32( CS1 ) ); \
-   VA = _mm_xor_si128( S2, _mm_set1_epi32( CS2 ) ); \
-   VB = _mm_xor_si128( S3, _mm_set1_epi32( CS3 ) ); \
-   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
-   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
-   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
-   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
+   V8 = _mm_xor_si128( S0, m128_const1_64( 0x243F6A88243F6A88 ) ); \
+   V9 = _mm_xor_si128( S1, m128_const1_64( 0x85A308D385A308D3 ) ); \
+   VA = _mm_xor_si128( S2, m128_const1_64( 0x13198A2E13198A2E ) ); \
+   VB = _mm_xor_si128( S3, m128_const1_64( 0x0370734403707344 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), \
+                           m128_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), \
+                           m128_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), \
+                           m128_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), \
+                           m128_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
   BLAKE256_4WAY_BLOCK_BSWAP32; \
   ROUND_S_4WAY(0); \
   ROUND_S_4WAY(1); \
@@ -631,16 +635,20 @@ do { \
   V5 = H5; \
   V6 = H6; \
   V7 = H7; \
-   V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \
-   V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \
-   VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \
-   VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \
-   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \
-   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
-   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
-   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
-   shuf_bswap32 = _mm256_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
-                                     0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
+   V8 = _mm256_xor_si256( S0, m256_const1_64( 0x243F6A88243F6A88 ) ); \
+   V9 = _mm256_xor_si256( S1, m256_const1_64( 0x85A308D385A308D3 ) ); \
+   VA = _mm256_xor_si256( S2, m256_const1_64( 0x13198A2E13198A2E ) ); \
+   VB = _mm256_xor_si256( S3, m256_const1_64( 0x0370734403707344 ) ); \
+   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
+                              m256_const1_64( 0xA4093822A4093822 ) ); \
+   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ),\
+                              m256_const1_64( 0x299F31D0299F31D0 ) ); \
+   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
+                              m256_const1_64( 0x082EFA98082EFA98 ) ); \
+   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
+                              m256_const1_64( 0xEC4E6C89EC4E6C89 ) ); \
+   shuf_bswap32 = m256_const_64( 0x0c0d0e0f08090a0b, 0x0405060700010203, \
+                                 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \
   M0 = _mm256_shuffle_epi8( * buf    , shuf_bswap32 ); \
   M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap32 ); \
   M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap32 ); \
@@ -696,14 +704,14 @@ blake32_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
                   const uint32_t *salt, int rounds )
 {
   __m128i zero = m128_zero;
-   casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
-   casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
-   casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
-   casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
-   casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
-   casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
-   casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
-   casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
+   casti_m128i( ctx->H, 0 ) = m128_const1_64( 0x6A09E6676A09E667 );
+   casti_m128i( ctx->H, 1 ) = m128_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m128i( ctx->H, 2 ) = m128_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m128i( ctx->H, 3 ) = m128_const1_64( 0xA54FF53AA54FF53A );
+   casti_m128i( ctx->H, 4 ) = m128_const1_64( 0x510E527F510E527F );
+   casti_m128i( ctx->H, 5 ) = m128_const1_64( 0x9B05688C9B05688C );
+   casti_m128i( ctx->H, 6 ) = m128_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m128i( ctx->H, 7 ) = m128_const1_64( 0x5BE0CD195BE0CD19 );

   casti_m128i( ctx->S, 0 ) = zero;
   casti_m128i( ctx->S, 1 ) = zero;
@@ -778,12 +786,13 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
   else
      ctx->T0 -= 512 - bit_len;

-   buf[vptr] = _mm_set1_epi32( 0x80 );
+   buf[vptr] = m128_const1_64( 0x0000008000000080 );

   if ( vptr < 12 )
   {
      memset_zero_128( buf + vptr + 1, 13 - vptr  );
-      buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
+      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
+                                m128_const1_64( 0x0100000001000000ULL ) );
      buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
      buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
      blake32_4way( ctx, buf + vptr, 64 - ptr );
@@ -795,7 +804,8 @@ blake32_4way_close( blake_4way_small_context *ctx, unsigned ub, unsigned n,
      ctx->T0 = 0xFFFFFE00UL;
      ctx->T1 = 0xFFFFFFFFUL;
      memset_zero_128( buf, 56>>2 );
-      buf[ 13 ] = _mm_or_si128( buf[ 13 ], _mm_set1_epi32( 0x01000000UL ) );
+      buf[ 13 ] = _mm_or_si128( buf[ 13 ],
+                                m128_const1_64( 0x0100000001000000ULL ) );
      buf[ 14 ] = mm128_bswap_32( _mm_set1_epi32( th ) );
      buf[ 15 ] = mm128_bswap_32( _mm_set1_epi32( tl ) );
      blake32_4way( ctx, buf, 64 );
@@ -815,20 +825,18 @@ blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
                   const sph_u32 *salt, int rounds )
 {
   __m256i zero = m256_zero;
-   casti_m256i( sc->H, 0 ) = _mm256_set1_epi32( iv[0] );
-   casti_m256i( sc->H, 1 ) = _mm256_set1_epi32( iv[1] );
-   casti_m256i( sc->H, 2 ) = _mm256_set1_epi32( iv[2] );
-   casti_m256i( sc->H, 3 ) = _mm256_set1_epi32( iv[3] );
-   casti_m256i( sc->H, 4 ) = _mm256_set1_epi32( iv[4] );
-   casti_m256i( sc->H, 5 ) = _mm256_set1_epi32( iv[5] );
-   casti_m256i( sc->H, 6 ) = _mm256_set1_epi32( iv[6] );
-   casti_m256i( sc->H, 7 ) = _mm256_set1_epi32( iv[7] );
-
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E6676A09E667 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE85BB67AE85 );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF3723C6EF372 );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53AA54FF53A );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527F510E527F );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C9B05688C );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9AB1F83D9AB );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD195BE0CD19 );
   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
   casti_m256i( sc->S, 2 ) = zero;
   casti_m256i( sc->S, 3 ) = zero;
-
   sc->T0 = sc->T1 = 0;
   sc->ptr = 0;
   sc->rounds = rounds;
@@ -887,7 +895,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
+   buf[ptr>>2] = m256_const1_64( 0x0000008000000080ULL );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -909,7 +917,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
       memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
       if ( out_size_w32 == 8 )
           buf[52>>2] = _mm256_or_si256( buf[52>>2],
-                                           _mm256_set1_epi32( 0x01000000UL ) );
+                                m256_const1_64( 0x0100000001000000ULL ) );
       *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
       *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
@@ -922,7 +930,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
        sc->T1 = SPH_C32(0xFFFFFFFFUL);
        memset_zero_256( buf, 56>>2 );
       if ( out_size_w32 == 8 )
-           buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
+           buf[52>>2] = m256_const1_64( 0x0100000001000000ULL );
        *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
        *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
        blake32_8way( sc, buf, 64 );
--- a/algo/blake/blake256-hash-4way.c.new
+++ b/algo/blake/blake256-hash-4way.c.new
@@ -1,322 +0,0 @@
-// convert blake256 32 bit to use 64 bit with serial vectoring
-//
-//  cut calls to GS in half
-//
-// combine V
-// v0 = {V0,V1}
-// v1 = {V2,V3}
-// v2 = {V4,V5}
-// v3 = {V6,V7}
-// v4 = {V8,V9}
-// v5 = {VA,VB}
-// v6 = {VC,VD}
-// v7 = {CE,VF}
-//
-// v6x = {VD,VC}      swap(VC,VD)   swap(v6)
-// v7x = {VF,VE}      swap(VE,VF)   swap(v7)
-//
-// V0 = v1v0
-// V1 = v3v2
-// V2 = v5v4
-// V3 = v7v6
-// V4 = v9v8
-// V5 = vbva
-// V6 = vdvc
-// V7 = vfve
-//
-// The rotate in ROUND is to effect straddle and unstraddle for the third
-// and 4th iteration of GS.
-// It concatenates 2 contiguous 256 bit vectors and extracts the middle
-// 256 bits. After the transform they must be restored with only the
-// chosen bits modified in the original 2 vectors.
-// ror1x128 achieves this by putting the chosen bits in arg1, the "low"
-// 256 bit vector and saves the untouched bits temporailly in arg0, the
-// "high" 256 bit vector. Simply reverse the process to restore data back
-// to original positions.
-
-// Use standard 4way when AVX2 is not available use x2 mode with AVX2.
-//
-// Data is organised the same as 32 bit 4 way, in effect serial vectoring
-// on top of parallel vectoring. Same data in the same place just taking
-// two chunks at a time.
-//
-// Transparent to user, x2 mode used when AVX2 detected.
-// Use existing 4way context but revert to scalar types.
-// Same interleave function (128 bit) or x2 with 256 bit?
-// User trsnaparency would have to apply to interleave as well.
-//
-// Use common 4way update and close
-
-/*
-typedef struct {
-   unsigned char buf[64<<2];
-   uint32_t H[8<<2];
-   uint32_t S[4<<2];
-   size_t ptr;
-   uint32_t T0, T1;
-   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
-} blakex2_4way_small_context __attribute__ ((aligned (64)));
-*/
-
-static void
-blake32x2_4way_init( blake_4way_small_context *ctx, const uint32_t *iv,
-                   const uint32_t *salt, int rounds )
-{
-   casti_m128i( ctx->H, 0 ) = _mm_set1_epi32( iv[0] );
-   casti_m128i( ctx->H, 1 ) = _mm_set1_epi32( iv[1] );
-   casti_m128i( ctx->H, 2 ) = _mm_set1_epi32( iv[2] );
-   casti_m128i( ctx->H, 3 ) = _mm_set1_epi32( iv[3] );
-   casti_m128i( ctx->H, 4 ) = _mm_set1_epi32( iv[4] );
-   casti_m128i( ctx->H, 5 ) = _mm_set1_epi32( iv[5] );
-   casti_m128i( ctx->H, 6 ) = _mm_set1_epi32( iv[6] );
-   casti_m128i( ctx->H, 7 ) = _mm_set1_epi32( iv[7] );
-
-   casti_m128i( ctx->S, 0 ) = m128_zero;
-   casti_m128i( ctx->S, 1 ) = m128_zero;
-   casti_m128i( ctx->S, 2 ) = m128_zero;
-   casti_m128i( ctx->S, 3 ) = m128_zero;
-/*
-   sc->S[0] = _mm_set1_epi32( salt[0] );
-   sc->S[1] = _mm_set1_epi32( salt[1] );
-   sc->S[2] = _mm_set1_epi32( salt[2] );
-   sc->S[3] = _mm_set1_epi32( salt[3] );
-*/
-   ctx->T0 = ctx->T1 = 0;
-   ctx->ptr = 0;
-   ctx->rounds = rounds;
-}
-
-static void
-blake32x2( blake_4way_small_context *ctx, const void *data, size_t len )
-{
-   __m128i *buf = (__m256i*)ctx->buf;
-   size_t  bptr = ctx->ptr << 2;
-   size_t  vptr = ctx->ptr >> 3;
-   size_t  blen = len << 2;
-//    unsigned char *buf = ctx->buf;
-//    size_t ptr         = ctx->ptr<<4;  // repurposed
-    DECL_STATE32x2
-
-//    buf = sc->buf;
-//    ptr = sc->ptr;
-
-// adjust len for use with ptr, clen, all absolute bytes.
-//    int blen = len<<2;
-
-    if ( blen < (sizeof ctx->buf) - bptr )
-    {
-        memcpy( buf + vptr, data, blen );
-        ptr += blen;
-        ctx->ptr = bptr >> 2;;
-        return;
-    }
-
-    READ_STATE32( ctx );
-    while ( blen > 0 )
-    {
-        size_t clen;
-
-        clen = ( sizeof sc->buf ) - ptr;
-        if ( clen > blen )
-            clen = blen;
-        memcpy( buf + vptr, data, clen );
-        bptr += clen;
-        vptr = bptr >> 5;
-	data = (const unsigned char *)data + clen;
-        blen -= clen;
-        if ( bptr == sizeof ctx->buf )
-       	{
-           if ( ( T0 = T0 + 512 ) < 512 ) // not needed, will never rollover
-               T1 += 1;
-           COMPRESS32x2_4WAY( ctx->rounds );
-           ptr = 0;
-        }
-    }
-    WRITE_STATE32x2( ctx );
-    ctx->ptr = bptr >> 2;
-}
-
-static void
-blake32x2_4way_close( blake_4way_small_context *ctx, void *dst )
-{
-   __m256i buf[8] __attribute__ ((aligned (64)));
-   size_t   ptr     = ctx->ptr;
-   size_t   vptr    = ctx->ptr>>2;
-   unsigned bit_len = ( (unsigned)ptr << 3 );  // one lane
-   uint32_t th      = ctx->T1;
-   uint32_t tl      = ctx->T0 + bit_len;
-
-   if ( ptr == 0 )
-   {
-        ctx->T0 = 0xFFFFFE00UL;
-        ctx->T1 = 0xFFFFFFFFUL;
-   }
-   else if ( ctx->T0 == 0 )
-   {
-      ctx->T0 = 0xFFFFFE00UL + bit_len;
-      ctx->T1 -= 1;
-   }
-   else
-      ctx->T0 -= 512 - bit_len;
-
-   // memset doesn't do ints
-   buf[ vptr ] = _mm256_set_epi32( 0,0,0,0, 0x80, 0x80, 0x80, 0x80 );
-
-   if ( vptr < 5 )
-   {
-       memset_zero_256( buf + vptr + 1, 6 - vptr  );
-       buf[ 6 ] = _mm256_or_si256( vbuf[ 6 ], _mm256_set_epi32(
-             0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL, 0,0,0,0 ) ); 
-       buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl,tl,tl,tl,
-			                            th,th,th,th ) );
-       blake32x2_4way( ctx, buf + vptr, 64 - ptr );
-   }
-   else
-   {
-       memset_zero_256( vbuf + vptr + 1, 7 - vptr );
-       blake32x2_4way( ctx,  vbuf + ptr, 64 - ptr );
-       ctx->T0 = 0xFFFFFE00UL;
-       ctx->T1 = 0xFFFFFFFFUL;
-       buf[ 6 ] = mm256_zero;
-       buf[ 6 ] = _mm256_set_epi32( 0,0,0,0,
-		         0x01000000UL,0x01000000UL,0x01000000UL,0x01000000UL );
-       buf[ 7 ] = mm256_bswap_32( _mm256_set_epi32( tl, tl, tl, tl,
-                                                    th, th, th, th );
-       blake32x2_4way( ctx, buf, 64 );
-   }
-
-   casti_m256i( dst, 0 ) = mm256_bswap_32( casti_m256i( ctx->H, 0 ) );
-   casti_m256i( dst, 1 ) = mm256_bswap_32( casti_m256i( ctx->H, 1 ) );
-   casti_m256i( dst, 2 ) = mm256_bswap_32( casti_m256i( ctx->H, 2 ) );
-   casti_m256i( dst, 3 ) = mm256_bswap_32( casti_m256i( ctx->H, 3 ) );
-}
-
-
-
-
-#define DECL_STATE32x2_4WAY \
-   __m256i H0, H1, H2, H3; \
-   __m256i S0, S1; \
-   uint32_t T0, T1;
-
-#define READ_STATE32x2_4WAY(state)  do \
-{ \
-   H0 = casti_m256i( state->H, 0 ); \
-   H1 = casti_m256i( state->H, 1 ); \
-   H2 = casti_m256i( state->H, 2 ); \
-   H3 = casti_m256i( state->H, 3 ); \
-   S0 = casti_m256i( state->S, 0 ); \
-   S1 = casti_m256i( state->S, 1 ); \
-   T0 = state->T0; \
-   T1 = state->T1; \
-
-#define WRITE_STATE32x2_4WAY(state)   do { \
-   casti_m256i( state->H, 0 ) = H0; \
-   casti_m256i( state->H, 1 ) = H1; \
-   casti_m256i( state->H, 2 ) = H2; \
-   casti_m256i( state->H, 3 ) = H3; \
-   casti_m256i( state->S, 0 ) = S0; \
-   casti_m256i( state->S, 1 ) = S1; \
-   state->T0 = T0; \
-   state->T1 = T1; \
-} while (0)
-
-
-#define GSx2_4WAY( m0m2, m1m3, c0c2, c1c3, a, b, c, d ) do \
-{ \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-          _mm256_set_epi32( c1,c3, c1,c3, c1,c3, c1,c3 ), \
-	  _mm256_set_epi32( m0,m2, m0,m2, m0,m2, m0,m2 ) ), b ), a ); \
-   d = mm256_ror_32( _mm_xor_si128( d, a ), 16 ); \
-   c = _mm256_add_epi32( c, d ); \
-   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
-   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
-          _mm256_set_epi32( c0,c2, c0,c2, c0,c2, c0,c2 ), \
-	  _mm256_set_epi32( m1,m3, m1,m3, m1,m3, m1,m3 ) ), b ), a ); \
-   d = mm256_ror_32( _mm256_xor_si256( d, a ), 8 ); \
-   c = _mm256_add_epi32( c, d ); \
-   b = mm256_ror_32( _mm256_xor_si256( b, c ), 7 ); \
-} while (0)
-
-#define ROUND_Sx2_4WAY(r)   do \
-{ \
-  GS2_4WAY( Mx(r, 0),  Mx(r, 1),  Mx(r, 2),  Mx(r, 3), \
-           CSx(r, 0), CSx(r, 1), CSx(r, 2), CSx(r, 3), V0, V2, V4, V6 ); \
-  GS2_4WAY( Mx(r, 4),  Mx(r, 5),  Mx(r, 6),  Mx(r, 7), \
-           CSx(r, 4), CSx(r, 5), CSx(r, 6), CSx(r, 7), V1, V3, V5, V7 ); \
-  mm256_ror1x128_512( V3, V2 ); \
-  mm256_ror1x128_512( V6, V7 ); \
-  GS2_4WAY( Mx(r, 8),  Mx(r, 9),  Mx(r, A),  Mx(r, B), \
-           CSx(r, 8), CSx(r, 9), CSx(r, A), CSx(r, B), V0, V2, V5, V7 ); \
-  GS2_4WAY( Mx(r, C),  Mx(r, D),  Mx(r, C),  Mx(r, D), \
-           CSx(r, C), CSx(r, D), CSx(r, C), CSx(r, D), V1, V3, V4, V6 ); \
-  mm256_rol1x128_512( V2, V3 ); \
-  mm256_rol1x128_512( V7, V6 ); 
-
-#define COMPRESS32x2_4WAY( rounds ) do \
-{ \
-   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
-   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
-   unsigned r; \
-   V0 = H0; \
-   V1 = H1; \
-   V2 = H2; \
-   V3 = H3; \
-   V4 = _mm256_xor_si256( S0, _mm256_set_epi32( CS1, CS1, CS1, CS1, \
-			                        CS0, CS0, CS0, CS0 ) ); \
-   V5 = _mm256_xor_si256( S1, _mm256_set_epi32( CS3, CS3, CS3, CS3, \
-                                                CS2, CS2, CS2, CS2 ) ); \
-   V6 = _mm256_xor_si256( _mm256_set1_epi32( T0 ), \
-                              _mm256_set_epi32( CS5, CS5, CS5, CS5, \
-		                                CS4, CS4, CS4, CS4 ) ); \
-   V7 = _mm256_xor_si256( _mm256_set1_epi32( T1 ), \
-                              _mm256_set_epi32( CS7, CS7, CS7, CS7, \
-                                                CS6, CS6, CS6, CS6 ) ); \
-   M0 = mm256_bswap_32( buf[ 0] ); \
-   M1 = mm256_bswap_32( buf[ 1] ); \
-   M2 = mm256_bswap_32( buf[ 2] ); \
-   M3 = mm256_bswap_32( buf[ 3] ); \
-   M4 = mm256_bswap_32( buf[ 4] ); \
-   M5 = mm256_bswap_32( buf[ 5] ); \
-   M6 = mm256_bswap_32( buf[ 6] ); \
-   M7 = mm256_bswap_32( buf[ 7] ); \
-   ROUND_Sx2_4WAY(0); \
-   ROUND_Sx2_4WAY(1); \
-   ROUND_Sx2_4WAY(2); \
-   ROUND_Sx2_4WAY(3); \
-   ROUND_Sx2_4WAY(4); \
-   ROUND_Sx2_4WAY(5); \
-   ROUND_Sx2_4WAY(6); \
-   ROUND_Sx2_4WAY(7); \
-   if (rounds == 14) \
-   { \
-      ROUND_Sx2_4WAY(8); \
-      ROUND_Sx2_4WAY(9); \
-      ROUND_Sx2_4WAY(0); \
-      ROUND_Sx2_4WAY(1); \
-      ROUND_Sx2_4WAY(2); \
-      ROUND_Sx2_4WAY(3); \
-   } \
-   H0 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( V8, V0 ), S0 ), H0 ); \
-   H1 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( V9, V1 ), S1 ), H1 ); \
-   H2 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( VA, V2 ), S2 ), H2 ); \
-   H3 = _mm256_xor_si256( _mm256_xor_si256( \
-			           _mm256_xor_si256( VB, V3 ), S3 ), H3 ); \
-} while (0)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
--- a/algo/blake/blake2b-4way.c
+++ b/algo/blake/blake2b-4way.c
@@ -0,0 +1,67 @@
+/**
+ * Blake2-B Implementation
+ * tpruvot@github 2015-2016
+ */
+
+#include "blake2b-gate.h"
+
+#if defined(BLAKE2B_4WAY)
+
+#include <string.h>
+#include <stdint.h>
+#include "blake2b-hash-4way.h"
+
+// Function not used, code inlined.
+void blake2b_4way_hash(void *output, const void *input)
+{
+    blake2b_4way_ctx ctx;
+    blake2b_4way_init( &ctx );
+    blake2b_4way_update( &ctx, input, 80 );
+    blake2b_4way_final( &ctx, output );
+}
+
+int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done, struct thr_info *mythr )
+{
+	uint32_t hash[8*4] __attribute__ ((aligned (64)));;
+   uint32_t vdata[20*4] __attribute__ ((aligned (32)));;
+   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
+   blake2b_4way_ctx ctx __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[25]);   // 3*8+1
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+   int thr_id = mythr->id;
+   __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+	const uint32_t Htarg = ptarget[7];
+	const uint32_t first_nonce = pdata[19];
+
+	uint32_t n = first_nonce;
+
+   mm256_bswap32_intrlv80_4x64( vdata, pdata );
+
+	do {
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+                _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+
+      blake2b_4way_init( &ctx ); 
+      blake2b_4way_update( &ctx, vdata, 80 );
+      blake2b_4way_final( &ctx, hash );
+
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( hash7[ lane<<1 ] < Htarg )
+      {
+          extr_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
+          {
+              pdata[19] = n + lane;
+              submit_lane_solution( work, lane_hash, mythr, lane );
+          }
+      }
+      n += 4;
+   } while ( (n < max_nonce-4) && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
--- a/algo/blake/blake2b-gate.c
+++ b/algo/blake/blake2b-gate.c
@@ -0,0 +1,25 @@
+#include "blake2b-gate.h"
+
+/*
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blake2s_get_max64 ()
+{
+   return 0x7ffffLL;
+}
+*/
+
+bool register_blake2b_algo( algo_gate_t* gate )
+{
+#if defined(BLAKE2B_4WAY)
+  gate->scanhash  = (void*)&scanhash_blake2b_4way;
+  gate->hash      = (void*)&blake2b_4way_hash;
+#else
+  gate->scanhash  = (void*)&scanhash_blake2b;
+  gate->hash      = (void*)&blake2b_hash;
+#endif
+//  gate->get_max64 = (void*)&blake2s_get_max64;
+  gate->optimizations =  AVX2_OPT;
+  return true;
+};
+
+
--- a/algo/blake/blake2b-gate.h
+++ b/algo/blake/blake2b-gate.h
@@ -0,0 +1,26 @@
+#ifndef __BLAKE2B_GATE_H__
+#define __BLAKE2B_GATE_H__ 1
+
+#include <stdint.h>
+#include "algo-gate-api.h"
+
+#if defined(__AVX2__)
+  #define BLAKE2B_4WAY
+#endif
+
+bool register_blake2b_algo( algo_gate_t* gate );
+
+#if defined(BLAKE2B_4WAY)
+
+void blake2b_4way_hash( void *state, const void *input );
+int scanhash_blake2b_4way( struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done, struct thr_info *mythr );
+#else
+
+void blake2b_hash( void *state, const void *input );
+int scanhash_blake2b( struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr );
+
+#endif
+
+#endif
--- a/algo/blake/blake2b-hash-4way.c
+++ b/algo/blake/blake2b-hash-4way.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright 2009 Colin Percival, 2014 savale
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "blake2b-hash-4way.h"
+
+#if defined(__AVX2__)
+
+// G Mixing function.
+
+#define B2B_G(a, b, c, d, x, y) \
+{ \
+   v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), x ); \
+	v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 32 ); \
+	v[c] = _mm256_add_epi64( v[c], v[d] ); \
+	v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 24 ); \
+	v[a] = _mm256_add_epi64( _mm256_add_epi64( v[a], v[b] ), y ); \
+	v[d] = mm256_ror_64( _mm256_xor_si256( v[d], v[a] ), 16 ); \
+	v[c] = _mm256_add_epi64( v[c], v[d] ); \
+	v[b] = mm256_ror_64( _mm256_xor_si256( v[b], v[c] ), 63 ); \
+}
+
+// Initialization Vector.
+/*
+static const uint64_t blake2b_iv[8] = {
+	0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+	0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+	0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+	0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+};
+*/
+
+static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last )
+{
+	const uint8_t sigma[12][16] = {
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+		{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+		{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+		{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+		{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+		{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+		{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+		{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+		{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
+		{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+		{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
+	};
+	int i;
+	__m256i v[16], m[16];
+
+   v[ 0] = ctx->h[0];
+   v[ 1] = ctx->h[1];
+   v[ 2] = ctx->h[2];
+   v[ 3] = ctx->h[3];
+   v[ 4] = ctx->h[4];
+   v[ 5] = ctx->h[5];
+   v[ 6] = ctx->h[6];
+   v[ 7] = ctx->h[7];
+   v[ 8] = m256_const1_64( 0x6A09E667F3BCC908 );
+   v[ 9] = m256_const1_64( 0xBB67AE8584CAA73B );
+   v[10] = m256_const1_64( 0x3C6EF372FE94F82B );
+   v[11] = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   v[12] = m256_const1_64( 0x510E527FADE682D1 );
+   v[13] = m256_const1_64( 0x9B05688C2B3E6C1F );
+   v[14] = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   v[15] = m256_const1_64( 0x5BE0CD19137E2179 );
+
+   v[12] = _mm256_xor_si256( v[12], _mm256_set1_epi64x( ctx->t[0] ) );
+   v[13] = _mm256_xor_si256( v[13], _mm256_set1_epi64x( ctx->t[1] ) );
+
+   if ( last )   
+		v[14] = mm256_not( v[14] );
+
+   m[ 0] = ctx->b[ 0];
+   m[ 1] = ctx->b[ 1];
+   m[ 2] = ctx->b[ 2];
+   m[ 3] = ctx->b[ 3];
+   m[ 4] = ctx->b[ 4];
+   m[ 5] = ctx->b[ 5];
+   m[ 6] = ctx->b[ 6];
+   m[ 7] = ctx->b[ 7];
+   m[ 8] = ctx->b[ 8];
+   m[ 9] = ctx->b[ 9];
+   m[10] = ctx->b[10];
+   m[11] = ctx->b[11];
+   m[12] = ctx->b[12];
+   m[13] = ctx->b[13];
+   m[14] = ctx->b[14];
+   m[15] = ctx->b[15];
+   
+	for ( i = 0; i < 12; i++ )
+   { 
+		B2B_G( 0, 4,  8, 12, m[ sigma[i][ 0] ], m[ sigma[i][ 1] ] );
+		B2B_G( 1, 5,  9, 13, m[ sigma[i][ 2] ], m[ sigma[i][ 3] ] );
+		B2B_G( 2, 6, 10, 14, m[ sigma[i][ 4] ], m[ sigma[i][ 5] ] );
+		B2B_G( 3, 7, 11, 15, m[ sigma[i][ 6] ], m[ sigma[i][ 7] ] );
+		B2B_G( 0, 5, 10, 15, m[ sigma[i][ 8] ], m[ sigma[i][ 9] ] );
+		B2B_G( 1, 6, 11, 12, m[ sigma[i][10] ], m[ sigma[i][11] ] );
+		B2B_G( 2, 7,  8, 13, m[ sigma[i][12] ], m[ sigma[i][13] ] );
+		B2B_G( 3, 4,  9, 14, m[ sigma[i][14] ], m[ sigma[i][15] ] );
+	}
+
+   ctx->h[0] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[0], v[0] ), v[ 8] );
+   ctx->h[1] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[1], v[1] ), v[ 9] );
+   ctx->h[2] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[2], v[2] ), v[10] );
+   ctx->h[3] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[3], v[3] ), v[11] );
+   ctx->h[4] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[4], v[4] ), v[12] );
+   ctx->h[5] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[5], v[5] ), v[13] );
+   ctx->h[6] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[6], v[6] ), v[14] );
+   ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] );
+}
+
+int blake2b_4way_init( blake2b_4way_ctx *ctx ) 
+{
+	size_t i;
+
+   ctx->h[0] = m256_const1_64( 0x6A09E667F3BCC908 );
+   ctx->h[1] = m256_const1_64( 0xBB67AE8584CAA73B );
+   ctx->h[2] = m256_const1_64( 0x3C6EF372FE94F82B );
+   ctx->h[3] = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   ctx->h[4] = m256_const1_64( 0x510E527FADE682D1 );
+   ctx->h[5] = m256_const1_64( 0x9B05688C2B3E6C1F );
+   ctx->h[6] = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   ctx->h[7] = m256_const1_64( 0x5BE0CD19137E2179 );
+
+   ctx->h[0] = _mm256_xor_si256( ctx->h[0], m256_const1_64( 0x01010020 ) );
+
+	ctx->t[0] = 0;
+	ctx->t[1] = 0;
+	ctx->c = 0;
+	ctx->outlen = 32;
+
+   for ( i = 0; i < 16; i++ )
+     ctx->b[i] = m256_zero;
+
+	return 0;
+}
+
+void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+                          size_t inlen ) 
+{
+   __m256i* in =(__m256i*)input;
+
+	size_t i, c;
+   c = ctx->c >> 3; 
+
+	for ( i = 0; i < (inlen >> 3); i++ )
+   {
+		if ( ctx->c == 128 )
+      { 
+			ctx->t[0] += ctx->c;
+			if ( ctx->t[0] < ctx->c )
+				ctx->t[1]++;
+			blake2b_4way_compress( ctx, 0 );
+			ctx->c = 0;
+		}
+      ctx->b[ c++ ] = in[i];
+      ctx->c += 8;
+   }
+}
+
+void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out )
+{
+	size_t c;
+   c = ctx->c >> 3;
+
+	ctx->t[0] += ctx->c;
+	if ( ctx->t[0] < ctx->c )
+		ctx->t[1]++;
+
+	while ( ctx->c < 128 )
+   {
+      ctx->b[c++] = m256_zero;
+      ctx->c += 8;
+   }
+
+   blake2b_4way_compress( ctx, 1 );           // final block flag = 1
+
+   casti_m256i( out, 0 ) = ctx->h[0];
+   casti_m256i( out, 1 ) = ctx->h[1];
+   casti_m256i( out, 2 ) = ctx->h[2];
+   casti_m256i( out, 3 ) = ctx->h[3];
+}
+
+#endif
--- a/algo/blake/blake2b-hash-4way.h
+++ b/algo/blake/blake2b-hash-4way.h
@@ -0,0 +1,35 @@
+#pragma once
+#ifndef __BLAKE2B_HASH_4WAY_H__
+#define __BLAKE2B_HASH_4WAY_H__
+
+#if defined(__AVX2__)
+
+#include "simd-utils.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+// state context
+ALIGN(64) typedef struct {
+	__m256i b[16]; // input buffer
+	__m256i h[8];  // chained state
+	uint64_t t[2];  // total number of bytes
+	size_t c;       // pointer for b[]
+	size_t outlen;  // digest size
+} blake2b_4way_ctx __attribute__((aligned(64)));
+
+int blake2b_4way_init( blake2b_4way_ctx *ctx );
+void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input,
+                          size_t inlen );
+void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out );
+
+#endif
+
+#endif
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -3,13 +3,11 @@
 * tpruvot@github 2015-2016
 */

-#include "algo-gate-api.h"
+#include "blake2b-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake2b.h"

-//static __thread sph_blake2b_ctx s_midstate;
-//static __thread sph_blake2b_ctx s_ctx;
 #define MIDLEN 76
 #define A 64

@@ -25,16 +23,6 @@ void blake2b_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-/*
-static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
-{
-	s_ctx.outlen = MIDLEN;
-	memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN);
-	sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
-	sph_blake2b_final(&s_ctx, (uint8_t*) output);
-}
-*/
-
 int scanhash_blake2b( struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done, struct thr_info *mythr )
 {
@@ -45,7 +33,7 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
   int thr_id = mythr->id;  // thr_id arg is deprecated

 	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[8];
+	const uint32_t first_nonce = pdata[19];

 	uint32_t n = first_nonce;

@@ -53,179 +41,23 @@ int scanhash_blake2b( struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-	// midstate (untested yet)
-	//blake2b_init(&s_midstate, 32, NULL, 0);
-	//blake2b_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
-	//memcpy(&s_ctx, &s_midstate, sizeof(blake2b_ctx));
-
 	do {
-		be32enc(&endiandata[8], n);
+		be32enc(&endiandata[19], n);
 		//blake2b_hash_end(vhashcpu, endiandata);
 		blake2b_hash(vhashcpu, endiandata);

 		if (vhashcpu[7] < Htarg && fulltest(vhashcpu, ptarget)) {
 			work_set_target_ratio(work, vhashcpu);
 			*hashes_done = n - first_nonce + 1;
-			pdata[8] = n;
+			pdata[19] = n;
 			return 1;
 		}
 		n++;

 	} while (n < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = n - first_nonce + 1;
-	pdata[8] = n;
+	pdata[19] = n;

 	return 0;
 }

-static inline void swab256(void *dest_p, const void *src_p)
-{
-	uint32_t *dest = (uint32_t *)dest_p;
-	const uint32_t *src = (uint32_t *)src_p;
-
-	dest[0] = swab32(src[7]);
-	dest[1] = swab32(src[6]);
-	dest[2] = swab32(src[5]);
-	dest[3] = swab32(src[4]);
-	dest[4] = swab32(src[3]);
-	dest[5] = swab32(src[2]);
-	dest[6] = swab32(src[1]);
-	dest[7] = swab32(src[0]);
-}
-
-/* compute nbits to get the network diff */
-void blake2b_calc_network_diff(struct work *work)
-{
-        // sample for diff 43.281 : 1c05ea29
-        uint32_t nbits = work->data[11]; // unsure if correct
-        uint32_t bits = (nbits & 0xffffff);
-        int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
-
-        double d = (double)0x0000ffff / (double)bits;
-        for (int m=shift; m < 29; m++) d *= 256.0;
-        for (int m=29; m < shift; m++) d /= 256.0;
-        if (opt_debug_diff)
-                applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
-        net_diff = d;
-}
-
-void blake2b_be_build_stratum_request( char *req, struct work *work )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime,       nonce;
-   char     ntimestr[9], noncestr[9];
-   be32enc( &ntime, work->data[ algo_gate.ntime_index ] );
-   be32enc( &nonce, work->data[ algo_gate.nonce_index ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   uint16_t high_nonce = swab32(work->data[9]) >> 16;
-   xnonce2str = abin2hex((unsigned char*)(&high_nonce), 2);
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free( xnonce2str );
-}
-
-#define min(a,b) (a>b ? (b) :(a))
-
-// merkle root handled here, no need for gen_merkle_root gate target
-void blake2b_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-    uchar merkle_root[64] = { 0 };
-    uint32_t extraheader[32] = { 0 };
-    int headersize = 0;
-    size_t t;
-    int i;
-
-    // merkle root
-    memcpy( merkle_root, sctx->job.coinbase, 32 );
-    headersize = min( (int)sctx->job.coinbase_size - 32, sizeof(extraheader) );
-    memcpy( extraheader, &sctx->job.coinbase[32], headersize );
-    // Increment extranonce2 
-    for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
-    // Assemble block header 
-    memset( g_work->data, 0, sizeof(g_work->data) );
-//    g_work->data[0] = le32dec( sctx->job.version );
-//    for ( i = 0; i < 8; i++ )
-//       g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
-    for ( i = 0; i < 8; i++ )
-       g_work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i];
-//    for ( i = 0; i < 8; i++ )
-//       g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
-    g_work->data[8]  = 0; // nonce
-    g_work->data[9]  = swab32( extraheader[0] ) | ( rand() & 0xf0 );
-    g_work->data[10] = be32dec( sctx->job.ntime );
-    g_work->data[11] = be32dec( sctx->job.nbits );
-    for ( i = 0; i < 8; i++ )
-       g_work->data[12+i] = ( (uint32_t*)merkle_root )[i];
-}
-
-#undef min
-
-void blake2b_get_new_work( struct work* work, struct work* g_work, int thr_id,
-                           uint32_t* end_nonce_ptr, bool clean_job )
-{
-   const int wkcmp_sz = 32;  // bytes
-   const int wkcmp_off = 32 + 16; 
-   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
-
-   if ( memcmp( &work->data[ wkcmp_off ], &g_work->data[ wkcmp_off ], wkcmp_sz )
-      && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) 
-      || strcmp( work->job_id, g_work->job_id ) ) )
-   {
-      work_free( work );
-      work_copy( work, g_work );
-      *nonceptr = ( 0xffffffffU / opt_n_threads ) * thr_id;
-      if ( opt_randomize )
-         *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
-      *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;
-   }
-   else
-       ++(*nonceptr);
-
-   // suprnova job_id check without data/target/height change...
-   // we just may have copied new g_wwork to work so why this test here?
-//   if (  have_stratum && strcmp( work->job_id, g_work->job_id ) )
-      // exit thread loop
-//      continue;
-//   else
-//   {
-//      nonceptr[1] += 0x10;
-//      nonceptr[1] |= thr_id;
-//   }
-}
-
-bool blake2b_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
-                           int thr_id )
-{
-   if ( have_stratum && strcmp( stratum->job.job_id, work->job_id ) )
-      // need to regen g_work..
-      return false;
-   // extradata: prevent duplicates
-   work->data[ 8     ] += 0x10;
-   work->data[ 8 + 1 ] |= thr_id;
-   return true;
-}
-
-double blake2b_get_max64() { return 0x1fffffLL; }
-
-bool register_blake2b_algo( algo_gate_t* gate )
-{
-  algo_not_tested();
-  gate->ntime_index   = 10;
-  gate->nbits_index   = 11;
-  gate->nonce_index   =  8;
-  gate->work_cmp_size = 32;
-  gate->scanhash              = (void*)&scanhash_blake2b;
-  gate->hash                  = (void*)&blake2b_hash;
-  gate->calc_network_diff     = (void*)&blake2b_calc_network_diff;
-  gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
-  gate->work_decode           = (void*)&std_be_work_decode;
-  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-  gate->build_extraheader     = (void*)&blake2b_build_extraheader;
-  gate->get_new_work          = (void*)&blake2b_get_new_work;
-  gate->get_max64             = (void*)&blake2b_get_max64;
-  gate->ready_to_mine         = (void*)&blake2b_ready_to_mine;
-  have_gbt = false;
-  return true;
-}
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -20,7 +20,7 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->hash      = (void*)&blake2s_hash;
 #endif
  gate->get_max64 = (void*)&blake2s_get_max64;
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
  return true;
 };

--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -4,7 +4,8 @@
 #include <stdint.h>
 #include "algo-gate-api.h"

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)
  #define BLAKE2S_4WAY
 #endif
 #if defined(__AVX2__)
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -17,7 +17,9 @@
 #include <string.h>
 #include <stdio.h>

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)
+

 static const uint32_t blake2s_IV[8] =
 {
@@ -57,8 +59,18 @@ int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
   memset( P->personal, 0, sizeof( P->personal ) );

   memset( S, 0, sizeof( blake2s_4way_state ) );
-   for( int i = 0; i < 8; ++i )
-      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
+
+   S->h[0] = m128_const1_64( 0x6A09E6676A09E667ULL );
+   S->h[1] = m128_const1_64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = m128_const1_64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = m128_const1_64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = m128_const1_64( 0x510E527F510E527FULL );
+   S->h[5] = m128_const1_64( 0x9B05688C9B05688CULL );
+   S->h[6] = m128_const1_64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = m128_const1_64( 0x5BE0CD195BE0CD19ULL );
+   
+//   for( int i = 0; i < 8; ++i )
+//      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );

   uint32_t *p = ( uint32_t * )( P );

@@ -267,8 +279,18 @@ int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
   memset( P->personal, 0, sizeof( P->personal ) );

   memset( S, 0, sizeof( blake2s_8way_state ) );
-   for( int i = 0; i < 8; ++i )
-      S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
+   S->h[0] = m256_const1_64( 0x6A09E6676A09E667ULL );
+   S->h[1] = m256_const1_64( 0xBB67AE85BB67AE85ULL );
+   S->h[2] = m256_const1_64( 0x3C6EF3723C6EF372ULL );
+   S->h[3] = m256_const1_64( 0xA54FF53AA54FF53AULL );
+   S->h[4] = m256_const1_64( 0x510E527F510E527FULL );
+   S->h[5] = m256_const1_64( 0x9B05688C9B05688CULL );
+   S->h[6] = m256_const1_64( 0x1F83D9AB1F83D9ABULL );
+   S->h[7] = m256_const1_64( 0x5BE0CD195BE0CD19ULL );
+
+
+//   for( int i = 0; i < 8; ++i )
+//      S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );

   uint32_t *p = ( uint32_t * )( P );

--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -14,7 +14,8 @@
 #ifndef __BLAKE2S_HASH_4WAY_H__
 #define __BLAKE2S_HASH_4WAY_H__ 1

-#if defined(__SSE4_2__)
+//#if defined(__SSE4_2__)
+#if defined(__SSE2__)

 #include "simd-utils.h"

--- a/algo/blake/blake512-hash-4way.c
+++ b/algo/blake/blake512-hash-4way.c
@@ -307,12 +307,12 @@ static const sph_u64 CB[16] = {

 #define GB_4WAY(m0, m1, c0, c1, a, b, c, d)   do { \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
-                 _mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
+                 _mm256_set1_epi64x( c1 ), m0 ), b ), a ); \
   d = mm256_ror_64( _mm256_xor_si256( d, a ), 32 ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 25 ); \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
-                 _mm256_set_epi64x( c0, c0, c0, c0 ), m1 ), b ), a ); \
+                 _mm256_set1_epi64x( c0 ), m1 ), b ), a ); \
   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi64( c, d ); \
   b = mm256_ror_64( _mm256_xor_si256( b, c ), 11 ); \
@@ -479,20 +479,20 @@ static const sph_u64 CB[16] = {
  V5 = H5; \
  V6 = H6; \
  V7 = H7; \
-  V8 = _mm256_xor_si256( S0, _mm256_set1_epi64x( CB0 ) );  \
-  V9 = _mm256_xor_si256( S1, _mm256_set1_epi64x( CB1 ) );  \
-  VA = _mm256_xor_si256( S2, _mm256_set1_epi64x( CB2 ) );  \
-  VB = _mm256_xor_si256( S3, _mm256_set1_epi64x( CB3 ) );  \
+  V8 = _mm256_xor_si256( S0, m256_const1_64( CB0 ) );  \
+  V9 = _mm256_xor_si256( S1, m256_const1_64( CB1 ) );  \
+  VA = _mm256_xor_si256( S2, m256_const1_64( CB2 ) );  \
+  VB = _mm256_xor_si256( S3, m256_const1_64( CB3 ) );  \
  VC = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                         _mm256_set1_epi64x( CB4 ) );  \
+                         m256_const1_64( CB4 ) );  \
  VD = _mm256_xor_si256( _mm256_set1_epi64x( T0 ), \
-                         _mm256_set1_epi64x( CB5 ) );  \
+                         m256_const1_64( CB5 ) );  \
  VE = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                         _mm256_set1_epi64x( CB6 ) );  \
+                         m256_const1_64( CB6 ) );  \
  VF = _mm256_xor_si256( _mm256_set1_epi64x( T1 ), \
-                         _mm256_set1_epi64x( CB7 ) );  \
-  shuf_bswap64 = _mm256_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
-                                    0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
+                         m256_const1_64( CB7 ) );  \
+  shuf_bswap64 = m256_const_64( 0x08090a0b0c0d0e0f, 0x0001020304050607, \
+                                0x08090a0b0c0d0e0f, 0x0001020304050607 ); \
  M0 = _mm256_shuffle_epi8( *(buf+ 0), shuf_bswap64 ); \
  M1 = _mm256_shuffle_epi8( *(buf+ 1), shuf_bswap64 ); \
  M2 = _mm256_shuffle_epi8( *(buf+ 2), shuf_bswap64 ); \
@@ -544,14 +544,14 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
              const sph_u64 *salt )
 {
   __m256i zero = m256_zero;
-   casti_m256i( sc->H, 0 ) = _mm256_set1_epi64x( iv[0] );
-   casti_m256i( sc->H, 1 ) = _mm256_set1_epi64x( iv[1] );
-   casti_m256i( sc->H, 2 ) = _mm256_set1_epi64x( iv[2] );
-   casti_m256i( sc->H, 3 ) = _mm256_set1_epi64x( iv[3] );
-   casti_m256i( sc->H, 4 ) = _mm256_set1_epi64x( iv[4] );
-   casti_m256i( sc->H, 5 ) = _mm256_set1_epi64x( iv[5] );
-   casti_m256i( sc->H, 6 ) = _mm256_set1_epi64x( iv[6] );
-   casti_m256i( sc->H, 7 ) = _mm256_set1_epi64x( iv[7] );
+   casti_m256i( sc->H, 0 ) = m256_const1_64( 0x6A09E667F3BCC908 );
+   casti_m256i( sc->H, 1 ) = m256_const1_64( 0xBB67AE8584CAA73B );
+   casti_m256i( sc->H, 2 ) = m256_const1_64( 0x3C6EF372FE94F82B );
+   casti_m256i( sc->H, 3 ) = m256_const1_64( 0xA54FF53A5F1D36F1 );
+   casti_m256i( sc->H, 4 ) = m256_const1_64( 0x510E527FADE682D1 );
+   casti_m256i( sc->H, 5 ) = m256_const1_64( 0x9B05688C2B3E6C1F );
+   casti_m256i( sc->H, 6 ) = m256_const1_64( 0x1F83D9ABFB41BD6B );
+   casti_m256i( sc->H, 7 ) = m256_const1_64( 0x5BE0CD19137E2179 );

   casti_m256i( sc->S, 0 ) = zero;
   casti_m256i( sc->S, 1 ) = zero;
@@ -642,11 +642,9 @@ blake64_4way_close( blake_4way_big_context *sc,
       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
          buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
-                                 _mm256_set1_epi64x( 0x0100000000000000ULL ) );
-       *(buf+(112>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(buf+(120>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( tl, tl, tl, tl ) );
+                                 m256_const1_64( 0x0100000000000000ULL ) );
+       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
+       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );

       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
@@ -659,11 +657,9 @@ blake64_4way_close( blake_4way_big_context *sc,
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( buf, 112>>3 ); 
       if ( out_size_w64 == 8 )
-           buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
-       *(buf+(112>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(buf+(120>>3)) = mm256_bswap_64(
-                                    _mm256_set_epi64x( tl, tl, tl, tl ) );
+           buf[104>>3] = m256_const1_64( 0x0100000000000000ULL );
+       *(buf+(112>>3)) = _mm256_set1_epi64x( bswap_64( th ) );
+       *(buf+(120>>3)) = _mm256_set1_epi64x( bswap_64( tl ) );

       blake64_4way( sc, buf, 128 );
   }
--- a/algo/blake/sph_blake2b.c
+++ b/algo/blake/sph_blake2b.c
@@ -103,7 +103,6 @@ static void blake2b_compress( sph_blake2b_ctx *ctx, int last )
 	v[13] ^= ctx->t[1];                 // high 64 bits
 	if (last)                           // last block flag set ?
 		v[14] = ~v[14];
-
 	for (i = 0; i < 16; i++)            // get little-endian words
 		m[i] = B2B_GET64(&ctx->b[8 * i]);

@@ -184,7 +183,8 @@ void sph_blake2b_final( sph_blake2b_ctx *ctx, void *out )

 	while (ctx->c < 128)                // fill up with zeros
 		ctx->b[ctx->c++] = 0;
-	blake2b_compress(ctx, 1);           // final block flag = 1
+
+   blake2b_compress(ctx, 1);           // final block flag = 1

 	// little endian convert and store
 	for (i = 0; i < ctx->outlen; i++) {
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -62,7 +62,7 @@ typedef struct {

 typedef bmw_4way_small_context bmw256_4way_context;

-void bmw256_4way_init(void *cc);
+void bmw256_4way_init( bmw256_4way_context *ctx );

 void bmw256_4way(void *cc, const void *data, size_t len);

--- a/algo/bmw/bmw256-hash-4way.c
+++ b/algo/bmw/bmw256-hash-4way.c
@@ -48,7 +48,7 @@ extern "C"{
 #if defined(__SSE2__)

 // BMW-256 4 way 32
-
+/*
 static const uint32_t IV256[] = {
 	0x40414243, 0x44454647,
 	0x48494A4B, 0x4C4D4E4F,
@@ -59,6 +59,7 @@ static const uint32_t IV256[] = {
 	0x70717273, 0x74757677,
 	0x78797A7B, 0x7C7D7E7F
 };
+*/

 #define ss0(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
@@ -462,13 +463,30 @@ static const __m128i final_s[16] =
   { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
 };
 */
-static void
-bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
+void bmw256_4way_init( bmw256_4way_context *ctx )
 {
-   for ( int i = 0; i < 16; i++ )
-      sc->H[i] = _mm_set1_epi32( iv[i] );
-   sc->ptr = 0;
-   sc->bit_count = 0;
+   ctx->H[ 0] = m128_const1_64( 0x4041424340414243 );
+   ctx->H[ 1] = m128_const1_64( 0x4445464744454647 );
+   ctx->H[ 2] = m128_const1_64( 0x48494A4B48494A4B );
+   ctx->H[ 3] = m128_const1_64( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = m128_const1_64( 0x5051525350515253 );
+   ctx->H[ 5] = m128_const1_64( 0x5455565754555657 );
+   ctx->H[ 6] = m128_const1_64( 0x58595A5B58595A5B );
+   ctx->H[ 7] = m128_const1_64( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = m128_const1_64( 0x6061626360616263 );
+   ctx->H[ 9] = m128_const1_64( 0x6465666764656667 );
+   ctx->H[10] = m128_const1_64( 0x68696A6B68696A6B );
+   ctx->H[11] = m128_const1_64( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = m128_const1_64( 0x7071727370717273 );
+   ctx->H[13] = m128_const1_64( 0x7475767774757677 );
+   ctx->H[14] = m128_const1_64( 0x78797A7B78797A7B );
+   ctx->H[15] = m128_const1_64( 0x7C7D7E7F7C7D7E7F );
+
+
+//   for ( int i = 0; i < 16; i++ )
+//      sc->H[i] = _mm_set1_epi32( iv[i] );
+   ctx->ptr = 0;
+   ctx->bit_count = 0;
 }

 static void
@@ -525,7 +543,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,

   buf = sc->buf;
   ptr = sc->ptr;
-   buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
+   buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
   ptr += 4;
   h = sc->H;

@@ -551,11 +569,13 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
      casti_m128i( dst, u ) = h1[v];
 }

+/*
 void
 bmw256_4way_init(void *cc)
 {
 	bmw32_4way_init(cc, IV256);
 }
+*/

 void
 bmw256_4way(void *cc, const void *data, size_t len)
@@ -1003,25 +1023,24 @@ static const __m256i final_s8[16] =

 void bmw256_8way_init( bmw256_8way_context *ctx )
 {
-   ctx->H[ 0] = _mm256_set1_epi32( IV256[ 0] );
-   ctx->H[ 1] = _mm256_set1_epi32( IV256[ 1] );
-   ctx->H[ 2] = _mm256_set1_epi32( IV256[ 2] );
-   ctx->H[ 3] = _mm256_set1_epi32( IV256[ 3] );
-   ctx->H[ 4] = _mm256_set1_epi32( IV256[ 4] );
-   ctx->H[ 5] = _mm256_set1_epi32( IV256[ 5] );
-   ctx->H[ 6] = _mm256_set1_epi32( IV256[ 6] );
-   ctx->H[ 7] = _mm256_set1_epi32( IV256[ 7] );
-   ctx->H[ 8] = _mm256_set1_epi32( IV256[ 8] );
-   ctx->H[ 9] = _mm256_set1_epi32( IV256[ 9] );
-   ctx->H[10] = _mm256_set1_epi32( IV256[10] );
-   ctx->H[11] = _mm256_set1_epi32( IV256[11] );
-   ctx->H[12] = _mm256_set1_epi32( IV256[12] );
-   ctx->H[13] = _mm256_set1_epi32( IV256[13] );
-   ctx->H[14] = _mm256_set1_epi32( IV256[14] );
-   ctx->H[15] = _mm256_set1_epi32( IV256[15] );
+   ctx->H[ 0] = m256_const1_64( 0x4041424340414243 );
+   ctx->H[ 1] = m256_const1_64( 0x4445464744454647 );
+   ctx->H[ 2] = m256_const1_64( 0x48494A4B48494A4B );
+   ctx->H[ 3] = m256_const1_64( 0x4C4D4E4F4C4D4E4F );
+   ctx->H[ 4] = m256_const1_64( 0x5051525350515253 );
+   ctx->H[ 5] = m256_const1_64( 0x5455565754555657 );
+   ctx->H[ 6] = m256_const1_64( 0x58595A5B58595A5B );
+   ctx->H[ 7] = m256_const1_64( 0x5C5D5E5F5C5D5E5F );
+   ctx->H[ 8] = m256_const1_64( 0x6061626360616263 );
+   ctx->H[ 9] = m256_const1_64( 0x6465666764656667 );
+   ctx->H[10] = m256_const1_64( 0x68696A6B68696A6B );
+   ctx->H[11] = m256_const1_64( 0x6C6D6E6F6C6D6E6F );
+   ctx->H[12] = m256_const1_64( 0x7071727370717273 );
+   ctx->H[13] = m256_const1_64( 0x7475767774757677 );
+   ctx->H[14] = m256_const1_64( 0x78797A7B78797A7B );
+   ctx->H[15] = m256_const1_64( 0x7C7D7E7F7C7D7E7F );
   ctx->ptr       = 0;
   ctx->bit_count = 0;
-
 }

 void bmw256_8way( bmw256_8way_context *ctx, const void *data, size_t len )
@@ -1074,7 +1093,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )

   buf = ctx->buf;
   ptr = ctx->ptr;
-   buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
+   buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
   ptr += 4;
   h = ctx->H;

@@ -1089,7 +1108,6 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )
   buf[ (buf_size - 8) >> 2 ] = _mm256_set1_epi32( ctx->bit_count );
   buf[ (buf_size - 4) >> 2 ] = m256_zero;

-
   compress_small_8way( buf, h, h2 );

   for ( u = 0; u < 16; u ++ )
--- a/algo/bmw/bmw512-gate.c
+++ b/algo/bmw/bmw512-gate.c
@@ -5,8 +5,8 @@ int64_t bmw512_get_max64() { return 0x7ffffLL; }
 bool register_bmw512_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT;
-  gate->set_target      = (void*)&alt_set_target;
  gate->get_max64       = (void*)&bmw512_get_max64;
+  opt_target_factor = 256.0;
 #if defined (BMW512_4WAY)
  gate->scanhash  = (void*)&scanhash_bmw512_4way;
  gate->hash      = (void*)&bmw512hash_4way;
--- a/algo/bmw/bmw512-hash-4way.c
+++ b/algo/bmw/bmw512-hash-4way.c
@@ -961,8 +961,22 @@ static const __m256i final_b[16] =
 static void
 bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
 {
-   for ( int i = 0; i < 16; i++ )
-      sc->H[i] = _mm256_set1_epi64x( iv[i] );
+   sc->H[ 0] = m256_const1_64( 0x8081828384858687 );
+   sc->H[ 1] = m256_const1_64( 0x88898A8B8C8D8E8F );
+   sc->H[ 2] = m256_const1_64( 0x9091929394959697 );
+   sc->H[ 3] = m256_const1_64( 0x98999A9B9C9D9E9F );
+   sc->H[ 4] = m256_const1_64( 0xA0A1A2A3A4A5A6A7 );
+   sc->H[ 5] = m256_const1_64( 0xA8A9AAABACADAEAF );
+   sc->H[ 6] = m256_const1_64( 0xB0B1B2B3B4B5B6B7 );
+   sc->H[ 7] = m256_const1_64( 0xB8B9BABBBCBDBEBF );
+   sc->H[ 8] = m256_const1_64( 0xC0C1C2C3C4C5C6C7 );
+   sc->H[ 9] = m256_const1_64( 0xC8C9CACBCCCDCECF );
+   sc->H[10] = m256_const1_64( 0xD0D1D2D3D4D5D6D7 );
+   sc->H[11] = m256_const1_64( 0xD8D9DADBDCDDDEDF );
+   sc->H[12] = m256_const1_64( 0xE0E1E2E3E4E5E6E7 );
+   sc->H[13] = m256_const1_64( 0xE8E9EAEBECEDEEEF );
+   sc->H[14] = m256_const1_64( 0xF0F1F2F3F4F5F6F7 );
+   sc->H[15] = m256_const1_64( 0xF8F9FAFBFCFDFEFF );
   sc->ptr = 0;
   sc->bit_count = 0;
 }
@@ -1014,13 +1028,11 @@ bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
   __m256i *buf;
   __m256i h1[16], h2[16], *h;
   size_t ptr, u, v;
-   unsigned z;
   const int buf_size = 128;  // bytes of one lane, compatible with len

   buf = sc->buf;
   ptr = sc->ptr;
-   z = 0x80 >> n;
-   buf[ ptr>>3 ] = _mm256_set1_epi64x( z );
+   buf[ ptr>>3 ] = m256_const1_64( 0x80 );
   ptr += 8;
   h = sc->H;

--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -94,19 +94,14 @@ int scanhash_groestl( struct work *work, uint32_t max_nonce,
 	return 0;
 }

-void groestl_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 bool register_dmd_gr_algo( algo_gate_t* gate )
 {
    init_groestl_ctx();
    gate->optimizations   = SSE2_OPT | AES_OPT;
    gate->scanhash        = (void*)&scanhash_groestl;
    gate->hash            = (void*)&groestlhash;
-    gate->set_target      = (void*)&groestl_set_target;
    gate->get_max64       = (void*)&get_max64_0x3ffff;
+    opt_target_factor = 256.0;
    return true;
 };

--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -10,7 +10,7 @@
 #else
  #include "aes_ni/hash-groestl.h"
 #endif
-#include "algo/sha/sph_sha2.h"
+#include <openssl/sha.h>

 typedef struct {
 #ifdef NO_AES_NI
@@ -18,7 +18,7 @@ typedef struct {
 #else
    hashState_groestl       groestl;
 #endif
-    sph_sha256_context sha;
+    SHA256_CTX              sha;
 } myrgr_ctx_holder;

 myrgr_ctx_holder myrgr_ctx;
@@ -28,15 +28,15 @@ void init_myrgr_ctx()
 #ifdef NO_AES_NI
     sph_groestl512_init( &myrgr_ctx.groestl );
 #else
-     init_groestl (&myrgr_ctx.groestl, 64 );
+     init_groestl ( &myrgr_ctx.groestl, 64 );
 #endif
-     sph_sha256_init(&myrgr_ctx.sha);
+     SHA256_Init( &myrgr_ctx.sha );
 }

 void myriad_hash(void *output, const void *input)
 {
-        myrgr_ctx_holder ctx;
-        memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
+   myrgr_ctx_holder ctx;
+   memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );

 	uint32_t _ALIGN(32) hash[16];

@@ -44,23 +44,22 @@ void myriad_hash(void *output, const void *input)
 	sph_groestl512(&ctx.groestl, input, 80);
 	sph_groestl512_close(&ctx.groestl, hash);
 #else
-        update_groestl( &ctx.groestl, (char*)input, 640 );
-        final_groestl( &ctx.groestl, (char*)hash);
+   update_groestl( &ctx.groestl, (char*)input, 640 );
+   final_groestl( &ctx.groestl, (char*)hash);
 #endif

-	sph_sha256(&ctx.sha, hash, 64);
-	sph_sha256_close(&ctx.sha, hash);
+   SHA256_Update( &ctx.sha, (unsigned char*)hash, 64 );
+   SHA256_Final( (unsigned char*)hash, &ctx.sha );

 	memcpy(output, hash, 32);
 }

-int scanhash_myriad( struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr)
+int scanhash_myriad( struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-
 	uint32_t _ALIGN(64) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;
   int thr_id = mythr->id;  // thr_id arg is deprecated
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -8,7 +8,7 @@
 #include <string.h>

 #include "aes_ni/hash-groestl.h"
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"

 typedef struct {
    hashState_groestl       groestl;
--- a/algo/groestl/myrgr-gate.h
+++ b/algo/groestl/myrgr-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__) && !defined(__SHA__)
  #define MYRGR_4WAY
 #endif

--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -15,11 +15,6 @@ pthread_barrier_t hodl_barrier;
 // need to be passed.
 unsigned char *hodl_scratchbuf = NULL;

-void hodl_set_target( struct work* work, double diff )
-{
-     diff_to_target(work->target, diff / 8388608.0 );
-}
-
 void hodl_le_build_stratum_request( char* req, struct work* work,
                                    struct stratum_ctx *sctx ) 
 {
@@ -170,7 +165,6 @@ bool register_hodl_algo( algo_gate_t* gate )
  gate->scanhash              = (void*)&hodl_scanhash;
  gate->get_new_work          = (void*)&hodl_get_new_work;
  gate->longpoll_rpc_call     = (void*)&hodl_longpoll_rpc_call;
-  gate->set_target            = (void*)&hodl_set_target;
  gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
  gate->malloc_txs_request    = (void*)&hodl_malloc_txs_request;
  gate->build_block_header    = (void*)&hodl_build_block_header;
@@ -179,6 +173,7 @@ bool register_hodl_algo( algo_gate_t* gate )
  gate->work_cmp_size         = 76;
  hodl_scratchbuf = (unsigned char*)malloc( 1 << 30 );
  allow_getwork = false;
+  opt_target_factor = 8388608.0;
  return ( hodl_scratchbuf != NULL );
 }

--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -246,18 +246,12 @@ do { \
 	} while (0)
 */

-#define W0(x)   Wz(x, _mm256_set_epi64x( 0x5555555555555555, \
-       0x5555555555555555, 0x5555555555555555, 0x5555555555555555 ), 1 )
-#define W1(x)   Wz(x, _mm256_set_epi64x( 0x3333333333333333, \
-       0x3333333333333333, 0x3333333333333333, 0x3333333333333333 ), 2 )
-#define W2(x)   Wz(x, _mm256_set_epi64x( 0x0F0F0F0F0F0F0F0F, \
-       0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ), 4 )
-#define W3(x)   Wz(x, _mm256_set_epi64x( 0x00FF00FF00FF00FF, \
-       0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ), 8 ) 
-#define W4(x)   Wz(x, _mm256_set_epi64x( 0x0000FFFF0000FFFF, \
-       0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ), 16 )
-#define W5(x)   Wz(x, _mm256_set_epi64x( 0x00000000FFFFFFFF, \
-       0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF ), 32 )
+#define W0(x)   Wz(x, m256_const1_64( 0x5555555555555555 ),  1 )
+#define W1(x)   Wz(x, m256_const1_64( 0x3333333333333333 ),  2 )
+#define W2(x)   Wz(x, m256_const1_64( 0x0F0F0F0F0F0F0F0F ),  4 )
+#define W3(x)   Wz(x, m256_const1_64( 0x00FF00FF00FF00FF ),  8 ) 
+#define W4(x)   Wz(x, m256_const1_64( 0x0000FFFF0000FFFF ), 16 )
+#define W5(x)   Wz(x, m256_const1_64( 0x00000000FFFFFFFF ), 32 )
 #define W6(x) \
 do { \
   __m256i t = x ## h; \
@@ -331,14 +325,14 @@ do { \
 	__m256i m2l = buf[5]; \
 	__m256i m3h = buf[6]; \
 	__m256i m3l = buf[7]; \
-        h0h = _mm256_xor_si256( h0h, m0h ); \
-        h0l = _mm256_xor_si256( h0l, m0l ); \
-        h1h = _mm256_xor_si256( h1h, m1h ); \
-        h1l = _mm256_xor_si256( h1l, m1l ); \
-        h2h = _mm256_xor_si256( h2h, m2h ); \
-        h2l = _mm256_xor_si256( h2l, m2l ); \
-        h3h = _mm256_xor_si256( h3h, m3h ); \
-        h3l = _mm256_xor_si256( h3l, m3l ); \
+   h0h = _mm256_xor_si256( h0h, m0h ); \
+   h0l = _mm256_xor_si256( h0l, m0l ); \
+   h1h = _mm256_xor_si256( h1h, m1h ); \
+   h1l = _mm256_xor_si256( h1l, m1l ); \
+   h2h = _mm256_xor_si256( h2h, m2h ); \
+   h2l = _mm256_xor_si256( h2l, m2l ); \
+   h3h = _mm256_xor_si256( h3h, m3h ); \
+   h3l = _mm256_xor_si256( h3l, m3l ); \

 #define INPUT_BUF2 \
   h4h = _mm256_xor_si256( h4h, m0h ); \
@@ -477,13 +471,48 @@ static const sph_u64 IV512[] = {

 #endif

-static void
-jh_4way_init( jh_4way_context *sc, const void *iv )
+void jh256_4way_init( jh_4way_context *sc )
 {
-    uint64_t *v = (uint64_t*)iv;
-    
-    for ( int i = 0; i < 16; i++ )
-        sc->H[i] = _mm256_set_epi64x( v[i], v[i], v[i], v[i] );
+    // bswapped IV256
+    sc->H[ 0] = m256_const1_64( 0xebd3202c41a398eb );
+    sc->H[ 1] = m256_const1_64( 0xc145b29c7bbecd92 );
+    sc->H[ 2] = m256_const1_64( 0xfac7d4609151931c );
+    sc->H[ 3] = m256_const1_64( 0x038a507ed6820026 );
+    sc->H[ 4] = m256_const1_64( 0x45b92677269e23a4 );
+    sc->H[ 5] = m256_const1_64( 0x77941ad4481afbe0 );
+    sc->H[ 6] = m256_const1_64( 0x7a176b0226abb5cd );
+    sc->H[ 7] = m256_const1_64( 0xa82fff0f4224f056 );
+    sc->H[ 8] = m256_const1_64( 0x754d2e7f8996a371 );
+    sc->H[ 9] = m256_const1_64( 0x62e27df70849141d );
+    sc->H[10] = m256_const1_64( 0x948f2476f7957627 );
+    sc->H[11] = m256_const1_64( 0x6c29804757b6d587 );
+    sc->H[12] = m256_const1_64( 0x6c0d8eac2d275e5c );
+    sc->H[13] = m256_const1_64( 0x0f7a0557c6508451 );
+    sc->H[14] = m256_const1_64( 0xea12247067d3e47b );
+    sc->H[15] = m256_const1_64( 0x69d71cd313abe389 );
+    sc->ptr = 0;
+    sc->block_count = 0;
+}
+
+void jh512_4way_init( jh_4way_context *sc )
+{
+    // bswapped IV512
+    sc->H[ 0] = m256_const1_64( 0x17aa003e964bd16f );
+    sc->H[ 1] = m256_const1_64( 0x43d5157a052e6a63 );
+    sc->H[ 2] = m256_const1_64( 0x0bef970c8d5e228a );
+    sc->H[ 3] = m256_const1_64( 0x61c3b3f2591234e9 );
+    sc->H[ 4] = m256_const1_64( 0x1e806f53c1a01d89 );
+    sc->H[ 5] = m256_const1_64( 0x806d2bea6b05a92a );
+    sc->H[ 6] = m256_const1_64( 0xa6ba7520dbcc8e58 );
+    sc->H[ 7] = m256_const1_64( 0xf73bf8ba763a0fa9 );
+    sc->H[ 8] = m256_const1_64( 0x694ae34105e66901 );
+    sc->H[ 9] = m256_const1_64( 0x5ae66f2e8e8ab546 );
+    sc->H[10] = m256_const1_64( 0x243c84c1d0a74710 );
+    sc->H[11] = m256_const1_64( 0x99c15a2db1716e3b );
+    sc->H[12] = m256_const1_64( 0x56f8b19decf657cf );
+    sc->H[13] = m256_const1_64( 0x56b116577c8806a7 );
+    sc->H[14] = m256_const1_64( 0xfb1785e6dffcc2e3 );
+    sc->H[15] = m256_const1_64( 0x4bdd8ccc78465a54 );
    sc->ptr = 0;
    sc->block_count = 0;
 }
@@ -542,7 +571,7 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
   size_t numz, u;
   sph_u64 l0, l1, l0e, l1e;

-   buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 );
+   buf[0] = m256_const1_64( 0x80ULL );

   if ( sc->ptr == 0 )
       numz = 48;
@@ -555,8 +584,8 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
   l1 = SPH_T64(sc->block_count >> 55);
   sph_enc64be( &l0e, l0 );
   sph_enc64be( &l1e, l1 );
-   *(buf + (numz>>3)    ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e );
-   *(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e ); 
+   *(buf + (numz>>3)    ) = _mm256_set1_epi64x( l1e );
+   *(buf + (numz>>3) + 1) = _mm256_set1_epi64x( l0e ); 

   jh_4way_core( sc, buf, numz + 16 );

@@ -566,11 +595,13 @@ jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
    memcpy_256( dst256, buf, 8 );
 }

+/*
 void
 jh256_4way_init(void *cc)
 {
-	jh_4way_init(cc, IV256);
+	jhs_4way_init(cc, IV256);
 }
+*/

 void
 jh256_4way(void *cc, const void *data, size_t len)
@@ -584,11 +615,13 @@ jh256_4way_close(void *cc, void *dst)
 	jh_4way_close(cc, 0, 0, dst, 8, IV256);
 }

+/*
 void
 jh512_4way_init(void *cc)
 {
-	jh_4way_init(cc, IV512);
+	jhb_4way_init(cc, IV512);
 }
+*/

 void
 jh512_4way(void *cc, const void *data, size_t len)
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -79,13 +79,13 @@ typedef jh_4way_context jh256_4way_context;

 typedef jh_4way_context jh512_4way_context;

-void jh256_4way_init(void *cc);
+void jh256_4way_init( jh_4way_context *sc);

 void jh256_4way(void *cc, const void *data, size_t len);

 void jh256_4way_close(void *cc, void *dst);

-void jh512_4way_init(void *cc);
+void jh512_4way_init( jh_4way_context *sc );

 void jh512_4way(void *cc, const void *data, size_t len);

--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -12,7 +12,7 @@ bool register_jha_algo( algo_gate_t* gate )
  gate->hash             = (void*)&jha_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  gate->set_target       = (void*)&scrypt_set_target;
+  opt_target_factor = 65536.0;
  return true;
 };

--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -39,10 +39,10 @@ int scanhash_keccak_4way( struct work *work, uint32_t max_nonce,
      keccakhash_4way( hash, vdata );

      for ( int lane = 0; lane < 4; lane++ )
-      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
+      if ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 )
      {
          extr_lane_4x64( lane_hash, hash, lane, 256 );
-          if ( fulltest( lane_hash, ptarget ) )
+          if ( fulltest( lane_hash, ptarget ) && !opt_benchmark )
          {
              pdata[19] = n + lane;
              submit_lane_solution( work, lane_hash, mythr, lane );
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -1,18 +1,13 @@
 #include "keccak-gate.h"

-void keccak_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
-}
-
 int64_t keccak_get_max64() { return 0x7ffffLL; }

 bool register_keccak_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&keccak_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
+  opt_target_factor = 128.0;
 #if defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
@@ -23,17 +18,12 @@ bool register_keccak_algo( algo_gate_t* gate )
  return true;
 };

-void keccakc_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 bool register_keccakc_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT;
  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
-  gate->set_target      = (void*)&keccakc_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
+  opt_target_factor = 256.0;
 #if defined (KECCAK_4WAY)
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -370,18 +370,23 @@ static const sph_u64 RC[] = {

 static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
 {
-   int i;
-   for (i = 0; i < 25; i ++)
-          kc->w[i] = _mm256_setzero_si256();
+   __m256i zero = m256_zero;
+   __m256i neg1 = m256_neg1;

   // Initialization for the "lane complement".
-   kc->w[ 1] = m256_neg1;
-   kc->w[ 2] = m256_neg1;
-   kc->w[ 8] = m256_neg1;
-   kc->w[12] = m256_neg1;
-   kc->w[17] = m256_neg1;
-   kc->w[20] = m256_neg1;
-   kc->ptr = 0;
+   kc->w[ 0] = zero;   kc->w[ 1] = neg1;
+   kc->w[ 2] = neg1;   kc->w[ 3] = zero;
+   kc->w[ 4] = zero;   kc->w[ 5] = zero;
+   kc->w[ 6] = zero;   kc->w[ 7] = zero;
+   kc->w[ 8] = neg1;   kc->w[ 9] = zero;
+   kc->w[10] = zero;   kc->w[11] = zero;
+   kc->w[12] = neg1;   kc->w[13] = zero;
+   kc->w[14] = zero;   kc->w[15] = zero;
+   kc->w[16] = zero;   kc->w[17] = neg1;
+   kc->w[18] = zero;   kc->w[19] = zero;
+   kc->w[20] = neg1;   kc->w[21] = zero;
+   kc->w[22] = zero;   kc->w[23] = zero;
+   kc->w[24] = zero;   kc->ptr = 0;
   kc->lim = 200 - (out_size >> 2);
 }

@@ -441,8 +446,8 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    eb = 0x100  >> 8;
    if ( kc->ptr == (lim - 8) )
    {
-        uint64_t t = eb | 0x8000000000000000;
-        u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
+        const uint64_t t = eb | 0x8000000000000000;
+        u.tmp[0] = m256_const1_64( t );
        j = 8;
    }
    else
@@ -450,8 +455,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
        j = lim - kc->ptr;
        u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
        memset_zero_256( u.tmp + 1, (j>>3) - 2 );
-        u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000,
-                0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
+        u.tmp[ (j>>3) - 1] = m256_const1_64( 0x8000000000000000 );
    }
    keccak64_core( kc, u.tmp, j, lim );
    /* Finalize the "lane complement" */
@@ -461,9 +465,7 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    NOT64( kc->w[12], kc->w[12] );
    NOT64( kc->w[17], kc->w[17] );
    NOT64( kc->w[20], kc->w[20] );
-    for ( j = 0; j < m256_len; j++ )
-         u.tmp[j] =  kc->w[j]; 
-    memcpy_256( dst, u.tmp, m256_len );
+    memcpy_256( dst, kc->w, m256_len );
 }

 void keccak256_4way_init( void *kc )
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -71,7 +71,7 @@ bool register_lyra2rev3_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE2_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
-  gate->set_target        = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -105,7 +105,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
-  gate->set_target        = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -128,7 +128,7 @@ bool register_lyra2z_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE42_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -148,7 +148,7 @@ bool register_lyra2h_algo( algo_gate_t* gate )
 #endif
  gate->optimizations = SSE42_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -168,8 +168,8 @@ bool register_allium_algo( algo_gate_t* gate )
  gate->hash      = (void*)&allium_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
-  gate->set_target        = (void*)&alt_set_target;
  gate->get_max64         = (void*)&allium_get_max64_0xFFFFLL;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -182,6 +182,7 @@ int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }

 void phi2_decode_extra_data( struct work *work )
 {
+   phi2_use_roots = false;
   if ( work->data[0] & ( 1<<30 ) ) phi2_use_roots = true;
   else for ( int i = 20; i < 36; i++ )
   {
@@ -213,8 +214,8 @@ bool register_phi2_algo( algo_gate_t* gate )
   gate->get_work_data_size = (void*)&phi2_get_work_data_size;
   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;
   gate->build_extraheader  = (void*)&phi2_build_extraheader;
-   gate->set_target         = (void*)&alt_set_target; 
   gate->get_max64          = (void*)&get_max64_0xffffLL;
+   opt_target_factor = 256.0;
 #if defined(PHI2_4WAY)
   gate->scanhash           = (void*)&scanhash_phi2_4way;
 #else
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -60,7 +60,7 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-   int64_t i; //auxiliary iteration counter
+//   int64_t i; //auxiliary iteration counter
   int64_t v64; // 64bit var for memcpy
   //====================================================================/

@@ -128,17 +128,22 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)

-   initState( state );
+//   initState( state );

   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
   
   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
   for (i = 0; i < nBlocksInput; i++)
   {
       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
+*/
+
   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

@@ -227,7 +232,7 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-   int64_t i; //auxiliary iteration counter
+//   int64_t i; //auxiliary iteration counter
   int64_t v64; // 64bit var for memcpy
   uint64_t instance = 0;
   //====================================================================/
@@ -302,17 +307,21 @@ int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)

-   initState( state );
+//   initState( state );

   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits

   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
   for (i = 0; i < nBlocksInput; i++)
   {
       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
+*/
   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

@@ -405,7 +414,7 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
    int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
    int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
    int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
-    int64_t i; //auxiliary iteration counter
+//    int64_t i; //auxiliary iteration counter
    //=======================================================================/

    //======= Initializing the Memory Matrix and pointers to it =============//
@@ -459,17 +468,21 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 //        if (state == NULL) {
 //                return -1;
 //        }
-    initState( state );
+//    initState( state );

    //============================== Setup Phase =============================//
    //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
-        uint64_t *ptrWord = wholeMatrix;
+    uint64_t *ptrWord = wholeMatrix;
+
+    absorbBlockBlake2Safe( state, ptrWord, nBlocksInput,
+                           BLOCK_LEN_BLAKE2_SAFE_INT64 );
+/*
    for ( i = 0; i < nBlocksInput; i++ )
    {
      absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
      ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
    }
-
+*/
    //Initializes M[0] and M[1]
        reducedSqueezeRow0(state, &wholeMatrix[0], nCols); //The locally copied password is most likely overwritten here
        reducedDuplexRow1(state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64], nCols);
@@ -623,17 +636,21 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
   //================= Initializing the Sponge State ====================//
   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)

-   initState( state );
+//   initState( state );

   //========================= Setup Phase =============================//
   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits

   ptrWord = wholeMatrix;
+
+   absorbBlockBlake2Safe( state, ptrWord, nBlocksInput, BLOCK_LEN );
+/*
   for (i = 0; i < nBlocksInput; i++)
   {
       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
   }
+*/
   //Initializes M[0] and M[1]
   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here

--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -118,11 +118,6 @@ int64_t lyra2re_get_max64 ()
  return 0xffffLL;
 }

-void lyra2re_set_target ( struct work* work, double job_diff )
-{
-   work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
-}
-
 bool register_lyra2re_algo( algo_gate_t* gate )
 {
  init_lyra2re_ctx();
@@ -130,7 +125,7 @@ bool register_lyra2re_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2re;
  gate->hash       = (void*)&lyra2re_hash;
  gate->get_max64  = (void*)&lyra2re_get_max64;
-  gate->set_target = (void*)&lyra2re_set_target;
+  opt_target_factor = 128.0;
  return true;
 };

--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -86,7 +86,7 @@ void lyra2rev3_8way_hash( void *state, const void *input )

   }

-int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev3_8way( struct work *work, const uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t hash[8*8] __attribute__ ((aligned (64)));
@@ -94,12 +94,12 @@ int scanhash_lyra2rev3_8way( struct work *work, uint32_t max_nonce,
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;  // thr_id arg is deprecated

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -186,7 +186,7 @@ void lyra2rev3_4way_hash( void *state, const void *input )
   bmw256_4way_close( &ctx.bmw, state );
 }

-int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
+int scanhash_lyra2rev3_4way( struct work *work, const uint32_t max_nonce,
                             uint64_t *hashes_done, struct thr_info *mythr ) 
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
@@ -194,12 +194,12 @@ int scanhash_lyra2rev3_4way( struct work *work, uint32_t max_nonce,
   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+   const uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;  // thr_id arg is deprecated
   
   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
--- a/algo/lyra2/lyra2rev3.c
+++ b/algo/lyra2/lyra2rev3.c
@@ -32,27 +32,27 @@ void l2v3_blake256_midstate( const void* input )

 void lyra2rev3_hash( void *state, const void *input )
 {
-        lyra2v3_ctx_holder ctx __attribute__ ((aligned (64))); 
-        memcpy( &ctx, &lyra2v3_ctx, sizeof(lyra2v3_ctx) );
-        uint8_t hash[128] __attribute__ ((aligned (64)));
-        #define hashA hash
-        #define hashB hash+64
-        const int midlen = 64;            // bytes
-        const int tail   = 80 - midlen;   // 16
+   lyra2v3_ctx_holder ctx __attribute__ ((aligned (64))); 
+   memcpy( &ctx, &lyra2v3_ctx, sizeof(lyra2v3_ctx) );
+   uint8_t hash[128] __attribute__ ((aligned (64)));
+   #define hashA hash
+   #define hashB hash+64
+   const int midlen = 64;            // bytes
+   const int tail   = 80 - midlen;   // 16

-        memcpy( &ctx.blake, &l2v3_blake_mid, sizeof l2v3_blake_mid );
-	sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
-	sph_blake256_close( &ctx.blake, hash );
+   memcpy( &ctx.blake, &l2v3_blake_mid, sizeof l2v3_blake_mid );
+   sph_blake256( &ctx.blake, (uint8_t*)input + midlen, tail );
+   sph_blake256_close( &ctx.blake, hash );

-        LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );

-        cubehashUpdateDigest( &ctx.cube, (byte*) hashA,
-                              (const byte*) hash, 32 );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hashA,
+                         (const byte*) hash, 32 );

-	LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );

-	sph_bmw256( &ctx.bmw, hash, 32 );
-	sph_bmw256_close( &ctx.bmw, hash );
+   sph_bmw256( &ctx.bmw, hash, 32 );
+   sph_bmw256_close( &ctx.bmw, hash );

 	memcpy( state, hash, 32 );
 }
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -53,11 +53,6 @@ int scanhash_lyra2z330( struct work *work, uint32_t max_nonce,
   return 0;
 }

-void lyra2z330_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 bool lyra2z330_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 256; // nCols
@@ -76,7 +71,7 @@ bool register_lyra2z330_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_lyra2z330;
  gate->hash       = (void*)&lyra2z330_hash;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&lyra2z330_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -40,29 +40,32 @@
 */
 inline void initState( uint64_t State[/*16*/] )
 {
+
+   /*
 #if defined (__AVX2__)

  __m256i* state = (__m256i*)State;
- 
-  state[0] = _mm256_setzero_si256();
-  state[1] = _mm256_setzero_si256();
-  state[2] = _mm256_set_epi64x( blake2b_IV[3], blake2b_IV[2],
-                                blake2b_IV[1], blake2b_IV[0] );
-  state[3] = _mm256_set_epi64x( blake2b_IV[7], blake2b_IV[6],
-                                blake2b_IV[5], blake2b_IV[4] );
+  const __m256i zero = m256_zero; 
+  state[0] = zero;
+  state[1] = zero;
+  state[2] = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
+                            0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state[3] = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
+                            0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );

 #elif defined (__SSE2__)

  __m128i* state = (__m128i*)State;
+  const __m128i zero = m128_zero;   

-  state[0] = _mm_setzero_si128();
-  state[1] = _mm_setzero_si128();
-  state[2] = _mm_setzero_si128();
-  state[3] = _mm_setzero_si128();
-  state[4] = _mm_set_epi64x( blake2b_IV[1], blake2b_IV[0] );
-  state[5] = _mm_set_epi64x( blake2b_IV[3], blake2b_IV[2] );
-  state[6] = _mm_set_epi64x( blake2b_IV[5], blake2b_IV[4] );
-  state[7] = _mm_set_epi64x( blake2b_IV[7], blake2b_IV[6] );
+  state[0] = zero;
+  state[1] = zero;
+  state[2] = zero;
+  state[3] = zero;
+  state[4] = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state[5] = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
+  state[6] = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state[7] = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );

 #else
    //First 512 bis are zeros
@@ -77,6 +80,8 @@ inline void initState( uint64_t State[/*16*/] )
    State[14] = blake2b_IV[6];
    State[15] = blake2b_IV[7];
 #endif
+*/
+
 }

 /**
@@ -250,43 +255,76 @@ inline void absorbBlock( uint64_t *State, const uint64_t *In )
 * @param state The current state of the sponge
 * @param in    The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
 */
-inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In )
+inline void absorbBlockBlake2Safe( uint64_t *State, const uint64_t *In,
+                      const uint64_t nBlocks, const uint64_t block_len )
 {
-    //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
+// XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with
+// the IV.
 #if defined (__AVX2__)

-    register __m256i state0, state1, state2, state3;
+  register __m256i state0, state1, state2, state3;
+  const __m256i zero = m256_zero;
+
+  state0 = zero;
+  state1 = zero;
+  state2 = m256_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL,
+                          0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state3 = m256_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL,
+                          0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+
+  for ( int i = 0; i < nBlocks; i++ )
+  { 
    __m256i *in = (__m256i*)In;
-
-    state0 = _mm256_load_si256( (__m256i*)State     );
-    state1 = _mm256_load_si256( (__m256i*)State + 1 );
-    state2 = _mm256_load_si256( (__m256i*)State + 2 );
-    state3 = _mm256_load_si256( (__m256i*)State + 3 );
-
    state0 = _mm256_xor_si256( state0, in[0] );
    state1 = _mm256_xor_si256( state1, in[1] );

    LYRA_12_ROUNDS_AVX2( state0, state1, state2, state3 );
+    In += block_len;
+  }

-    _mm256_store_si256( (__m256i*)State,     state0 );
-    _mm256_store_si256( (__m256i*)State + 1, state1 );
-    _mm256_store_si256( (__m256i*)State + 2, state2 );
-    _mm256_store_si256( (__m256i*)State + 3, state3 );
+  _mm256_store_si256( (__m256i*)State,     state0 );
+  _mm256_store_si256( (__m256i*)State + 1, state1 );
+  _mm256_store_si256( (__m256i*)State + 2, state2 );
+  _mm256_store_si256( (__m256i*)State + 3, state3 );

 #elif defined (__SSE2__)

-    __m128i* state = (__m128i*)State;
+  __m128i state0, state1, state2, state3, state4, state5, state6, state7;
+  const __m128i zero = m128_zero;
+
+  state0 = zero;
+  state1 = zero;
+  state2 = zero;
+  state3 = zero;
+  state4 = m128_const_64( 0xbb67ae8584caa73bULL, 0x6a09e667f3bcc908ULL );
+  state5 = m128_const_64( 0xa54ff53a5f1d36f1ULL, 0x3c6ef372fe94f82bULL );
+  state6 = m128_const_64( 0x9b05688c2b3e6c1fULL, 0x510e527fade682d1ULL );
+  state7 = m128_const_64( 0x5be0cd19137e2179ULL, 0x1f83d9abfb41bd6bULL );
+
+  for ( int i = 0; i < nBlocks; i++ )
+  { 
    __m128i* in    = (__m128i*)In;

-    state[0] = _mm_xor_si128( state[0], in[0] );
-    state[1] = _mm_xor_si128( state[1], in[1] );
-    state[2] = _mm_xor_si128( state[2], in[2] );
-    state[3] = _mm_xor_si128( state[3], in[3] );
+    state0 = _mm_xor_si128( state0, in[0] );
+    state1 = _mm_xor_si128( state1, in[1] );
+    state2 = _mm_xor_si128( state2, in[2] );
+    state3 = _mm_xor_si128( state3, in[3] );

    //Applies the transformation f to the sponge's state
-    LYRA_12_ROUNDS_AVX( state[0], state[1], state[2], state[3],
-                        state[4], state[5], state[6], state[7] );
+    LYRA_12_ROUNDS_AVX( state0, state1, state2, state3,
+                        state4, state5, state6, state7 );
+    In += block_len;
+  }

+  _mm_store_si128( (__m128i*)State,     state0 );
+  _mm_store_si128( (__m128i*)State + 1, state1 );
+  _mm_store_si128( (__m128i*)State + 2, state2 );
+  _mm_store_si128( (__m128i*)State + 3, state3 );
+  _mm_store_si128( (__m128i*)State + 4, state4 );
+  _mm_store_si128( (__m128i*)State + 5, state5 );
+  _mm_store_si128( (__m128i*)State + 6, state6 );
+  _mm_store_si128( (__m128i*)State + 7, state7 );
+  
 #else

    State[0] ^= In[0];
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -170,7 +170,8 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);

 //---- Absorbs
 void absorbBlock(uint64_t *state, const uint64_t *in);
-void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
+void absorbBlockBlake2Safe( uint64_t *state, const uint64_t *in,
+                            const uint64_t nBlocks, const uint64_t block_len );

 //---- Duplexes
 void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -19,100 +19,89 @@
 #define EPS1 DBL_EPSILON
 #define EPS2 3.0e-11

-inline double exp_n(double xt)
+inline double exp_n( double xt )
 {
-    if(xt < -700.0)
+    if ( xt < -700.0 )
        return 0;
-    else if(xt > 700.0)
+    else if ( xt > 700.0 )
        return 1e200;
-    else if(xt > -0.8e-8 && xt < 0.8e-8)
-        return (1.0 + xt);
+    else if ( xt > -0.8e-8 && xt < 0.8e-8 )
+        return ( 1.0 + xt );
    else
-        return exp(xt);
+        return exp( xt );
 }

-inline double exp_n2(double x1, double x2)
+inline double exp_n2( double x1, double x2 )
 {
-    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8, p5 = 37., p6 = 700.;
+    double p1 = -700., p2 = -37., p3 = -0.8e-8, p4 = 0.8e-8,
+           p5 = 37., p6 = 700.;
    double xt = x1 - x2;
-    if (xt < p1+1.e-200)
+    if ( xt < p1+1.e-200 )
        return 1.;
-    else if (xt > p1 && xt < p2 + 1.e-200)
+    else if ( xt > p1 && xt < p2 + 1.e-200 )
        return ( 1. - exp(xt) );
-    else if (xt > p2 && xt < p3 + 1.e-200)
-        return ( 1. / (1. + exp(xt)) );
-    else if (xt > p3 && xt < p4)
+    else if ( xt > p2 && xt < p3 + 1.e-200 )
+        return ( 1. / ( 1. + exp(xt) ) );
+    else if ( xt > p3 && xt < p4 )
        return ( 1. / (2. + xt) );
-    else if (xt > p4 - 1.e-200 && xt < p5)
-        return ( exp(-xt) / (1. + exp(-xt)) );
-    else if (xt > p5 - 1.e-200 && xt < p6)
+    else if ( xt > p4 - 1.e-200 && xt < p5 )
+        return ( exp(-xt) / ( 1. + exp(-xt) ) );
+    else if ( xt > p5 - 1.e-200 && xt < p6 )
        return ( exp(-xt) );
-    else if (xt > p6 - 1.e-200)
+    else if ( xt > p6 - 1.e-200 )
        return 0.;
 }

-double swit2_(double wvnmb)
+double swit2_( double wvnmb )
 {
-    return pow( (5.55243*(exp_n(-0.3*wvnmb/15.762) - exp_n(-0.6*wvnmb/15.762)))*wvnmb, 0.5) 
-	  / 1034.66 * pow(sin(wvnmb/65.), 2.);
+    return pow( ( 5.55243 * ( exp_n( -0.3 * wvnmb / 15.762 )
+                - exp_n( -0.6 * wvnmb / 15.762 ) ) ) * wvnmb, 0.5 ) 
+	        / 1034.66 * pow( sin( wvnmb / 65. ), 2. );
 }

-
-double GaussianQuad_N2(const double x1, const double x2)
+double GaussianQuad_N2( const double x1, const double x2 )
 {
-    double s=0.0;
+    double s = 0.0;
    double x[6], w[6];
    //gauleg(a2, b2, x, w);
    
    double z1, z, xm, xl, pp, p3, p2, p1;
-    xm=0.5*(x2+x1);
-    xl=0.5*(x2-x1);
-    for(int i=1;i<=3;i++)
+    xm = 0.5 * ( x2 + x1 );
+    xl = 0.5 * ( x2 - x1 );
+    for( int i = 1; i <= 3; i++ )
    {
-		z = (i == 1) ? 0.909632 : -0.0;
-		z = (i == 2) ? 0.540641 : z;
-	    do
+      z = (i == 2) ? 0.540641 : ( (i == 1) ? 0.909632 : -0.0 );
+      do
 	    {
-			p1 = z;
-			p2 = 1;
-			p3 = 0;
-			
-			p3=1;
-			p2=z;
-			p1=((3.0 * z * z) - 1) / 2;
-			
-			p3=p2;
-			p2=p1;
-			p1=((5.0 * z * p2) - (2.0 * z)) / 3;
-			
-			p3=p2;
-			p2=p1;
-			p1=((7.0 * z * p2) - (3.0 * p3)) / 4;
-			
-			p3=p2;
-			p2=p1;
-			p1=((9.0 * z * p2) - (4.0 * p3)) / 5;
-		    
-		    pp=5*(z*p1-p2)/(z*z-1.0);
-		    z1=z;
-		    z=z1-p1/pp;
-	    } while (fabs(z-z1) > 3.0e-11);
+			p1 = ( ( 3.0 * z * z ) - 1 ) / 2;
+			p2 = p1;
+         p1 = ( ( 5.0 * z * p2 ) - ( 2.0 * z ) ) / 3;
+			p3 = p2;
+			p2 = p1;
+			p1 = ( ( 7.0 * z * p2 ) - ( 3.0 * p3 ) ) / 4;
+			p3 = p2;
+			p2 = p1;
+			p1 = ( ( 9.0 * z * p2 ) - ( 4.0 * p3 ) ) / 5;
+         pp = 5 * ( z * p1 - p2 ) / ( z * z - 1.0 );
+		   z1 = z;
+		   z = z1 - p1 / pp;
+	    } while ( fabs( z - z1 ) > 3.0e-11 );
 	    
-	    x[i]=xm-xl*z;
-	    x[5+1-i]=xm+xl*z;
-	    w[i]=2.0*xl/((1.0-z*z)*pp*pp);
-	    w[5+1-i]=w[i];
+	    x[i]       = xm - xl * z;
+	    x[ 5+1-i ] = xm + xl * z;
+	    w[i]       = 2.0 * xl / ( ( 1.0 - z * z ) * pp * pp );
+	    w[ 5+1-i ] = w [i];
    }
    
-    for(int j=1; j<=5; j++) s += w[j]*swit2_(x[j]);
+    for( int j = 1; j <= 5; j++ ) s += w[j] * swit2_( x[j] );
    
    return s;
 }

-uint32_t sw2_(int nnounce)
+uint32_t sw2_( int nnounce )
 {
-    double wmax = ((sqrt((double)(nnounce))*(1.+EPSa))/450+100);
-    return ((uint32_t)(GaussianQuad_N2(0., wmax)*(1.+EPSa)*1.e6));
+    double wmax = ( ( sqrt( (double)(nnounce) ) * ( 1.+EPSa ) ) / 450+100 );
+    return ( (uint32_t)( GaussianQuad_N2( 0., wmax ) * ( 1.+EPSa ) * 1.e6 ) );
 }

 typedef struct {
@@ -334,9 +323,9 @@ bool register_m7m_algo( algo_gate_t *gate )
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
  gate->work_decode           = (void*)&std_be_work_decode;
  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
-  gate->set_target            = (void*)&scrypt_set_target;
  gate->get_max64             = (void*)&get_max64_0x1ffff;
  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
+  opt_target_factor = 65536.0;
  return true;
 }

--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -49,7 +49,7 @@ void anime_4way_hash( void *state, const void *input )
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    const uint32_t mask = 8;
-    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
+    const __m256i bit3_mask = m256_const1_64( 8 );
    const __m256i zero = _mm256_setzero_si256();
    anime_4way_ctx_holder ctx;
    memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
--- a/algo/quark/hmq1725-4way.c
+++ b/algo/quark/hmq1725-4way.c
@@ -21,7 +21,7 @@
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/haval-hash-4way.h"
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"

 union _hmq1725_4way_context_overlay
 {
@@ -57,7 +57,7 @@ extern void hmq1725_4way_hash(void *state, const void *input)
     uint32_t vhashB[32<<2] __attribute__ ((aligned (64)));
     hmq1725_4way_context_overlay ctx __attribute__ ((aligned (64)));
     __m256i vh_mask;     
-     const __m256i vmask = _mm256_set1_epi64x( 24 );
+     const __m256i vmask = m256_const1_64( 24 );
     const uint32_t mask = 24;
     __m256i* vh  = (__m256i*)vhash;
     __m256i* vhA = (__m256i*)vhashA;
--- a/algo/quark/hmq1725-gate.c
+++ b/algo/quark/hmq1725-gate.c
@@ -10,8 +10,8 @@ bool register_hmq1725_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_hmq1725;
  gate->hash      = (void*)&hmq1725hash;
 #endif
-  gate->set_target       = (void*)&scrypt_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  opt_target_factor = 65536.0;
  return true;
 };

--- a/algo/quark/hmq1725.c
+++ b/algo/quark/hmq1725.c
@@ -409,14 +409,3 @@ int scanhash_hmq1725( struct work *work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-/*
-bool register_hmq1725_algo( algo_gate_t* gate )
-{
-  init_hmq1725_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  gate->set_target       = (void*)&scrypt_set_target;
-  gate->scanhash         = (void*)&scanhash_hmq1725;
-  gate->hash             = (void*)&hmq1725hash;
-  return true;
-};
-*/
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -49,7 +49,7 @@ void quark_4way_hash( void *state, const void *input )
    __m256i* vhB = (__m256i*)vhashB;
    __m256i vh_mask;
    quark_4way_ctx_holder ctx;
-    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
+    const __m256i bit3_mask = m256_const1_64( 8 );
    const uint32_t mask = 8;
    const __m256i zero = _mm256_setzero_si256();

--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"
 #include "ripemd-hash-4way.h"

 #define LBRY_INPUT_SIZE 112
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -41,6 +41,7 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
   free(xnonce2str);
 }

+/*
 void lbry_build_block_header( struct work* g_work, uint32_t version,
                             uint32_t *prevhash, uint32_t *merkle_root,
                             uint32_t ntime, uint32_t nbits )
@@ -63,6 +64,7 @@ void lbry_build_block_header( struct work* g_work, uint32_t version,
   g_work->data[ LBRY_NBITS_INDEX ] = nbits;
   g_work->data[28] = 0x80000000;
 }
+*/

 void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 {
@@ -92,11 +94,6 @@ void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
   g_work->data[28] = 0x80000000;
 }

-void lbry_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 int64_t lbry_get_max64() { return 0x1ffffLL; }

 int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; }
@@ -119,11 +116,11 @@ bool register_lbry_algo( algo_gate_t* gate )
  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
 //  gate->build_block_header    = (void*)&build_block_header;
  gate->build_extraheader     = (void*)&lbry_build_extraheader;
-  gate->set_target            = (void*)&lbry_set_target;
  gate->ntime_index           = LBRY_NTIME_INDEX;
  gate->nbits_index           = LBRY_NBITS_INDEX;
  gate->nonce_index           = LBRY_NONCE_INDEX;
  gate->get_work_data_size    = (void*)&lbry_get_work_data_size;
+  opt_target_factor = 256.0;
  return true;
 }

--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -5,23 +5,26 @@
 #include <stddef.h>
 #include <string.h>

+/*
 static const uint32_t IV[5] =
 { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
+*/

 /*
 * Round constants for RIPEMD-160.
 */
-#define K11  0x00000000
-#define K12  0x5A827999
-#define K13  0x6ED9EBA1
-#define K14  0x8F1BBCDC
-#define K15  0xA953FD4E

-#define K21  0x50A28BE6
-#define K22  0x5C4DD124
-#define K23  0x6D703EF3
-#define K24  0x7A6D76E9
-#define K25  0x00000000
+#define K11  0x0000000000000000
+#define K12  0x5A8279995A827999
+#define K13  0x6ED9EBA16ED9EBA1
+#define K14  0x8F1BBCDC8F1BBCDC
+#define K15  0xA953FD4EA953FD4E
+
+#define K21  0x50A28BE650A28BE6
+#define K22  0x5C4DD1245C4DD124
+#define K23  0x6D703EF36D703EF3
+#define K24  0x7A6D76E97A6D76E9
+#define K25  0x0000000000000000

 // RIPEMD-160 4 way

@@ -44,7 +47,7 @@ static const uint32_t IV[5] =
 do{ \
   a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
                _mm_add_epi32( a, f( b ,c, d ) ), r ), \
-                                 _mm_set1_epi32( k ) ), s ), e ); \
+                                 m128_const1_64( k ) ), s ), e ); \
   c = mm128_rol_32( c, 10 );\
 } while (0)

@@ -248,11 +251,11 @@ static void ripemd160_4way_round( ripemd160_4way_context *sc )

 void ripemd160_4way_init( ripemd160_4way_context *sc )
 {
-   sc->val[0] = _mm_set1_epi32( IV[0] );
-   sc->val[1] = _mm_set1_epi32( IV[1] );
-   sc->val[2] = _mm_set1_epi32( IV[2] );
-   sc->val[3] = _mm_set1_epi32( IV[3] );
-   sc->val[4] = _mm_set1_epi32( IV[4] );
+   sc->val[0] = m128_const1_64( 0x6745230167452301 );
+   sc->val[1] = m128_const1_64( 0xEFCDAB89EFCDAB89 );
+   sc->val[2] = m128_const1_64( 0x98BADCFE98BADCFE );
+   sc->val[3] = m128_const1_64( 0x1032547610325476 );
+   sc->val[4] = m128_const1_64( 0xC3D2E1F0C3D2E1F0 );
   sc->count_high = sc->count_low = 0;
 }

@@ -343,7 +346,7 @@ void ripemd160_4way_close( ripemd160_4way_context  *sc, void *dst )
 do{ \
   a = _mm256_add_epi32( mm256_rol_32( _mm256_add_epi32( _mm256_add_epi32( \
                _mm256_add_epi32( a, f( b ,c, d ) ), r ), \
-                                 _mm256_set1_epi32( k ) ), s ), e ); \
+                                 m256_const1_64( k ) ), s ), e ); \
   c = mm256_rol_32( c, 10 );\
 } while (0)
    
@@ -548,11 +551,11 @@ static void ripemd160_8way_round( ripemd160_8way_context *sc )

 void ripemd160_8way_init( ripemd160_8way_context *sc )
 {
-   sc->val[0] = _mm256_set1_epi32( IV[0] );
-   sc->val[1] = _mm256_set1_epi32( IV[1] );
-   sc->val[2] = _mm256_set1_epi32( IV[2] );
-   sc->val[3] = _mm256_set1_epi32( IV[3] );
-   sc->val[4] = _mm256_set1_epi32( IV[4] );
+   sc->val[0] = m256_const1_64( 0x6745230167452301 );
+   sc->val[1] = m256_const1_64( 0xEFCDAB89EFCDAB89 );
+   sc->val[2] = m256_const1_64( 0x98BADCFE98BADCFE );
+   sc->val[3] = m256_const1_64( 0x1032547610325476 );
+   sc->val[4] = m256_const1_64( 0xC3D2E1F0C3D2E1F0 );
   sc->count_high = sc->count_low = 0;
 }

--- a/algo/scrypt/neoscrypt.c
+++ b/algo/scrypt/neoscrypt.c
@@ -1089,13 +1089,13 @@ bool register_neoscrypt_algo( algo_gate_t* gate )
  gate->scanhash              = (void*)&scanhash_neoscrypt;
  gate->hash                  = (void*)&neoscrypt;
  gate->get_max64             = (void*)&get_neoscrypt_max64;
-  gate->set_target            = (void*)&scrypt_set_target;
  gate->wait_for_diff         = (void*)&neoscrypt_wait_for_diff;
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
  gate->work_decode           = (void*)&std_be_work_decode;
  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
  gate->get_work_data_size    = (void*)&neoscrypt_get_work_data_size;
+  opt_target_factor = 65536.0;
  return true;
 };

--- a/algo/scrypt/pluck.c
+++ b/algo/scrypt/pluck.c
@@ -503,8 +503,8 @@ bool register_pluck_algo( algo_gate_t* gate )
  gate->miner_thread_init = (void*)&pluck_miner_thread_init;
  gate->scanhash         = (void*)&scanhash_pluck;
  gate->hash             = (void*)&pluck_hash;
-  gate->set_target       = (void*)&scrypt_set_target;
  gate->get_max64        = (void*)&pluck_get_max64;
+  opt_target_factor = 65536.0;
  return true;
 };

--- a/algo/scrypt/scrypt.c
+++ b/algo/scrypt/scrypt.c
@@ -698,8 +698,8 @@ static void scrypt_1024_1_1_256_24way(const uint32_t *input,
 extern int scanhash_scrypt( struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done, struct thr_info *mythr )
 {
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
 	uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8];
 	uint32_t midstate[8];
 	uint32_t n = pdata[19] - 1;
@@ -783,13 +783,18 @@ bool register_scrypt_algo( algo_gate_t* gate )
  gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
  gate->scanhash         = (void*)&scanhash_scrypt;
 //  gate->hash             = (void*)&scrypt_1024_1_1_256_24way;
-  gate->set_target       = (void*)&scrypt_set_target;
  gate->get_max64        = (void*)&scrypt_get_max64;
+  opt_target_factor = 65536.0;

-  if ( !opt_scrypt_n )
+
+  if ( !opt_param_n )
+  {
+     opt_param_n = 1024;
     scratchbuf_size = 1024;
+  }
  else
-     scratchbuf_size = opt_scrypt_n;
+     scratchbuf_size = opt_param_n;
+  applog(LOG_INFO,"Scrypt paramaters: N= %d, R= 1.", opt_param_n );
  return true;
 };

--- a/algo/scryptjane/scrypt-jane-chacha.h
+++ b/algo/scryptjane/scrypt-jane-chacha.h
@@ -55,6 +55,7 @@ typedef uint32_t scrypt_mix_word_t;
 	#include "scrypt-jane-romix-template.h"
 #endif

+
 /* cpu agnostic */
 #define SCRYPT_ROMIX_FN scrypt_ROMix_basic
 #define SCRYPT_MIX_FN chacha_core_basic
--- a/algo/scryptjane/scrypt-jane-romix-template.h
+++ b/algo/scryptjane/scrypt-jane-romix-template.h
@@ -1,9 +1,11 @@
 #if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX)

+/*
 #if defined(SCRYPT_CHOOSE_COMPILETIME)
 #undef SCRYPT_ROMIX_FN
 #define SCRYPT_ROMIX_FN scrypt_ROMix
 #endif
+*/

 #undef SCRYPT_HAVE_ROMIX
 #define SCRYPT_HAVE_ROMIX
--- a/algo/scryptjane/scrypt-jane.c
+++ b/algo/scryptjane/scrypt-jane.c
@@ -240,24 +240,24 @@ bool register_scryptjane_algo( algo_gate_t* gate )
 {
    gate->scanhash   = (void*)&scanhash_scryptjane;
    gate->hash       = (void*)&scryptjanehash;
-    gate->set_target = (void*)&scrypt_set_target;
    gate->get_max64  = (void*)&get_max64_0x40LL;
+    opt_target_factor = 65536.0;

    // figure out if arg in N or Nfactor
-    if ( !opt_scrypt_n )
+    if ( !opt_param_n )
    {
      applog( LOG_ERR, "The N factor must be specified in the form algo:nf");
      return false;
    }
-    else if ( opt_scrypt_n < 32 )
+    else if ( opt_param_n < 32 )
    {
      // arg is Nfactor, calculate N
-      sj_N = 1 << ( opt_scrypt_n + 1 );
+      sj_N = 1 << ( opt_param_n + 1 );
    }
    else
    {
      // arg is N
-      sj_N = opt_scrypt_n;
+      sj_N = opt_param_n;
    }
    return true;
 }
--- a/algo/sha/sha2-hash-4way.h
+++ b/algo/sha/sha2-hash-4way.h
@@ -55,32 +55,13 @@ typedef struct {
   __m128i buf[64>>2];
   __m128i val[8];
   uint32_t count_high, count_low;
+   bool initialized;
 } sha256_4way_context;

 void sha256_4way_init( sha256_4way_context *sc );
 void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
 void sha256_4way_close( sha256_4way_context *sc, void *dst );

-/*
-// SHA-256 7 way hybrid
-// Combines SSE, MMX and scalar data to do 8 + 2 + 1 parallel.
-typedef struct {
-   __m128i  bufx[64>>2];
-   __m128i  valx[8];
-   __m64    bufy[64>>2];
-   __m64    valy[8];
-   uint32_t bufz[64>>2];
-   uint32_t valz[8];
-   uint32_t count_high, count_low;
-} sha256_7way_context;
-
-void sha256_7way_init( sha256_7way_context *ctx );
-void sha256_7way( sha256_7way_context *ctx, const void *datax,
-                         void *datay, void *dataz, size_t len );
-void sha256_7way_close( sha256_7way_context *ctx, void *dstx, void *dstyx,
-                         void *dstz  );
-*/
-
 #if defined (__AVX2__)

 // SHA-256 8 way
@@ -89,6 +70,7 @@ typedef struct {
   __m256i buf[64>>2];
   __m256i val[8];
   uint32_t count_high, count_low;
+   bool initialized;
 } sha256_8way_context;

 void sha256_8way_init( sha256_8way_context *sc );
@@ -103,6 +85,7 @@ typedef struct {
   __m256i buf[128>>3];
   __m256i val[8];
   uint64_t count;
+   bool initialized;
 } sha512_4way_context;

 void sha512_4way_init( sha512_4way_context *sc);
--- a/algo/sha/sha2.c
+++ b/algo/sha/sha2.c
@@ -12,6 +12,7 @@

 #include <string.h>
 #include <inttypes.h>
+#include <openssl/sha.h>

 #if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__)
 #define EXTERN_SHA256
@@ -197,7 +198,17 @@ static void sha256d_80_swap(uint32_t *hash, const uint32_t *data)

 extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
 {
-	uint32_t S[16], T[16];
+#if defined(__SHA__)
+   SHA256_CTX ctx;
+   SHA256_Init( &ctx );
+   SHA256_Update( &ctx, data, len );
+   SHA256_Final( (unsigned char*)hash, &ctx );
+   SHA256_Init( &ctx );
+   SHA256_Update( &ctx, hash, 32 );
+   SHA256_Final( (unsigned char*)hash, &ctx );
+#else
+
+   uint32_t S[16], T[16];
 	int i, r;

 	sha256_init(S);
@@ -218,6 +229,7 @@ extern void sha256d(unsigned char *hash, const unsigned char *data, int len)
 	sha256_transform(T, S, 0);
 	for (i = 0; i < 8; i++)
 		be32enc((uint32_t *)hash + i, T[i]);
+#endif
 }

 static inline void sha256d_preextend(uint32_t *W)
@@ -635,9 +647,46 @@ int scanhash_sha256d( struct work *work,
 	return 0;
 }

+int scanhash_SHA256d( struct work *work, const uint32_t max_nonce,
+                      uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(128) hash[8];
+   uint32_t _ALIGN(64) data[20];
+   uint32_t *pdata = work->data;
+   const uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19] - 1;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+
+   memcpy( data, pdata, 80 );
+
+   do {
+      data[19] = ++n;
+      sha256d( (unsigned char*)hash, (const unsigned char*)data, 80 );
+      if ( unlikely( swab32( hash[7] ) <= Htarg ) )
+      {
+         pdata[19] = n;
+         sha256d_80_swap(hash, pdata);
+         if ( fulltest( hash, ptarget ) && !opt_benchmark )
+            submit_solution( work, hash, mythr );
+      }
+   } while ( likely( n < max_nonce && !work_restart[thr_id].restart ) );
+   *hashes_done = n - first_nonce + 1;
+   pdata[19] = n;
+   return 0;
+}
+
+
 bool register_sha256d_algo( algo_gate_t* gate )
 {
-    gate->scanhash = (void*)&scanhash_sha256d;
+#if defined(__SHA__)
+   gate->optimizations = SHA_OPT;
+   gate->scanhash = (void*)&scanhash_SHA256d;
+#else
+   gate->optimizations = SSE2_OPT | AVX2_OPT;
+   gate->scanhash = (void*)&scanhash_sha256d;
+#endif
    gate->hash     = (void*)&sha256d;
    return true;
 };
--- a/algo/sha/sha256-hash-4way.c
+++ b/algo/sha/sha256-hash-4way.c
@@ -34,19 +34,18 @@

 #include <stddef.h>
 #include <string.h>
-
-#include "sha2-hash-4way.h"
-
-#include <stdio.h>
+#include "sha-hash-4way.h"

 // SHA-256 32 bit

+/*
 static const sph_u32 H256[8] = {
        SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
        SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
        SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
        SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
 };
+*/

 static const sph_u32 K256[64] = {
        SPH_C32(0x428A2F98), SPH_C32(0x71374491),
@@ -113,16 +112,17 @@ static const sph_u32 K256[64] = {

 #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
 do { \
-  register __m128i T1, T2; \
+  __m128i T1, T2; \
+  __m128i K = _mm_set1_epi32( K256[( (j)+(i) )] ); \
  T1 = _mm_add_epi32( H, mm128_add4_32( BSG2_1(E), CHs(E, F, G), \
-                             _mm_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \
+                                        K, W[i] ) ); \
  T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
  D  = _mm_add_epi32( D,  T1 ); \
  H  = _mm_add_epi32( T1, T2 ); \
 } while (0)

 static void
-sha256_4way_round( __m128i *in, __m128i r[8] )
+sha256_4way_round( sha256_4way_context *ctx, __m128i *in, __m128i r[8] )
 {
   register  __m128i A, B, C, D, E, F, G, H;
   __m128i W[16];
@@ -130,14 +130,28 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
   mm128_block_bswap_32( W, in );
   mm128_block_bswap_32( W+8, in+8 );

-   A = r[0];
-   B = r[1];
-   C = r[2];
-   D = r[3];
-   E = r[4];
-   F = r[5];
-   G = r[6];
-   H = r[7];
+   if ( ctx->initialized )
+   {
+      A = r[0];
+      B = r[1];
+      C = r[2];
+      D = r[3];
+      E = r[4];
+      F = r[5];
+      G = r[6];
+      H = r[7];
+   }
+   else
+   {
+      A = m128_const1_64( 0x6A09E6676A09E667 );
+      B = m128_const1_64( 0xBB67AE85BB67AE85 );
+      C = m128_const1_64( 0x3C6EF3723C6EF372 );
+      D = m128_const1_64( 0xA54FF53AA54FF53A );
+      E = m128_const1_64( 0x510E527F510E527F );
+      F = m128_const1_64( 0x9B05688C9B05688C );
+      G = m128_const1_64( 0x1F83D9AB1F83D9AB );
+      H = m128_const1_64( 0x5BE0CD195BE0CD19 );
+   }

   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
@@ -193,19 +207,36 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
      SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
   }

-   r[0] = _mm_add_epi32( r[0], A );
-   r[1] = _mm_add_epi32( r[1], B );
-   r[2] = _mm_add_epi32( r[2], C );
-   r[3] = _mm_add_epi32( r[3], D );
-   r[4] = _mm_add_epi32( r[4], E );
-   r[5] = _mm_add_epi32( r[5], F );
-   r[6] = _mm_add_epi32( r[6], G );
-   r[7] = _mm_add_epi32( r[7], H );
+   if ( ctx->initialized )
+   {
+      r[0] = _mm_add_epi32( r[0], A );
+      r[1] = _mm_add_epi32( r[1], B );
+      r[2] = _mm_add_epi32( r[2], C );
+      r[3] = _mm_add_epi32( r[3], D );
+      r[4] = _mm_add_epi32( r[4], E );
+      r[5] = _mm_add_epi32( r[5], F );
+      r[6] = _mm_add_epi32( r[6], G );
+      r[7] = _mm_add_epi32( r[7], H );
+   }
+   else
+   {
+      ctx->initialized = true;
+      r[0] = _mm_add_epi32( A, m128_const1_64( 0x6A09E6676A09E667 ) );
+      r[1] = _mm_add_epi32( B, m128_const1_64( 0xBB67AE85BB67AE85 ) );
+      r[2] = _mm_add_epi32( C, m128_const1_64( 0x3C6EF3723C6EF372 ) );
+      r[3] = _mm_add_epi32( D, m128_const1_64( 0xA54FF53AA54FF53A ) );
+      r[4] = _mm_add_epi32( E, m128_const1_64( 0x510E527F510E527F ) );
+      r[5] = _mm_add_epi32( F, m128_const1_64( 0x9B05688C9B05688C ) );
+      r[6] = _mm_add_epi32( G, m128_const1_64( 0x1F83D9AB1F83D9AB ) );
+      r[7] = _mm_add_epi32( H, m128_const1_64( 0x5BE0CD195BE0CD19 ) );
+   }
 }

 void sha256_4way_init( sha256_4way_context *sc )
 {
+   sc->initialized = false;
   sc->count_high = sc->count_low = 0;
+/*
   sc->val[0] = _mm_set1_epi32( H256[0] );
   sc->val[1] = _mm_set1_epi32( H256[1] );
   sc->val[2] = _mm_set1_epi32( H256[2] );
@@ -214,6 +245,7 @@ void sha256_4way_init( sha256_4way_context *sc )
   sc->val[5] = _mm_set1_epi32( H256[5] );
   sc->val[6] = _mm_set1_epi32( H256[6] );
   sc->val[7] = _mm_set1_epi32( H256[7] );
+*/
 }

 void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
@@ -237,7 +269,7 @@ void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )
      len -= clen;
      if ( ptr == buf_size )
      {
-         sha256_4way_round( sc->buf, sc->val );
+         sha256_4way_round( sc, sc->buf, sc->val );
         ptr = 0;
      }
      clow = sc->count_low;
@@ -256,13 +288,13 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
    const int pad = buf_size - 8;

    ptr = (unsigned)sc->count_low & (buf_size - 1U);
-    sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );
+    sc->buf[ ptr>>2 ] = m128_const1_64( 0x0000008000000080 );
    ptr += 4;

    if ( ptr > pad )
    {
         memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
-         sha256_4way_round( sc->buf, sc->val );
+         sha256_4way_round( sc, sc->buf, sc->val );
         memset_zero_128( sc->buf, pad >> 2 );
    }
    else
@@ -276,7 +308,7 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
                 mm128_bswap_32( _mm_set1_epi32( high ) );
    sc->buf[ ( pad+4 ) >> 2 ] =
                 mm128_bswap_32( _mm_set1_epi32( low ) );
-    sha256_4way_round( sc->buf, sc->val );
+    sha256_4way_round( sc, sc->buf, sc->val );

    mm128_block_bswap_32( dst, sc->val );
 }
@@ -313,16 +345,17 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )

 #define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
 do { \
-  register __m256i T1, T2; \
-   T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \
-                          _mm256_set1_epi32( K256[( (j)+(i) )] ), W[i] ) ); \
+  __m256i T1, T2; \
+  __m256i K = _mm256_set1_epi32( K256[( (j)+(i) )] ); \
+  T1 = _mm256_add_epi32( H, mm256_add4_32( BSG2_1x(E), CHx(E, F, G), \
+                                           K, W[i] ) ); \
  T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
  D  = _mm256_add_epi32( D,  T1 ); \
  H  = _mm256_add_epi32( T1, T2 ); \
 } while (0)

 static void
-sha256_8way_round( __m256i *in, __m256i r[8] )
+sha256_8way_round( sha256_8way_context *ctx,  __m256i *in, __m256i r[8] )
 {
   register  __m256i A, B, C, D, E, F, G, H;
   __m256i W[16];
@@ -330,14 +363,28 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
   mm256_block_bswap_32( W  , in   );
   mm256_block_bswap_32( W+8, in+8 );

-   A = r[0];
-   B = r[1];
-   C = r[2];
-   D = r[3];
-   E = r[4];
-   F = r[5];
-   G = r[6];
-   H = r[7];
+   if ( ctx->initialized )
+   {
+      A = r[0];
+      B = r[1];
+      C = r[2];
+      D = r[3];
+      E = r[4];
+      F = r[5];
+      G = r[6];
+      H = r[7];
+   }
+   else
+   {
+      A = m256_const1_64( 0x6A09E6676A09E667 );
+      B = m256_const1_64( 0xBB67AE85BB67AE85 );
+      C = m256_const1_64( 0x3C6EF3723C6EF372 );
+      D = m256_const1_64( 0xA54FF53AA54FF53A );
+      E = m256_const1_64( 0x510E527F510E527F );
+      F = m256_const1_64( 0x9B05688C9B05688C );
+      G = m256_const1_64( 0x1F83D9AB1F83D9AB );
+      H = m256_const1_64( 0x5BE0CD195BE0CD19 );
+   }

   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );
   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );
@@ -393,20 +440,36 @@ sha256_8way_round( __m256i *in, __m256i r[8] )
      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
   }

-   r[0] = _mm256_add_epi32( r[0], A );
-   r[1] = _mm256_add_epi32( r[1], B );
-   r[2] = _mm256_add_epi32( r[2], C );
-   r[3] = _mm256_add_epi32( r[3], D );
-   r[4] = _mm256_add_epi32( r[4], E );
-   r[5] = _mm256_add_epi32( r[5], F );
-   r[6] = _mm256_add_epi32( r[6], G );
-   r[7] = _mm256_add_epi32( r[7], H );
+   if ( ctx->initialized )
+   {
+      r[0] = _mm256_add_epi32( r[0], A );
+      r[1] = _mm256_add_epi32( r[1], B );
+      r[2] = _mm256_add_epi32( r[2], C );
+      r[3] = _mm256_add_epi32( r[3], D );
+      r[4] = _mm256_add_epi32( r[4], E );
+      r[5] = _mm256_add_epi32( r[5], F );
+      r[6] = _mm256_add_epi32( r[6], G );
+      r[7] = _mm256_add_epi32( r[7], H );
+   }
+   else
+   {
+      ctx->initialized = true;
+      r[0] = _mm256_add_epi32( A, m256_const1_64( 0x6A09E6676A09E667 ) );
+      r[1] = _mm256_add_epi32( B, m256_const1_64( 0xBB67AE85BB67AE85 ) );
+      r[2] = _mm256_add_epi32( C, m256_const1_64( 0x3C6EF3723C6EF372 ) );
+      r[3] = _mm256_add_epi32( D, m256_const1_64( 0xA54FF53AA54FF53A ) );
+      r[4] = _mm256_add_epi32( E, m256_const1_64( 0x510E527F510E527F ) );
+      r[5] = _mm256_add_epi32( F, m256_const1_64( 0x9B05688C9B05688C ) );
+      r[6] = _mm256_add_epi32( G, m256_const1_64( 0x1F83D9AB1F83D9AB ) );
+      r[7] = _mm256_add_epi32( H, m256_const1_64( 0x5BE0CD195BE0CD19 ) );
+   }
 }

-
 void sha256_8way_init( sha256_8way_context *sc )
 {
+   sc->initialized = false;
   sc->count_high = sc->count_low = 0;
+/*
   sc->val[0] = _mm256_set1_epi32( H256[0] );
   sc->val[1] = _mm256_set1_epi32( H256[1] );
   sc->val[2] = _mm256_set1_epi32( H256[2] );
@@ -415,6 +478,7 @@ void sha256_8way_init( sha256_8way_context *sc )
   sc->val[5] = _mm256_set1_epi32( H256[5] );
   sc->val[6] = _mm256_set1_epi32( H256[6] );
   sc->val[7] = _mm256_set1_epi32( H256[7] );
+*/
 }

 void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
@@ -438,7 +502,7 @@ void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )
      len -= clen;
      if ( ptr == buf_size )
      {
-         sha256_8way_round( sc->buf, sc->val );
+         sha256_8way_round( sc, sc->buf, sc->val );
         ptr = 0;
      }
      clow = sc->count_low;
@@ -457,13 +521,13 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
    const int pad = buf_size - 8;

    ptr = (unsigned)sc->count_low & (buf_size - 1U);
-    sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );
+    sc->buf[ ptr>>2 ] = m256_const1_64( 0x0000008000000080 );
    ptr += 4;

    if ( ptr > pad )
    {
         memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
-         sha256_8way_round( sc->buf, sc->val );
+         sha256_8way_round( sc, sc->buf, sc->val );
         memset_zero_256( sc->buf, pad >> 2 );
    }
    else
@@ -478,207 +542,10 @@ void sha256_8way_close( sha256_8way_context *sc, void *dst )
    sc->buf[ ( pad+4 ) >> 2 ] =
                 mm256_bswap_32( _mm256_set1_epi32( low ) );

-    sha256_8way_round( sc->buf, sc->val );
+    sha256_8way_round( sc, sc->buf, sc->val );

    mm256_block_bswap_32( dst, sc->val );
 }

-
-// SHA-512 4 way 64 bit
-
-static const sph_u64 H512[8] = {
-        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
-        SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
-        SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
-        SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
-};
-
-static const sph_u64 K512[80] = {
-	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
-	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
-	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
-	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
-	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
-	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
-	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
-	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
-	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
-	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
-	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
-	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
-	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
-	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
-	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
-	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
-	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
-	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
-	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
-	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
-	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
-	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
-	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
-	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
-	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
-	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
-	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
-	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
-	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
-	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
-	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
-	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
-	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
-	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
-	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
-	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
-	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
-	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
-	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
-	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
-};
-
-#define CH(X, Y, Z) \
-   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 
-
-#define MAJ(X, Y, Z) \
-   _mm256_or_si256( _mm256_and_si256( X, Y ), \
-                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
-
-#define BSG5_0(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-        mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
-
-#define BSG5_1(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-        mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
-
-#define SSG5_0(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-        mm256_ror_64(x,  1), mm256_ror_64(x,  8) ), _mm256_srli_epi64(x, 7) ) 
-
-#define SSG5_1(x) \
-   _mm256_xor_si256( _mm256_xor_si256( \
-        mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
-
-#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
-do { \
-  register __m256i T1, T2; \
-  T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
-                                _mm256_set1_epi64x( K512[i] ), W[i] ) ); \
-  T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
-  D  = _mm256_add_epi64( D, T1 ); \
-  H  = _mm256_add_epi64( T1, T2 ); \
-} while (0)
-
-static void
-sha512_4way_round( __m256i *in, __m256i r[8] )
-{
-   int i;
-   register __m256i A, B, C, D, E, F, G, H;
-   __m256i W[80];
-
-   mm256_block_bswap_64( W  , in );
-   mm256_block_bswap_64( W+8, in+8 );
-
-   for ( i = 16; i < 80; i++ )
-      W[i] = mm256_add4_64( SSG5_1( W[ i- 2 ] ), W[ i- 7 ],
-                            SSG5_0( W[ i-15 ] ), W[ i-16 ] );
-
-   A = r[0];
-   B = r[1];
-   C = r[2];
-   D = r[3];
-   E = r[4];
-   F = r[5];
-   G = r[6];
-   H = r[7];
-
-   for ( i = 0; i < 80; i += 8 )
-   {
-      SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
-      SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
-      SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
-      SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
-      SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
-      SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
-      SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
-      SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
-   }
-
-   r[0] = _mm256_add_epi64( r[0], A );
-   r[1] = _mm256_add_epi64( r[1], B );
-   r[2] = _mm256_add_epi64( r[2], C );
-   r[3] = _mm256_add_epi64( r[3], D );
-   r[4] = _mm256_add_epi64( r[4], E );
-   r[5] = _mm256_add_epi64( r[5], F );
-   r[6] = _mm256_add_epi64( r[6], G );
-   r[7] = _mm256_add_epi64( r[7], H );
-}
-
-void sha512_4way_init( sha512_4way_context *sc )
-{
-   sc->count = 0;
-   sc->val[0] = _mm256_set1_epi64x( H512[0] );
-   sc->val[1] = _mm256_set1_epi64x( H512[1] );
-   sc->val[2] = _mm256_set1_epi64x( H512[2] );
-   sc->val[3] = _mm256_set1_epi64x( H512[3] );
-   sc->val[4] = _mm256_set1_epi64x( H512[4] );
-   sc->val[5] = _mm256_set1_epi64x( H512[5] );
-   sc->val[6] = _mm256_set1_epi64x( H512[6] );
-   sc->val[7] = _mm256_set1_epi64x( H512[7] );
-}
-
-void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
-{
-   __m256i *vdata = (__m256i*)data;
-   size_t ptr;
-   const int buf_size = 128;
-
-   ptr = (unsigned)sc->count & (buf_size - 1U);
-   while ( len > 0 )
-   {
-      size_t clen;
-      clen = buf_size - ptr;
-      if ( clen > len )
-         clen = len;
-      memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
-      vdata = vdata + (clen>>3);
-      ptr += clen;
-      len -= clen;
-      if ( ptr == buf_size )
-      {
-         sha512_4way_round( sc->buf, sc->val );
-         ptr = 0;
-      }
-      sc->count += clen;
-   }
-}
-
-void sha512_4way_close( sha512_4way_context *sc, void *dst )
-{
-    unsigned ptr;
-    const int buf_size = 128;
-    const int pad = buf_size - 16;
-
-    ptr = (unsigned)sc->count & (buf_size - 1U);
-    sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
-    ptr += 8;
-    if ( ptr > pad )
-    {
-         memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
-         sha512_4way_round( sc->buf, sc->val );
-         memset_zero_256( sc->buf, pad >> 3 );
-    }
-    else
-         memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
-
-    sc->buf[ pad >> 3 ] =
-                 mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
-    sc->buf[ ( pad+8 ) >> 3 ] = 
-                 mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
-    sha512_4way_round( sc->buf, sc->val );
-
-    mm256_block_bswap_64( dst, sc->val );
-}
-
 #endif  // __AVX2__
 #endif  // __SSE2__
--- a/algo/sha/sha256q-4way.c
+++ b/algo/sha/sha256q-4way.c
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "sha2-hash-4way.h"
+#include "sha-hash-4way.h"

 #if defined(SHA256T_8WAY)

--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -3,7 +3,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "sha2-hash-4way.h"
+#include "sha-hash-4way.h"

 #if defined(SHA256T_11WAY)

@@ -158,7 +158,7 @@ void sha256t_8way_hash( void* output, const void* input )
   sha256_8way_close( &ctx, output );
 }

-int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
+int scanhash_sha256t_8way( struct work *work, const uint32_t max_nonce,
                           uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*8]  __attribute__ ((aligned (64)));
@@ -166,12 +166,12 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,
   uint32_t lane_hash[8] __attribute__ ((aligned (32)));
   uint32_t *hash7 = &(hash[7<<3]);
   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+   const uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m256i  *noncev = (__m256i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;

   const uint64_t htmax[] = {          0,
                                     0xF,
@@ -194,7 +194,7 @@ int scanhash_sha256t_8way( struct work *work, uint32_t max_nonce,

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
-      uint32_t mask = masks[m];
+      const uint32_t mask = masks[m];
      do
      {
        *noncev = mm256_bswap_32( _mm256_set_epi32(
@@ -244,7 +244,7 @@ void sha256t_4way_hash( void* output, const void* input )
   sha256_4way_close( &ctx, output );
 }

-int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
+int scanhash_sha256t_4way( struct work *work, const uint32_t max_nonce,
 	                   uint64_t *hashes_done, struct thr_info *mythr )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -252,12 +252,12 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,
   uint32_t lane_hash[8] __attribute__ ((aligned (64)));
   uint32_t *hash7 = &(hash[7<<2]);
   uint32_t *pdata = work->data;
-   uint32_t *ptarget = work->target;
+   const uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   __m128i  *noncev = (__m128i*)vdata + 19;   // aligned
-   int thr_id = mythr->id;  // thr_id arg is deprecated
+   const int thr_id = mythr->id;

   const uint64_t htmax[] = {          0,
                                     0xF,
@@ -278,7 +278,7 @@ int scanhash_sha256t_4way( struct work *work, uint32_t max_nonce,

   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
   {
-      uint32_t mask = masks[m];
+      const uint32_t mask = masks[m];
      do {
         *noncev = mm128_bswap_32( _mm_set_epi32( n+3,n+2,n+1,n ) );
         pdata[19] = n;
--- a/algo/sha/sha512-hash-4way.c
+++ b/algo/sha/sha512-hash-4way.c
@@ -0,0 +1,320 @@
+/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * SHA-384 / SHA-512 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#if defined(__AVX2__)
+
+#include <stddef.h>
+#include <string.h>
+#include "sha-hash-4way.h"
+
+// SHA-512 4 way 64 bit
+
+/*
+static const sph_u64 H512[8] = {
+        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+        SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+        SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+        SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+*/
+
+static const sph_u64 K512[80] = {
+	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
+	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
+	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
+	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
+	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
+	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
+	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
+	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
+	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
+	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
+	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
+	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
+	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
+	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
+	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
+	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
+	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
+	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
+	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
+	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
+	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
+	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
+	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
+	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
+	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
+	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
+	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
+	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
+	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
+	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
+	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
+	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
+	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
+	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
+	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
+	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
+	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
+	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
+	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
+	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
+};
+
+#define CH(X, Y, Z) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 
+
+#define MAJ(X, Y, Z) \
+   _mm256_or_si256( _mm256_and_si256( X, Y ), \
+                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
+
+#define BSG5_0(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+        mm256_ror_64(x, 28), mm256_ror_64(x, 34) ), mm256_ror_64(x, 39) )
+
+#define BSG5_1(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+        mm256_ror_64(x, 14), mm256_ror_64(x, 18) ), mm256_ror_64(x, 41) )
+
+#define SSG5_0(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+        mm256_ror_64(x,  1), mm256_ror_64(x,  8) ), _mm256_srli_epi64(x, 7) ) 
+
+#define SSG5_1(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+        mm256_ror_64(x, 19), mm256_ror_64(x, 61) ), _mm256_srli_epi64(x, 6) )
+
+// Interleave SSG0 & SSG1 for better throughput.
+// return ssg0(w0) + ssg1(w1)
+static inline __m256i ssg512_add( __m256i w0, __m256i w1 )
+{
+   __m256i w0a, w1a, w0b, w1b;
+   w0a = mm256_ror_64( w0, 1 );
+   w1a = mm256_ror_64( w1,19 );
+   w0b = mm256_ror_64( w0, 8 );
+   w1b = mm256_ror_64( w1,61 );
+   w0a = _mm256_xor_si256( w0a, w0b );
+   w1a = _mm256_xor_si256( w1a, w1b );
+   w0b = _mm256_srli_epi64( w0, 7 );
+   w1b = _mm256_srli_epi64( w1, 6 );
+   w0a = _mm256_xor_si256( w0a, w0b );
+   w1a = _mm256_xor_si256( w1a, w1b );
+   return _mm256_add_epi64( w0a, w1a );
+}
+
+
+#define SSG512x2_0( w0, w1, i ) do \
+{ \
+   __m256i X0a, X1a, X0b, X1b; \
+  X0a = mm256_ror_64( W[i-15], 1 ); \
+  X1a = mm256_ror_64( W[i-14], 1 ); \
+  X0b = mm256_ror_64( W[i-15], 8 ); \
+  X1b = mm256_ror_64( W[i-14], 8 ); \
+  X0a = _mm256_xor_si256( X0a, X0b ); \
+  X1a = _mm256_xor_si256( X1a, X1b ); \
+  X0b = _mm256_srli_epi64( W[i-15], 7 ); \
+  X1b = _mm256_srli_epi64( W[i-14], 7 ); \
+  w0  = _mm256_xor_si256( X0a, X0b ); \
+  w1  = _mm256_xor_si256( X1a, X1b ); \
+} while(0)
+
+#define SSG512x2_1( w0, w1, i ) do \
+{ \
+   __m256i X0a, X1a, X0b, X1b; \
+  X0a = mm256_ror_64( W[i-2],19 ); \
+  X1a = mm256_ror_64( W[i-1],19 ); \
+  X0b = mm256_ror_64( W[i-2],61 ); \
+  X1b = mm256_ror_64( W[i-1],61 ); \
+  X0a = _mm256_xor_si256( X0a, X0b ); \
+  X1a = _mm256_xor_si256( X1a, X1b ); \
+  X0b = _mm256_srli_epi64( W[i-2], 6 ); \
+  X1b = _mm256_srli_epi64( W[i-1], 6 ); \
+  w0  = _mm256_xor_si256( X0a, X0b ); \
+  w1  = _mm256_xor_si256( X1a, X1b ); \
+} while(0)
+
+#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
+do { \
+  __m256i T1, T2; \
+  __m256i K = _mm256_set1_epi64x( K512[ i ] ); \
+  T1 = _mm256_add_epi64( H, mm256_add4_64( BSG5_1(E), CH(E, F, G), \
+                                           K, W[i] ) ); \
+  T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
+  D  = _mm256_add_epi64( D, T1 ); \
+  H  = _mm256_add_epi64( T1, T2 ); \
+} while (0)
+
+
+static void
+sha512_4way_round( sha512_4way_context *ctx,  __m256i *in, __m256i r[8] )
+{
+   int i;
+   register __m256i A, B, C, D, E, F, G, H;
+   __m256i W[80];
+
+   mm256_block_bswap_64( W  , in );
+   mm256_block_bswap_64( W+8, in+8 );
+
+   for ( i = 16; i < 80; i++ )
+      W[i] = _mm256_add_epi64( ssg512_add( W[i-15], W[i-2] ),
+                               _mm256_add_epi64( W[ i- 7 ], W[ i-16 ] ) );
+
+   if ( ctx->initialized )
+   {
+      A = r[0];
+      B = r[1];
+      C = r[2];
+      D = r[3];
+      E = r[4];
+      F = r[5];
+      G = r[6];
+      H = r[7];
+   }
+   else
+   {
+      A = m256_const1_64( 0x6A09E667F3BCC908 );
+      B = m256_const1_64( 0xBB67AE8584CAA73B );
+      C = m256_const1_64( 0x3C6EF372FE94F82B );
+      D = m256_const1_64( 0xA54FF53A5F1D36F1 );
+      E = m256_const1_64( 0x510E527FADE682D1 );
+      F = m256_const1_64( 0x9B05688C2B3E6C1F );
+      G = m256_const1_64( 0x1F83D9ABFB41BD6B );
+      H = m256_const1_64( 0x5BE0CD19137E2179 );
+   }
+
+   for ( i = 0; i < 80; i += 8 )
+   {
+      SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
+      SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
+      SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
+      SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
+      SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
+      SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
+      SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
+      SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
+   }
+
+   if ( ctx->initialized )
+   {
+      r[0] = _mm256_add_epi64( r[0], A );
+      r[1] = _mm256_add_epi64( r[1], B );
+      r[2] = _mm256_add_epi64( r[2], C );
+      r[3] = _mm256_add_epi64( r[3], D );
+      r[4] = _mm256_add_epi64( r[4], E );
+      r[5] = _mm256_add_epi64( r[5], F );
+      r[6] = _mm256_add_epi64( r[6], G );
+      r[7] = _mm256_add_epi64( r[7], H );
+   }
+   else
+   {
+      ctx->initialized = true;
+      r[0] = _mm256_add_epi64( A, m256_const1_64( 0x6A09E667F3BCC908 ) );
+      r[1] = _mm256_add_epi64( B, m256_const1_64( 0xBB67AE8584CAA73B ) );
+      r[2] = _mm256_add_epi64( C, m256_const1_64( 0x3C6EF372FE94F82B ) );
+      r[3] = _mm256_add_epi64( D, m256_const1_64( 0xA54FF53A5F1D36F1 ) );
+      r[4] = _mm256_add_epi64( E, m256_const1_64( 0x510E527FADE682D1 ) );
+      r[5] = _mm256_add_epi64( F, m256_const1_64( 0x9B05688C2B3E6C1F ) );
+      r[6] = _mm256_add_epi64( G, m256_const1_64( 0x1F83D9ABFB41BD6B ) );
+      r[7] = _mm256_add_epi64( H, m256_const1_64( 0x5BE0CD19137E2179 ) );
+   }
+}
+
+void sha512_4way_init( sha512_4way_context *sc )
+{
+   sc->initialized = false;
+   sc->count = 0;
+/*
+   sc->val[0] = _mm256_set1_epi64x( H512[0] );
+   sc->val[1] = _mm256_set1_epi64x( H512[1] );
+   sc->val[2] = _mm256_set1_epi64x( H512[2] );
+   sc->val[3] = _mm256_set1_epi64x( H512[3] );
+   sc->val[4] = _mm256_set1_epi64x( H512[4] );
+   sc->val[5] = _mm256_set1_epi64x( H512[5] );
+   sc->val[6] = _mm256_set1_epi64x( H512[6] );
+   sc->val[7] = _mm256_set1_epi64x( H512[7] );
+*/
+}
+
+void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
+{
+   __m256i *vdata = (__m256i*)data;
+   size_t ptr;
+   const int buf_size = 128;
+
+   ptr = (unsigned)sc->count & (buf_size - 1U);
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 );
+      vdata = vdata + (clen>>3);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         sha512_4way_round( sc, sc->buf, sc->val );
+         ptr = 0;
+      }
+      sc->count += clen;
+   }
+}
+
+void sha512_4way_close( sha512_4way_context *sc, void *dst )
+{
+    unsigned ptr;
+    const int buf_size = 128;
+    const int pad = buf_size - 16;
+
+    ptr = (unsigned)sc->count & (buf_size - 1U);
+    sc->buf[ ptr>>3 ] = m256_const1_64( 0x80 );
+    ptr += 8;
+    if ( ptr > pad )
+    {
+         memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
+         sha512_4way_round( sc, sc->buf, sc->val );
+         memset_zero_256( sc->buf, pad >> 3 );
+    }
+    else
+         memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );
+
+    sc->buf[ pad >> 3 ] =
+                 mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+    sc->buf[ ( pad+8 ) >> 3 ] = 
+                 mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+    sha512_4way_round( sc, sc->buf, sc->val );
+
+    mm256_block_bswap_64( dst, sc->val );
+}
+
+#endif  // __AVX2__
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -63,7 +63,6 @@ extern "C"{
 * that it can optimize them at will.
 */

-/* BEGIN -- automatically generated code. */

 #define DECL_STATE   \
 	__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
@@ -76,8 +75,11 @@ extern "C"{
 	        M8, M9, MA, MB, MC, MD, ME, MF; \
 	sph_u32 Wlow, Whigh;

-#define READ_STATE(state)   do { \
-		A00 = (state)->A[0]; \
+#define READ_STATE(state) do \
+{ \
+   if ( (state)->state_loaded ) \
+   { \
+      A00 = (state)->A[0]; \
 		A01 = (state)->A[1]; \
 		A02 = (state)->A[2]; \
 		A03 = (state)->A[3]; \
@@ -121,9 +123,58 @@ extern "C"{
 		CD = (state)->C[13]; \
 		CE = (state)->C[14]; \
 		CF = (state)->C[15]; \
-		Wlow = (state)->Wlow; \
-		Whigh = (state)->Whigh; \
-	} while (0)
+   } \
+   else \
+   { \
+       (state)->state_loaded = true; \
+       A00 = m128_const1_64( 0x20728DFD20728DFD ); \
+       A01 = m128_const1_64( 0x46C0BD5346C0BD53 ); \
+       A02 = m128_const1_64( 0xE782B699E782B699 ); \
+       A03 = m128_const1_64( 0x5530463255304632 ); \
+       A04 = m128_const1_64( 0x71B4EF9071B4EF90 ); \
+       A05 = m128_const1_64( 0x0EA9E82C0EA9E82C ); \
+       A06 = m128_const1_64( 0xDBB930F1DBB930F1 ); \
+       A07 = m128_const1_64( 0xFAD06B8BFAD06B8B ); \
+       A08 = m128_const1_64( 0xBE0CAE40BE0CAE40 ); \
+       A09 = m128_const1_64( 0x8BD144108BD14410 ); \
+       A0A = m128_const1_64( 0x76D2ADAC76D2ADAC ); \
+       A0B = m128_const1_64( 0x28ACAB7F28ACAB7F ); \
+       B0 = m128_const1_64( 0xC1099CB7C1099CB7 ); \
+       B1 = m128_const1_64( 0x07B385F307B385F3 ); \
+       B2 = m128_const1_64( 0xE7442C26E7442C26 ); \
+       B3 = m128_const1_64( 0xCC8AD640CC8AD640 ); \
+       B4 = m128_const1_64( 0xEB6F56C7EB6F56C7 ); \
+       B5 = m128_const1_64( 0x1EA81AA91EA81AA9 ); \
+       B6 = m128_const1_64( 0x73B9D31473B9D314 ); \
+       B7 = m128_const1_64( 0x1DE85D081DE85D08 ); \
+       B8 = m128_const1_64( 0x48910A5A48910A5A ); \
+       B9 = m128_const1_64( 0x893B22DB893B22DB ); \
+       BA = m128_const1_64( 0xC5A0DF44C5A0DF44 ); \
+       BB = m128_const1_64( 0xBBC4324EBBC4324E ); \
+       BC = m128_const1_64( 0x72D2F24072D2F240 ); \
+       BD = m128_const1_64( 0x75941D9975941D99 ); \
+       BE = m128_const1_64( 0x6D8BDE826D8BDE82 ); \
+       BF = m128_const1_64( 0xA1A7502BA1A7502B ); \
+       C0 = m128_const1_64( 0xD9BF68D1D9BF68D1 ); \
+       C1 = m128_const1_64( 0x58BAD75058BAD750 ); \
+       C2 = m128_const1_64( 0x56028CB256028CB2 ); \
+       C3 = m128_const1_64( 0x8134F3598134F359 ); \
+       C4 = m128_const1_64( 0xB5D469D8B5D469D8 ); \
+       C5 = m128_const1_64( 0x941A8CC2941A8CC2 ); \
+       C6 = m128_const1_64( 0x418B2A6E418B2A6E ); \
+       C7 = m128_const1_64( 0x0405278004052780 ); \
+       C8 = m128_const1_64( 0x7F07D7877F07D787 ); \
+       C9 = m128_const1_64( 0x5194358F5194358F ); \
+       CA = m128_const1_64( 0x3C60D6653C60D665 ); \
+       CB = m128_const1_64( 0xBE97D79ABE97D79A ); \
+       CC = m128_const1_64( 0x950C3434950C3434 ); \
+       CD = m128_const1_64( 0xAED9A06DAED9A06D ); \
+       CE = m128_const1_64( 0x2537DC8D2537DC8D ); \
+       CF = m128_const1_64( 0x7CDB59697CDB5969 ); \
+   } \
+   Wlow = (state)->Wlow; \
+   Whigh = (state)->Whigh; \
+} while (0)

 #define WRITE_STATE(state)   do { \
 		(state)->A[0] = A00; \
@@ -397,6 +448,7 @@ do { \
 			Whigh = T32(Whigh + 1); \
 	} while (0)

+/*
 static const sph_u32 A_init_256[] = {
 	C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
 	C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
@@ -436,33 +488,115 @@ static const sph_u32 C_init_512[] = {
 	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
 	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
 };
+*/

 static void
 shabal_4way_init( void *cc, unsigned size )
 {
   shabal_4way_context *sc = (shabal_4way_context*)cc;
-   int i;

   if ( size == 512 )
-   {
-      for ( i = 0; i < 12; i++ )
-         sc->A[i] = _mm_set1_epi32( A_init_512[i] );
-      for ( i = 0; i < 16; i++ )
-      {
-         sc->B[i] = _mm_set1_epi32( B_init_512[i] );
-         sc->C[i] = _mm_set1_epi32( C_init_512[i] );
-      }
+   { // copy immediate constants directly to working registers later.
+       sc->state_loaded = false;
+/*
+       sc->A[ 0] = m128_const1_64( 0x20728DFD20728DFD );
+       sc->A[ 1] = m128_const1_64( 0x46C0BD5346C0BD53 );
+       sc->A[ 2] = m128_const1_64( 0xE782B699E782B699 );
+       sc->A[ 3] = m128_const1_64( 0x5530463255304632 );
+       sc->A[ 4] = m128_const1_64( 0x71B4EF9071B4EF90 );
+       sc->A[ 5] = m128_const1_64( 0x0EA9E82C0EA9E82C );
+       sc->A[ 6] = m128_const1_64( 0xDBB930F1DBB930F1 );
+       sc->A[ 7] = m128_const1_64( 0xFAD06B8BFAD06B8B );
+       sc->A[ 8] = m128_const1_64( 0xBE0CAE40BE0CAE40 );
+       sc->A[ 9] = m128_const1_64( 0x8BD144108BD14410 );
+       sc->A[10] = m128_const1_64( 0x76D2ADAC76D2ADAC );
+       sc->A[11] = m128_const1_64( 0x28ACAB7F28ACAB7F );
+
+       sc->B[ 0] = m128_const1_64( 0xC1099CB7C1099CB7 );
+       sc->B[ 1] = m128_const1_64( 0x07B385F307B385F3 );
+       sc->B[ 2] = m128_const1_64( 0xE7442C26E7442C26 );
+       sc->B[ 3] = m128_const1_64( 0xCC8AD640CC8AD640 );
+       sc->B[ 4] = m128_const1_64( 0xEB6F56C7EB6F56C7 );
+       sc->B[ 5] = m128_const1_64( 0x1EA81AA91EA81AA9 );
+       sc->B[ 6] = m128_const1_64( 0x73B9D31473B9D314 );
+       sc->B[ 7] = m128_const1_64( 0x1DE85D081DE85D08 );
+       sc->B[ 8] = m128_const1_64( 0x48910A5A48910A5A );
+       sc->B[ 9] = m128_const1_64( 0x893B22DB893B22DB );
+       sc->B[10] = m128_const1_64( 0xC5A0DF44C5A0DF44 );
+       sc->B[11] = m128_const1_64( 0xBBC4324EBBC4324E );
+       sc->B[12] = m128_const1_64( 0x72D2F24072D2F240 );
+       sc->B[13] = m128_const1_64( 0x75941D9975941D99 );
+       sc->B[14] = m128_const1_64( 0x6D8BDE826D8BDE82 );
+       sc->B[15] = m128_const1_64( 0xA1A7502BA1A7502B );
+
+       sc->C[ 0] = m128_const1_64( 0xD9BF68D1D9BF68D1 );
+       sc->C[ 1] = m128_const1_64( 0x58BAD75058BAD750 );
+       sc->C[ 2] = m128_const1_64( 0x56028CB256028CB2 );
+       sc->C[ 3] = m128_const1_64( 0x8134F3598134F359 );
+       sc->C[ 4] = m128_const1_64( 0xB5D469D8B5D469D8 );
+       sc->C[ 5] = m128_const1_64( 0x941A8CC2941A8CC2 );
+       sc->C[ 6] = m128_const1_64( 0x418B2A6E418B2A6E );
+       sc->C[ 7] = m128_const1_64( 0x0405278004052780 );
+       sc->C[ 8] = m128_const1_64( 0x7F07D7877F07D787 );
+       sc->C[ 9] = m128_const1_64( 0x5194358F5194358F );
+       sc->C[10] = m128_const1_64( 0x3C60D6653C60D665 );
+       sc->C[11] = m128_const1_64( 0xBE97D79ABE97D79A );
+       sc->C[12] = m128_const1_64( 0x950C3434950C3434 );
+       sc->C[13] = m128_const1_64( 0xAED9A06DAED9A06D );
+       sc->C[14] = m128_const1_64( 0x2537DC8D2537DC8D );
+       sc->C[15] = m128_const1_64( 0x7CDB59697CDB5969 );
+*/
   }
   else
-   {
-      for ( i = 0; i < 12; i++ )
-         sc->A[i] = _mm_set1_epi32( A_init_256[i] );
-      for ( i = 0; i < 16; i++ )
-      {
-         sc->B[i] = _mm_set1_epi32( B_init_256[i] );
-         sc->C[i] = _mm_set1_epi32( C_init_256[i] );
-      }
-    }
+   {  // No users
+       sc->state_loaded = true;
+       sc->A[ 0] = m128_const1_64( 0x52F8455252F84552 );
+       sc->A[ 1] = m128_const1_64( 0xE54B7999E54B7999 );
+       sc->A[ 2] = m128_const1_64( 0x2D8EE3EC2D8EE3EC );
+       sc->A[ 3] = m128_const1_64( 0xB9645191B9645191 );
+       sc->A[ 4] = m128_const1_64( 0xE0078B86E0078B86 );
+       sc->A[ 5] = m128_const1_64( 0xBB7C44C9BB7C44C9 );
+       sc->A[ 6] = m128_const1_64( 0xD2B5C1CAD2B5C1CA );
+       sc->A[ 7] = m128_const1_64( 0xB0D2EB8CB0D2EB8C );
+       sc->A[ 8] = m128_const1_64( 0x14CE5A4514CE5A45 );
+       sc->A[ 9] = m128_const1_64( 0x22AF50DC22AF50DC );
+       sc->A[10] = m128_const1_64( 0xEFFDBC6BEFFDBC6B );
+       sc->A[11] = m128_const1_64( 0xEB21B74AEB21B74A );
+
+       sc->B[ 0] = m128_const1_64( 0xB555C6EEB555C6EE );
+       sc->B[ 1] = m128_const1_64( 0x3E7105963E710596 );
+       sc->B[ 2] = m128_const1_64( 0xA72A652FA72A652F );
+       sc->B[ 3] = m128_const1_64( 0x9301515F9301515F );
+       sc->B[ 4] = m128_const1_64( 0xDA28C1FADA28C1FA );
+       sc->B[ 5] = m128_const1_64( 0x696FD868696FD868 );
+       sc->B[ 6] = m128_const1_64( 0x9CB6BF729CB6BF72 );
+       sc->B[ 7] = m128_const1_64( 0x0AFE40020AFE4002 );
+       sc->B[ 8] = m128_const1_64( 0xA6E03615A6E03615 );
+       sc->B[ 9] = m128_const1_64( 0x5138C1D45138C1D4 );
+       sc->B[10] = m128_const1_64( 0xBE216306BE216306 );
+       sc->B[11] = m128_const1_64( 0xB38B8890B38B8890 );
+       sc->B[12] = m128_const1_64( 0x3EA8B96B3EA8B96B );
+       sc->B[13] = m128_const1_64( 0x3299ACE43299ACE4 );
+       sc->B[14] = m128_const1_64( 0x30924DD430924DD4 );
+       sc->B[15] = m128_const1_64( 0x55CB34A555CB34A5 );
+
+       sc->C[ 0] = m128_const1_64( 0xB405F031B405F031 );
+       sc->C[ 1] = m128_const1_64( 0xC4233EBAC4233EBA );
+       sc->C[ 2] = m128_const1_64( 0xB3733979B3733979 );
+       sc->C[ 3] = m128_const1_64( 0xC0DD9D55C0DD9D55 );
+       sc->C[ 4] = m128_const1_64( 0xC51C28AEC51C28AE );
+       sc->C[ 5] = m128_const1_64( 0xA327B8E1A327B8E1 );
+       sc->C[ 6] = m128_const1_64( 0x56C5616756C56167 );
+       sc->C[ 7] = m128_const1_64( 0xED614433ED614433 );
+       sc->C[ 8] = m128_const1_64( 0x88B59D6088B59D60 );
+       sc->C[ 9] = m128_const1_64( 0x60E2CEBA60E2CEBA );
+       sc->C[10] = m128_const1_64( 0x758B4B8B758B4B8B );
+       sc->C[11] = m128_const1_64( 0x83E82A7F83E82A7F );
+       sc->C[12] = m128_const1_64( 0xBC968828BC968828 );
+       sc->C[13] = m128_const1_64( 0xE6E00BF7E6E00BF7 );
+       sc->C[14] = m128_const1_64( 0xBA839E55BA839E55 );
+       sc->C[15] = m128_const1_64( 0x9B491C609B491C60 );
+   }
    sc->Wlow = 1;
    sc->Whigh = 0;
    sc->ptr = 0;
@@ -488,6 +622,8 @@ shabal_4way_core( void *cc, const unsigned char *data, size_t len )
      sc->ptr = ptr;
      return;
   }
+   
+
   READ_STATE(sc);

   while ( len > 0 )
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -54,7 +54,8 @@ typedef struct {
 	__m128i buf[16] __attribute__ ((aligned (64)));
 	__m128i A[12], B[16], C[16];
 	sph_u32 Whigh, Wlow;
-        size_t ptr;
+   size_t ptr;
+   bool state_loaded;
 } shabal_4way_context;

 typedef shabal_4way_context shabal256_4way_context;
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -5,7 +5,7 @@
 #if defined(__SHA__)
  #include <openssl/sha.h>
 #else
-  #include "algo/sha/sha2-hash-4way.h"
+  #include "algo/sha/sha-hash-4way.h"
 #endif

 #if defined (SKEIN_4WAY)
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -415,18 +415,46 @@ do { \
 		sc->bcount = bcount; \
 	} while (0)

+/*
+static const sph_u64 IV256[] = {
+   SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
+   SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
+   SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
+   SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
+};

-static void
-skein_big_init_4way( skein512_4way_context *sc, const sph_u64 *iv )
+static const sph_u64 IV512[] = {
+   SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
+   SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
+   SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
+   SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
+};
+*/
+
+void skein256_4way_init( skein256_4way_context *sc )
 {
-        sc->h0 = _mm256_set_epi64x( iv[0], iv[0],iv[0],iv[0] );
-        sc->h1 = _mm256_set_epi64x( iv[1], iv[1],iv[1],iv[1] );
-        sc->h2 = _mm256_set_epi64x( iv[2], iv[2],iv[2],iv[2] );
-        sc->h3 = _mm256_set_epi64x( iv[3], iv[3],iv[3],iv[3] );
-        sc->h4 = _mm256_set_epi64x( iv[4], iv[4],iv[4],iv[4] );
-        sc->h5 = _mm256_set_epi64x( iv[5], iv[5],iv[5],iv[5] );
-        sc->h6 = _mm256_set_epi64x( iv[6], iv[6],iv[6],iv[6] );
-        sc->h7 = _mm256_set_epi64x( iv[7], iv[7],iv[7],iv[7] );
+        sc->h0 = m256_const1_64( 0xCCD044A12FDB3E13 );
+        sc->h1 = m256_const1_64( 0xE83590301A79A9EB );
+        sc->h2 = m256_const1_64( 0x55AEA0614F816E6F );
+        sc->h3 = m256_const1_64( 0x2A2767A4AE9B94DB );
+        sc->h4 = m256_const1_64( 0xEC06025E74DD7683 );
+        sc->h5 = m256_const1_64( 0xE7A436CDC4746251 );
+        sc->h6 = m256_const1_64( 0xC36FBAF9393AD185 );
+        sc->h7 = m256_const1_64( 0x3EEDBA1833EDFC13 );
+        sc->bcount = 0;
+        sc->ptr = 0;
+}
+
+void skein512_4way_init( skein512_4way_context *sc )
+{
+        sc->h0 = m256_const1_64( 0x4903ADFF749C51CE );
+        sc->h1 = m256_const1_64( 0x0D95DE399746DF03 );
+        sc->h2 = m256_const1_64( 0x8FD1934127C79BCE );
+        sc->h3 = m256_const1_64( 0x9A255629FF352CB1 );
+        sc->h4 = m256_const1_64( 0x5DB62599DF6CA7B0 );
+        sc->h5 = m256_const1_64( 0xEABE394CA9D5C3F4 );
+        sc->h6 = m256_const1_64( 0x991112C71A75B523 );
+        sc->h7 = m256_const1_64( 0xAE18A40B660FCC33 );
        sc->bcount = 0;
        sc->ptr = 0;
 }
@@ -524,6 +552,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
        memcpy_256( dst, buf, out_len >> 3 );
 }

+/*
 static const sph_u64 IV256[] = {
 	SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
 	SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
@@ -537,13 +566,14 @@ static const sph_u64 IV512[] = {
 	SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
 	SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
 };
-
-
+*/
+/*
 void
 skein256_4way_init(void *cc)
 {
 	skein_big_init_4way(cc, IV256);
 }
+*/

 void
 skein256_4way(void *cc, const void *data, size_t len)
@@ -557,11 +587,13 @@ skein256_4way_close(void *cc, void *dst)
        skein_big_close_4way(cc, 0, 0, dst, 32);
 }

+/*
 void
 skein512_4way_init(void *cc)
 {
 	skein_big_init_4way(cc, IV512);
 }
+*/

 void
 skein512_4way(void *cc, const void *data, size_t len)
--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -55,25 +55,26 @@ extern "C"{
 #define SPH_SIZE_skein256   256
 #define SPH_SIZE_skein512   512

-typedef struct {
-        __m256i buf[8] __attribute__ ((aligned (32)));
-        __m256i h0, h1, h2, h3, h4, h5, h6, h7;
-        size_t ptr;
+typedef struct
+{
+   __m256i buf[8] __attribute__ ((aligned (64)));
+   __m256i h0, h1, h2, h3, h4, h5, h6, h7;
+   size_t ptr;
 	sph_u64 bcount;
 } sph_skein_4way_big_context;

 typedef sph_skein_4way_big_context skein512_4way_context;
 typedef sph_skein_4way_big_context skein256_4way_context;

-void skein512_4way_init(void *cc);
-void skein512_4way(void *cc, const void *data, size_t len);
-void skein512_4way_close(void *cc, void *dst);
+void skein512_4way_init( skein512_4way_context *sc );
+void skein512_4way( void *cc, const void *data, size_t len );
+void skein512_4way_close( void *cc, void *dst );
 //void sph_skein512_addbits_and_close(
 //        void *cc, unsigned ub, unsigned n, void *dst);

-void skein256_4way_init(void *cc);
-void skein256_4way(void *cc, const void *data, size_t len);
-void skein256_4way_close(void *cc, void *dst);
+void skein256_4way_init( skein256_4way_context *sc );
+void skein256_4way( void *cc, const void *data, size_t len );
+void skein256_4way_close( void *cc, void *dst );
 //void sph_skein256_addbits_and_close(
 //	void *cc, unsigned ub, unsigned n, void *dst);

--- a/algo/x11/fresh.c
+++ b/algo/x11/fresh.c
@@ -120,19 +120,13 @@ int scanhash_fresh( struct work *work,
 	return 0;
 }

-void fresh_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-
 bool register_fresh_algo( algo_gate_t* gate )
 {
    algo_not_tested();
    gate->scanhash   = (void*)&scanhash_fresh;
    gate->hash       = (void*)&freshhash;
-    gate->set_target = (void*)&fresh_set_target;
    gate->get_max64  = (void*)&get_max64_0x3ffff;
+    opt_target_factor = 256.0;
    return true;
 };

--- a/algo/x11/timetravel-gate.c
+++ b/algo/x11/timetravel-gate.c
@@ -1,10 +1,5 @@
 #include "timetravel-gate.h"

-void tt8_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 bool register_timetravel_algo( algo_gate_t* gate )
 {
 #ifdef TIMETRAVEL_4WAY
@@ -16,9 +11,9 @@ bool register_timetravel_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_timetravel;
  gate->hash       = (void*)&timetravel_hash;
 #endif
-  gate->set_target = (void*)&tt8_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  opt_target_factor = 256.0;
  return true;
 };

--- a/algo/x11/timetravel10-gate.c
+++ b/algo/x11/timetravel10-gate.c
@@ -1,10 +1,5 @@
 #include "timetravel10-gate.h"

-void tt10_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 bool register_timetravel10_algo( algo_gate_t* gate )
 {
 #ifdef TIMETRAVEL10_4WAY
@@ -16,9 +11,9 @@ bool register_timetravel10_algo( algo_gate_t* gate )
  gate->scanhash   = (void*)&scanhash_timetravel10;
  gate->hash       = (void*)&timetravel10_hash;
 #endif
-  gate->set_target = (void*)&tt10_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  opt_target_factor = 256.0;
  return true;
 };

--- a/algo/x13/drop.c
+++ b/algo/x13/drop.c
@@ -249,7 +249,6 @@ bool register_drop_algo( algo_gate_t* gate )
    gate->scanhash              = (void*)&scanhash_drop;
    gate->hash                  = (void*)&droplp_hash_pok;
    gate->get_new_work          = (void*)&drop_get_new_work;
-    gate->set_target            = (void*)&scrypt_set_target;
    gate->build_stratum_request = (void*)&std_be_build_stratum_request;
    gate->work_decode           = (void*)&std_be_work_decode;
    gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
@@ -257,6 +256,7 @@ bool register_drop_algo( algo_gate_t* gate )
    gate->decode_extra_data     = (void*)&drop_display_pok;
    gate->get_work_data_size    = (void*)&drop_get_work_data_size;
    gate->work_cmp_size         = 72;
+    opt_target_factor = 65536.0;
    return true;
 };

--- a/algo/x16/hex.c
+++ b/algo/x16/hex.c
@@ -0,0 +1,247 @@
+/**
+ * x16r algo implementation
+ *
+ * Implementation by tpruvot@github Jan 2018
+ * Optimized by JayDDee@github Jan 2018
+ */
+#include "x16r-gate.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/groestl/sph_groestl.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/nist.h"
+#include "algo/echo/sph_echo.h"
+#include "algo/hamsi/sph_hamsi.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/sph_shabal.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include <openssl/sha.h>
+#if defined(__AES__)
+  #include "algo/echo/aes_ni/hash_api.h"
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+#endif
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+
+static void hex_getAlgoString(const uint32_t* prevblock, char *output)
+{
+   char *sptr = output;
+   uint8_t* data = (uint8_t*)prevblock;
+
+   for (uint8_t j = 0; j < X16R_HASH_FUNC_COUNT; j++) {
+      uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
+      uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
+      if (algoDigit >= 10)
+         sprintf(sptr, "%c", 'A' + (algoDigit - 10));
+      else
+         sprintf(sptr, "%u", (uint32_t) algoDigit);
+      sptr++;
+   }
+   *sptr = '\0';
+}
+
+union _hex_context_overlay
+{
+#if defined(__AES__)
+        hashState_echo          echo;
+        hashState_groestl       groestl;
+#else
+        sph_groestl512_context   groestl;
+        sph_echo512_context      echo;
+#endif
+        sph_blake512_context    blake;
+        sph_bmw512_context      bmw;
+        sph_skein512_context    skein;
+        sph_jh512_context       jh;
+        sph_keccak512_context   keccak;
+        hashState_luffa         luffa;
+        cubehashParam           cube;
+        sph_shavite512_context  shavite;
+        hashState_sd            simd;
+        sph_hamsi512_context    hamsi;
+        sph_fugue512_context    fugue;
+        sph_shabal512_context   shabal;
+        sph_whirlpool_context   whirlpool;
+        SHA512_CTX              sha512;
+};
+typedef union _hex_context_overlay hex_context_overlay;
+
+void hex_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(128) hash[16];
+   hex_context_overlay ctx;
+   void *in = (void*) input;
+   int size = 80;
+/*
+   if ( s_ntime == UINT32_MAX )
+   {
+      const uint8_t* in8 = (uint8_t*) input;
+      x16_r_s_getAlgoString( &in8[4], hashOrder );
+   }
+*/
+
+   char elem = hashOrder[0];
+   uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      switch ( algo )
+      {
+         case BLAKE:
+            sph_blake512_init( &ctx.blake );
+            sph_blake512( &ctx.blake, in, size );
+            sph_blake512_close( &ctx.blake, hash );
+         break;
+         case BMW:
+            sph_bmw512_init( &ctx.bmw );
+            sph_bmw512(&ctx.bmw, in, size);
+            sph_bmw512_close(&ctx.bmw, hash);
+         break;
+         case GROESTL:
+#if defined(__AES__)
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                      (const char*)in, size<<3 );
+#else
+            sph_groestl512_init( &ctx.groestl );
+            sph_groestl512( &ctx.groestl, in, size );
+            sph_groestl512_close(&ctx.groestl, hash);
+#endif
+         break;
+         case SKEIN:
+            sph_skein512_init( &ctx.skein );
+            sph_skein512( &ctx.skein, in, size );
+            sph_skein512_close( &ctx.skein, hash );
+         break;
+         case JH:
+            sph_jh512_init( &ctx.jh );
+            sph_jh512(&ctx.jh, in, size );
+            sph_jh512_close(&ctx.jh, hash );
+         break;
+         case KECCAK:
+            sph_keccak512_init( &ctx.keccak );
+            sph_keccak512( &ctx.keccak, in, size );
+            sph_keccak512_close( &ctx.keccak, hash );
+         break;
+         case LUFFA:
+            init_luffa( &ctx.luffa, 512 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                    (const BitSequence*)in, size );
+         break;
+         case CUBEHASH:
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
+                                  (const byte*)in, size );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in, size );
+            sph_shavite512_close( &ctx.shavite, hash );
+         break;
+         case SIMD:
+             init_sd( &ctx.simd, 512 );
+             update_final_sd( &ctx.simd, (BitSequence *)hash,
+                              (const BitSequence*)in, size<<3 );
+         break;
+         case ECHO:
+#if defined(__AES__)
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash,
+                                (const BitSequence*)in, size<<3 );
+#else
+             sph_echo512_init( &ctx.echo );
+             sph_echo512( &ctx.echo, in, size );
+             sph_echo512_close( &ctx.echo, hash );
+#endif
+         break;
+         case HAMSI:
+             sph_hamsi512_init( &ctx.hamsi );
+             sph_hamsi512( &ctx.hamsi, in, size );
+             sph_hamsi512_close( &ctx.hamsi, hash );
+         break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in, size );
+             sph_fugue512_close( &ctx.fugue, hash );
+         break;
+         case SHABAL:
+             sph_shabal512_init( &ctx.shabal );
+             sph_shabal512( &ctx.shabal, in, size );
+             sph_shabal512_close( &ctx.shabal, hash );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash );
+         break;
+         case SHA_512:
+             SHA512_Init( &ctx.sha512 );
+             SHA512_Update( &ctx.sha512, in, size );
+             SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
+         break;
+      }
+      algo = (uint8_t)hash[0] % X16R_HASH_FUNC_COUNT;
+      in = (void*) hash;
+      size = 64;
+   }
+   memcpy(output, hash, 32);
+}
+
+int scanhash_hex( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   uint32_t ntime = swab32(pdata[17]);
+   if ( s_ntime != ntime )
+   {
+      hex_getAlgoString( (const uint32_t*) (&endiandata[1]), hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   do
+   {
+      be32enc( &endiandata[19], nonce );
+      hex_hash( hash32, endiandata );
+
+      if ( hash32[7] <= Htarg )
+      if (fulltest( hash32, ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = nonce;
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
+}
--- a/algo/x16/x16r-4way.c
+++ b/algo/x16/x16r-4way.c
@@ -27,7 +27,7 @@
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"

 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
@@ -68,21 +68,7 @@ void x16r_4way_hash( void* output, const void* input )
   int size = 80;

   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
- 
-   if ( s_ntime == UINT32_MAX )
-   {
-      const uint8_t* tmp = (uint8_t*) in0;
-      x16_r_s_getAlgoString( &tmp[4], hashOrder );
-   }

-   // Input data is both 64 bit interleaved (input)
-   // and deinterleaved in inp0-3.
-   // If First function uses 64 bit data it is not required to interleave inp
-   // first. It may use the inerleaved data dmost convenient, ie 4way 64 bit.
-   // All other functions assume data is deinterleaved in hash0-3
-   // All functions must exit with data deinterleaved in hash0-3.
-   // Alias in0-3 points to either inp0-3 or hash0-3 according to
-   // its hashOrder position. Size is also set accordingly.
   for ( int i = 0; i < 16; i++ )
   {
      const char elem = hashOrder[i];
--- a/algo/x16/x16r-gate.c
+++ b/algo/x16/x16r-gate.c
@@ -42,8 +42,23 @@ bool register_x16r_algo( algo_gate_t* gate )
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  gate->set_target = (void*)&alt_set_target;
  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
+  opt_target_factor = 256.0;
+  return true;
+};
+
+bool register_x16rv2_algo( algo_gate_t* gate )
+{
+#if defined (X16R_4WAY)
+  gate->scanhash  = (void*)&scanhash_x16rv2_4way;
+  gate->hash      = (void*)&x16rv2_4way_hash;
+#else
+  gate->scanhash  = (void*)&scanhash_x16rv2;
+  gate->hash      = (void*)&x16rv2_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  x16_r_s_getAlgoString = (void*)&x16r_getAlgoString;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -57,8 +72,8 @@ bool register_x16s_algo( algo_gate_t* gate )
  gate->hash      = (void*)&x16r_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  gate->set_target = (void*)&alt_set_target;
  x16_r_s_getAlgoString = (void*)&x16s_getAlgoString;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -92,7 +107,7 @@ void x16rt_getAlgoString( const uint32_t *timeHash, char *output)
   *sptr = '\0';
 }

-void x16rt_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
+void veil_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
 {
   uchar merkle_tree[64] = { 0 };
   size_t t;
@@ -189,7 +204,7 @@ bool register_x16rt_algo( algo_gate_t* gate )
  gate->hash      = (void*)&x16rt_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  gate->set_target = (void*)&alt_set_target;
+  opt_target_factor = 256.0;
  return true;
 };

@@ -203,8 +218,43 @@ bool register_x16rt_veil_algo( algo_gate_t* gate )
  gate->hash      = (void*)&x16rt_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  gate->set_target = (void*)&alt_set_target;
-  gate->build_extraheader = (void*)&x16rt_build_extraheader;
+  gate->build_extraheader = (void*)&veil_build_extraheader;
+  opt_target_factor = 256.0;
+  return true;
+};
+
+////////////////////
+//
+//    HEX
+
+bool register_hex_algo( algo_gate_t* gate )
+{
+  gate->scanhash        = (void*)&scanhash_hex;
+  gate->hash            = (void*)&hex_hash;
+  gate->optimizations   = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  opt_target_factor = 128.0;
+  return true;
+};
+
+///////////////////////////////
+//
+//   X21S
+
+bool register_x21s_algo( algo_gate_t* gate )
+{
+#if defined (X16R_4WAY)
+  gate->scanhash          = (void*)&scanhash_x21s_4way;
+  gate->hash              = (void*)&x21s_4way_hash;
+  gate->miner_thread_init = (void*)&x21s_4way_thread_init;
+#else
+  gate->scanhash          = (void*)&scanhash_x21s;
+  gate->hash              = (void*)&x21s_hash;
+  gate->miner_thread_init = (void*)&x21s_thread_init;
+#endif
+  gate->optimizations     = SSE2_OPT | AES_OPT | AVX2_OPT | SHA_OPT;
+  x16_r_s_getAlgoString   = (void*)&x16s_getAlgoString;
+  opt_target_factor = 256.0;
  return true;
 };

--- a/algo/x16/x16r-gate.h
+++ b/algo/x16/x16r-gate.h
@@ -38,8 +38,11 @@ void x16rt_getAlgoString( const uint32_t *timeHash, char *output );
 void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash );

 bool register_x16r_algo( algo_gate_t* gate );
+bool register_x16rv2_algo( algo_gate_t* gate );
 bool register_x16s_algo( algo_gate_t* gate );
 bool register_x16rt_algo( algo_gate_t* gate );
+bool register_hex__algo( algo_gate_t* gate );
+bool register_x21s__algo( algo_gate_t* gate );

 #if defined(X16R_4WAY)

@@ -47,18 +50,41 @@ void x16r_4way_hash( void *state, const void *input );
 int scanhash_x16r_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

+void x16rv2_4way_hash( void *state, const void *input );
+int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+
 void x16rt_4way_hash( void *state, const void *input );
 int scanhash_x16rt_4way( struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done, struct thr_info *mythr );

+void x21s_4way_hash( void *state, const void *input );
+int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr );
+bool x21s_4way_thread_init();
+
 #endif

 void x16r_hash( void *state, const void *input );
 int scanhash_x16r( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );

+void x16rv2_hash( void *state, const void *input );
+int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr );
+
 void x16rt_hash( void *state, const void *input );
 int scanhash_x16rt( struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done, struct thr_info *mythr );
+
+void hex_hash( void *state, const void *input );
+int scanhash_hex( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr );
+
+void x21s_hash( void *state, const void *input );
+int scanhash_x21s( struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done, struct thr_info *mythr );
+bool x21s_thread_init();
+
 #endif

--- a/algo/x16/x16r.c
+++ b/algo/x16/x16r.c
@@ -65,13 +65,13 @@ void x16r_hash( void* output, const void* input )
   x16r_context_overlay ctx;
   void *in = (void*) input;
   int size = 80;
-
+/*
   if ( s_ntime == UINT32_MAX )
   {
      const uint8_t* in8 = (uint8_t*) input;
      x16_r_s_getAlgoString( &in8[4], hashOrder );
   }
-
+*/
   for ( int i = 0; i < 16; i++ )
   {
      const char elem = hashOrder[i];
--- a/algo/x16/x16rt-4way.c
+++ b/algo/x16/x16rt-4way.c
@@ -21,7 +21,7 @@
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"

 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread bool s_implemented = false;
--- a/algo/x16/x16rv2-4way.c
+++ b/algo/x16/x16rv2-4way.c
@@ -0,0 +1,384 @@
+/**
+ * x16r algo implementation
+ *
+ * Implementation by tpruvot@github Jan 2018
+ * Optimized by JayDDee@github Jan 2018
+ */
+#include "x16r-gate.h"
+
+#if defined (X16R_4WAY)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/shabal-hash-4way.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include "algo/sha/sha-hash-4way.h"
+#include "algo/tiger/sph_tiger.h"
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+
+union _x16rv2_4way_context_overlay
+{
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_echo          echo;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    luffa_2way_context      luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    simd_2way_context       simd;
+    hamsi512_4way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_4way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_4way_context     sha512;
+    sph_tiger_context       tiger;
+};
+typedef union _x16rv2_4way_context_overlay x16rv2_4way_context_overlay;
+
+// Pad the 24 bytes tiger hash to 64 bytes
+inline void padtiger512( uint32_t* hash )
+{
+  for ( int i = 6; i < 16; i++ ) hash[i] = 0;
+}
+
+void x16rv2_4way_hash( void* output, const void* input )
+{
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
+   x16rv2_4way_context_overlay ctx;
+   void *in0 = (void*) hash0;
+   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;
+   void *in3 = (void*) hash3;
+   int size = 80;
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_4way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_4way( &ctx.blake, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               blake512_4way( &ctx.blake, vhash, size );
+            }
+            blake512_4way_close( &ctx.blake, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case BMW:
+            bmw512_4way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_4way( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               bmw512_4way( &ctx.bmw, vhash, size );
+            }
+            bmw512_4way_close( &ctx.bmw, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case GROESTL:
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+         break;
+         case SKEIN:
+            skein512_4way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_4way( &ctx.skein, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               skein512_4way( &ctx.skein, vhash, size );
+            }
+            skein512_4way_close( &ctx.skein, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case JH:
+            jh512_4way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_4way( &ctx.jh, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               jh512_4way( &ctx.jh, vhash, size );
+            }
+            jh512_4way_close( &ctx.jh, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case KECCAK:
+             sph_tiger_init( &ctx.tiger );
+			    sph_tiger( &ctx.tiger, in0, size );
+			    sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+
+             for ( int i = (24/4); i < (64/4); i++ )
+                hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;
+
+             intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+             keccak512_4way_init( &ctx.keccak );
+             keccak512_4way( &ctx.keccak, vhash, 64 );
+             keccak512_4way_close( &ctx.keccak, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case LUFFA:
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in0, size );
+             sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+
+             for ( int i = (24/4); i < (64/4); i++ )
+                hash0[i] = hash1[i] = 0;
+
+             intrlv_2x128( vhash, hash0, hash1, 512 );
+             luffa_2way_init( &ctx.luffa, 512 );
+             luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+             dintrlv_2x128( hash0, hash1, vhash, 512 );
+
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+
+             for ( int i = (24/4); i < (64/4); i++ )
+                hash2[i] = hash3[i] = 0;
+             
+             intrlv_2x128( vhash, hash2, hash3, 512 );
+             luffa_2way_init( &ctx.luffa, 512 );
+             luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 );
+             dintrlv_2x128( hash2, hash3, vhash, 512 );
+         break;
+         case CUBEHASH:
+             cubehashInit( &ctx.cube, 512, 16, 32 );
+             cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
+                                   (const byte*)in0, size );
+             cubehashInit( &ctx.cube, 512, 16, 32 );
+             cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
+                                   (const byte*)in1, size );
+             cubehashInit( &ctx.cube, 512, 16, 32 );
+             cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
+                                   (const byte*)in2, size );
+             cubehashInit( &ctx.cube, 512, 16, 32 );
+             cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
+                                   (const byte*)in3, size );
+         break;
+         case SHAVITE:
+             sph_shavite512_init( &ctx.shavite );
+             sph_shavite512( &ctx.shavite, in0, size );
+             sph_shavite512_close( &ctx.shavite, hash0 );
+             sph_shavite512_init( &ctx.shavite );
+             sph_shavite512( &ctx.shavite, in1, size );
+             sph_shavite512_close( &ctx.shavite, hash1 );
+             sph_shavite512_init( &ctx.shavite );
+             sph_shavite512( &ctx.shavite, in2, size );
+             sph_shavite512_close( &ctx.shavite, hash2 );
+             sph_shavite512_init( &ctx.shavite );
+             sph_shavite512( &ctx.shavite, in3, size );
+             sph_shavite512_close( &ctx.shavite, hash3 );
+         break;
+         case SIMD:
+             intrlv_2x128( vhash, in0, in1, size<<3 );
+             simd_2way_init( &ctx.simd, 512 );
+             simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+             dintrlv_2x128( hash0, hash1, vhash, 512 );
+             intrlv_2x128( vhash, in2, in3, size<<3 );
+             simd_2way_init( &ctx.simd, 512 );
+             simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+             dintrlv_2x128( hash2, hash3, vhash, 512 );
+         break;
+         case ECHO:
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+         break;
+         case HAMSI:
+             intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             hamsi512_4way_init( &ctx.hamsi );
+             hamsi512_4way( &ctx.hamsi, vhash, size );
+             hamsi512_4way_close( &ctx.hamsi, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+         break;
+         case SHABAL:
+             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             shabal512_4way_init( &ctx.shabal );
+             shabal512_4way( &ctx.shabal, vhash, size );
+             shabal512_4way_close( &ctx.shabal, vhash );
+             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+         break;
+         case SHA_512:
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in0, size );
+             sph_tiger_close( &ctx.tiger, hash0 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in1, size );
+             sph_tiger_close( &ctx.tiger, hash1 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in2, size );
+             sph_tiger_close( &ctx.tiger, hash2 );
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in3, size );
+             sph_tiger_close( &ctx.tiger, hash3 );
+
+             for ( int i = (24/4); i < (64/4); i++ )
+                hash0[i] = hash1[i] = hash2[i] = hash3[i] = 0;
+ 
+             intrlv_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+             sha512_4way_init( &ctx.sha512 );
+             sha512_4way( &ctx.sha512, vhash, 64 );
+             sha512_4way_close( &ctx.sha512, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+      }
+      size = 64;
+   }
+   memcpy( output,    hash0, 32 );
+   memcpy( output+32, hash1, 32 );
+   memcpy( output+64, hash2, 32 );
+   memcpy( output+96, hash3, 32 );
+}
+
+int scanhash_x16rv2_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   if ( s_ntime != endiandata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);
+      x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do
+   {
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+
+      x16rv2_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 4; i++ )  if ( (hash+(i<<3))[7] <= Htarg )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = n+i;
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 4;
+   } while ( (  n < max_nonce ) && !(*restart) );
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+#endif
--- a/algo/x16/x16rv2.c
+++ b/algo/x16/x16rv2.c
@@ -0,0 +1,247 @@
+/**
+ * x16r algo implementation
+ *
+ * Implementation by tpruvot@github Jan 2018
+ * Optimized by JayDDee@github Jan 2018
+ */
+#include "x16r-gate.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/groestl/sph_groestl.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/nist.h"
+#include "algo/echo/sph_echo.h"
+#include "algo/hamsi/sph_hamsi.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/sph_shabal.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include <openssl/sha.h>
+#include "algo/tiger/sph_tiger.h"
+#if defined(__AES__)
+  #include "algo/echo/aes_ni/hash_api.h"
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+#endif
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+
+union _x16rv2_context_overlay
+{
+#if defined(__AES__)
+        hashState_echo          echo;
+        hashState_groestl       groestl;
+#else
+        sph_groestl512_context   groestl;
+        sph_echo512_context      echo;
+#endif
+        sph_blake512_context    blake;
+        sph_bmw512_context      bmw;
+        sph_skein512_context    skein;
+        sph_jh512_context       jh;
+        sph_keccak512_context   keccak;
+        hashState_luffa         luffa;
+        cubehashParam           cube;
+        sph_shavite512_context  shavite;
+        hashState_sd            simd;
+        sph_hamsi512_context    hamsi;
+        sph_fugue512_context    fugue;
+        sph_shabal512_context   shabal;
+        sph_whirlpool_context   whirlpool;
+        SHA512_CTX              sha512;
+        sph_tiger_context       tiger;
+};
+typedef union _x16rv2_context_overlay x16rv2_context_overlay;
+
+// Pad the 24 bytes tiger hash to 64 bytes
+inline void padtiger512(uint32_t* hash) {
+   for (int i = (24/4); i < (64/4); i++) hash[i] = 0;
+}
+
+void x16rv2_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(128) hash[16];
+   x16rv2_context_overlay ctx;
+   void *in = (void*) input;
+   int size = 80;
+/*
+   if ( s_ntime == UINT32_MAX )
+   {
+      const uint8_t* in8 = (uint8_t*) input;
+      x16_r_s_getAlgoString( &in8[4], hashOrder );
+   }
+*/
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            sph_blake512_init( &ctx.blake );
+            sph_blake512( &ctx.blake, in, size );
+            sph_blake512_close( &ctx.blake, hash );
+         break;
+         case BMW:
+            sph_bmw512_init( &ctx.bmw );
+            sph_bmw512(&ctx.bmw, in, size);
+            sph_bmw512_close(&ctx.bmw, hash);
+         break;
+         case GROESTL:
+#if defined(__AES__)
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                      (const char*)in, size<<3 );
+#else
+            sph_groestl512_init( &ctx.groestl );
+            sph_groestl512( &ctx.groestl, in, size );
+            sph_groestl512_close(&ctx.groestl, hash);
+#endif
+         break;
+         case SKEIN:
+            sph_skein512_init( &ctx.skein );
+            sph_skein512( &ctx.skein, in, size );
+            sph_skein512_close( &ctx.skein, hash );
+         break;
+         case JH:
+            sph_jh512_init( &ctx.jh );
+            sph_jh512(&ctx.jh, in, size );
+            sph_jh512_close(&ctx.jh, hash );
+         break;
+         case KECCAK:
+            sph_tiger_init( &ctx.tiger );
+            sph_tiger( &ctx.tiger, in, size );
+            sph_tiger_close( &ctx.tiger, hash );
+            padtiger512( hash );
+            sph_keccak512_init( &ctx.keccak );
+            sph_keccak512( &ctx.keccak, hash, 64 );
+            sph_keccak512_close( &ctx.keccak, hash );
+         break;
+         case LUFFA:
+            sph_tiger_init( &ctx.tiger );
+            sph_tiger( &ctx.tiger, in, size );
+            sph_tiger_close( &ctx.tiger, hash );
+            padtiger512( hash );
+            init_luffa( &ctx.luffa, 512 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                    (const BitSequence*)hash, 64 );
+         break;
+         case CUBEHASH:
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
+                                  (const byte*)in, size );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in, size );
+            sph_shavite512_close( &ctx.shavite, hash );
+         break;
+         case SIMD:
+             init_sd( &ctx.simd, 512 );
+             update_final_sd( &ctx.simd, (BitSequence *)hash,
+                              (const BitSequence*)in, size<<3 );
+         break;
+         case ECHO:
+#if defined(__AES__)
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash,
+                                (const BitSequence*)in, size<<3 );
+#else
+             sph_echo512_init( &ctx.echo );
+             sph_echo512( &ctx.echo, in, size );
+             sph_echo512_close( &ctx.echo, hash );
+#endif
+         break;
+         case HAMSI:
+             sph_hamsi512_init( &ctx.hamsi );
+             sph_hamsi512( &ctx.hamsi, in, size );
+             sph_hamsi512_close( &ctx.hamsi, hash );
+         break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in, size );
+             sph_fugue512_close( &ctx.fugue, hash );
+         break;
+         case SHABAL:
+             sph_shabal512_init( &ctx.shabal );
+             sph_shabal512( &ctx.shabal, in, size );
+             sph_shabal512_close( &ctx.shabal, hash );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash );
+         break;
+         case SHA_512:
+             sph_tiger_init( &ctx.tiger );
+             sph_tiger( &ctx.tiger, in, size );
+             sph_tiger_close( &ctx.tiger, hash );
+             padtiger512( hash );
+             SHA512_Init( &ctx.sha512 );
+             SHA512_Update( &ctx.sha512, hash, 64 );
+             SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
+         break;
+      }
+      in = (void*) hash;
+      size = 64;
+   }
+   memcpy(output, hash, 32);
+}
+
+int scanhash_x16rv2( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   if ( s_ntime != pdata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);
+      x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   do
+   {
+      be32enc( &endiandata[19], nonce );
+      x16rv2_hash( hash32, endiandata );
+
+      if ( hash32[7] <= Htarg )
+      if (fulltest( hash32, ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = nonce;
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
+}
--- a/algo/x16/x21s-4way.c
+++ b/algo/x16/x21s-4way.c
@@ -0,0 +1,431 @@
+/**
+ * x16r algo implementation
+ *
+ * Implementation by tpruvot@github Jan 2018
+ * Optimized by JayDDee@github Jan 2018
+ */
+#include "x16r-gate.h"
+
+#if defined (X16R_4WAY)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/shabal-hash-4way.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include "algo/sha/sha-hash-4way.h"
+#include "algo/haval/haval-hash-4way.h"
+#include "algo/tiger/sph_tiger.h"
+#include "algo/gost/sph_gost.h"
+#include "algo/lyra2/lyra2.h"
+#if defined(__SHA__)
+ #include <openssl/sha.h>
+#endif
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+static __thread uint64_t* x21s_4way_matrix;
+
+union _x21s_4way_context_overlay
+{
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_echo          echo;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    luffa_2way_context      luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    simd_2way_context       simd;
+    hamsi512_4way_context   hamsi;
+    sph_fugue512_context    fugue;
+    shabal512_4way_context  shabal;
+    sph_whirlpool_context   whirlpool;
+    sha512_4way_context     sha512;
+    haval256_5_4way_context haval;
+    sph_tiger_context       tiger;
+    sph_gost512_context     gost;
+#if defined(__SHA__)
+    SHA256_CTX              sha256;
+#else
+    sha256_4way_context     sha256;
+#endif
+};
+typedef union _x21s_4way_context_overlay x21s_4way_context_overlay;
+
+void x21s_4way_hash( void* output, const void* input )
+{
+   uint32_t hash0[24] __attribute__ ((aligned (64)));
+   uint32_t hash1[24] __attribute__ ((aligned (64)));
+   uint32_t hash2[24] __attribute__ ((aligned (64)));
+   uint32_t hash3[24] __attribute__ ((aligned (64)));
+   uint32_t vhash[24*4] __attribute__ ((aligned (64)));
+   x21s_4way_context_overlay ctx;
+   void *in0 = (void*) hash0;
+   void *in1 = (void*) hash1;
+   void *in2 = (void*) hash2;
+   void *in3 = (void*) hash3;
+   int size = 80;
+
+   dintrlv_4x64( hash0, hash1, hash2, hash3, input, 640 );
+
+   // Input data is both 64 bit interleaved (input)
+   // and deinterleaved in inp0-3.
+   // If First function uses 64 bit data it is not required to interleave inp
+   // first. It may use the inerleaved data dmost convenient, ie 4way 64 bit.
+   // All other functions assume data is deinterleaved in hash0-3
+   // All functions must exit with data deinterleaved in hash0-3.
+   // Alias in0-3 points to either inp0-3 or hash0-3 according to
+   // its hashOrder position. Size is also set accordingly.
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            blake512_4way_init( &ctx.blake );
+            if ( i == 0 )
+               blake512_4way( &ctx.blake, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               blake512_4way( &ctx.blake, vhash, size );
+            }
+            blake512_4way_close( &ctx.blake, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case BMW:
+            bmw512_4way_init( &ctx.bmw );
+            if ( i == 0 )
+               bmw512_4way( &ctx.bmw, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               bmw512_4way( &ctx.bmw, vhash, size );
+            }
+            bmw512_4way_close( &ctx.bmw, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case GROESTL:
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                 (const char*)in0, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                 (const char*)in1, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                 (const char*)in2, size<<3 );
+               init_groestl( &ctx.groestl, 64 );
+               update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                 (const char*)in3, size<<3 );
+         break;
+         case SKEIN:
+            skein512_4way_init( &ctx.skein );
+            if ( i == 0 )
+               skein512_4way( &ctx.skein, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               skein512_4way( &ctx.skein, vhash, size );
+            }
+            skein512_4way_close( &ctx.skein, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case JH:
+            jh512_4way_init( &ctx.jh );
+            if ( i == 0 )
+               jh512_4way( &ctx.jh, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               jh512_4way( &ctx.jh, vhash, size );
+            }
+            jh512_4way_close( &ctx.jh, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case KECCAK:
+            keccak512_4way_init( &ctx.keccak );
+            if ( i == 0 )
+               keccak512_4way( &ctx.keccak, input, size );
+            else
+            {
+               intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+               keccak512_4way( &ctx.keccak, vhash, size );
+            }
+            keccak512_4way_close( &ctx.keccak, vhash );
+            dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case LUFFA:
+            intrlv_2x128( vhash, in0, in1, size<<3 );
+            luffa_2way_init( &ctx.luffa, 512 );
+            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size );
+            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            intrlv_2x128( vhash, in2, in3, size<<3 );
+            luffa_2way_init( &ctx.luffa, 512 );
+            luffa_2way_update_close( &ctx.luffa, vhash, vhash, size);
+            dintrlv_2x128( hash2, hash3, vhash, 512 );
+         break;
+         case CUBEHASH:
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash0,
+                                  (const byte*)in0, size );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash1,
+                                  (const byte*)in1, size );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash2,
+                                  (const byte*)in2, size );
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash3,
+                                  (const byte*)in3, size );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in0, size );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in1, size );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in2, size );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in3, size );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+         break;
+         case SIMD:
+            intrlv_2x128( vhash, in0, in1, size<<3 );
+            simd_2way_init( &ctx.simd, 512 );
+            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_2x128( hash0, hash1, vhash, 512 );
+            intrlv_2x128( vhash, in2, in3, size<<3 );
+            simd_2way_init( &ctx.simd, 512 );
+            simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 );
+            dintrlv_2x128( hash2, hash3, vhash, 512 );
+         break;
+         case ECHO:
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash0,
+                                (const BitSequence*)in0, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash1,
+                                (const BitSequence*)in1, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash2,
+                                (const BitSequence*)in2, size<<3 );
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash3,
+                                (const BitSequence*)in3, size<<3 );
+         break;
+         case HAMSI:
+             intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             hamsi512_4way_init( &ctx.hamsi );
+             hamsi512_4way( &ctx.hamsi, vhash, size );
+             hamsi512_4way_close( &ctx.hamsi, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in0, size );
+             sph_fugue512_close( &ctx.fugue, hash0 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in1, size );
+             sph_fugue512_close( &ctx.fugue, hash1 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in2, size );
+             sph_fugue512_close( &ctx.fugue, hash2 );
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in3, size );
+             sph_fugue512_close( &ctx.fugue, hash3 );
+         break;
+         case SHABAL:
+             intrlv_4x32( vhash, in0, in1, in2, in3, size<<3 );
+             shabal512_4way_init( &ctx.shabal );
+             shabal512_4way( &ctx.shabal, vhash, size );
+             shabal512_4way_close( &ctx.shabal, vhash );
+             dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in0, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash0 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in1, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash1 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in2, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash2 );
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in3, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash3 );
+         break;
+         case SHA_512:
+             intrlv_4x64( vhash, in0, in1, in2, in3, size<<3 );
+             sha512_4way_init( &ctx.sha512 );
+             sha512_4way( &ctx.sha512, vhash, size );
+             sha512_4way_close( &ctx.sha512, vhash );
+             dintrlv_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+         break;
+      }
+      size = 64;
+   }
+
+   intrlv_4x32( vhash, hash0, hash1, hash2, hash3,  512 );
+
+   haval256_5_4way_init( &ctx.haval );
+   haval256_5_4way( &ctx.haval, vhash, 64 );
+   haval256_5_4way_close( &ctx.haval, vhash );
+
+   dintrlv_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash0, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash0 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash1, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash1 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash2, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash2 );
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash3, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash3 );
+
+   LYRA2REV2( x21s_4way_matrix, (void*) hash0, 32, (const void*) hash0, 32,
+            (const void*) hash0, 32, 1, 4, 4 );
+   LYRA2REV2( x21s_4way_matrix, (void*) hash1, 32, (const void*) hash1, 32,
+            (const void*) hash1, 32, 1, 4, 4 );
+   LYRA2REV2( x21s_4way_matrix, (void*) hash2, 32, (const void*) hash2, 32,
+            (const void*) hash2, 32, 1, 4, 4 );
+   LYRA2REV2( x21s_4way_matrix, (void*) hash3, 32, (const void*) hash3, 32,
+            (const void*) hash3, 32, 1, 4, 4 );
+
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash0, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash0 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash1, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash1 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash2, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash2 );
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash3, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash3 );
+
+#if defined(__SHA__)
+
+   SHA256_Init( &ctx.sha256 );
+   SHA256_Update( &ctx.sha256, hash0, 64 );
+   SHA256_Final( (unsigned char*)hash0, &ctx.sha256 );
+   SHA256_Init( &ctx.sha256 );
+   SHA256_Update( &ctx.sha256, hash1, 64 );
+   SHA256_Final( (unsigned char*)hash1, &ctx.sha256 );
+   SHA256_Init( &ctx.sha256 );
+   SHA256_Update( &ctx.sha256, hash2, 64 );
+   SHA256_Final( (unsigned char*)hash2, &ctx.sha256 );
+   SHA256_Init( &ctx.sha256 );
+   SHA256_Update( &ctx.sha256, hash3, 64 );
+   SHA256_Final( (unsigned char*)hash3, &ctx.sha256 );
+
+   memcpy( output,    hash0, 32 );
+   memcpy( output+32, hash1, 32 );
+   memcpy( output+64, hash2, 32 );
+   memcpy( output+96, hash3, 32 );
+
+#else
+
+   intrlv_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+   sha256_4way_init( &ctx.sha256 );
+   sha256_4way( &ctx.sha256, vhash, 64 );
+   sha256_4way_close( &ctx.sha256, vhash );
+   dintrlv_4x32( output, output+32, output+64,output+96, vhash, 256 );
+
+#endif
+}
+
+int scanhash_x21s_4way( struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done, struct thr_info *mythr)
+{
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   int thr_id = mythr->id; 
+    __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   casti_m256i( endiandata, 0 ) = mm256_bswap_32( casti_m256i( pdata, 0 ) );
+   casti_m256i( endiandata, 1 ) = mm256_bswap_32( casti_m256i( pdata, 1 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   if ( s_ntime != endiandata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);
+      x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   intrlv_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do
+   {
+      *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
+               _mm256_set_epi32( n+3, 0, n+2, 0, n+1, 0, n, 0 ) ), *noncev );
+
+      x21s_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 4; i++ )  if ( (hash+(i<<3))[7] <= Htarg )
+      if( fulltest( hash+(i<<3), ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = n+i;
+         submit_lane_solution( work, hash+(i<<3), mythr, i );
+      }
+      n += 4;
+   } while ( (  n < max_nonce ) && !(*restart) );
+
+   *hashes_done = n - first_nonce + 1;
+   return 0;
+}
+
+bool x21s_4way_thread_init()
+{
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+   const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
+   x21s_4way_matrix = _mm_malloc( size, 64 );
+   return x21s_4way_matrix;
+}
+
+#endif
--- a/algo/x16/x21s.c
+++ b/algo/x16/x21s.c
@@ -0,0 +1,263 @@
+/**
+ * x16r algo implementation
+ *
+ * Implementation by tpruvot@github Jan 2018
+ * Optimized by JayDDee@github Jan 2018
+ */
+#include "x16r-gate.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/groestl/sph_groestl.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/luffa_for_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#include "algo/simd/nist.h"
+#include "algo/echo/sph_echo.h"
+#include "algo/hamsi/sph_hamsi.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/sph_shabal.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include <openssl/sha.h>
+#if defined(__AES__)
+  #include "algo/echo/aes_ni/hash_api.h"
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+#endif
+#include "algo/haval/sph-haval.h"
+#include "algo/tiger/sph_tiger.h"
+#include "algo/gost/sph_gost.h"
+#include "algo/lyra2/lyra2.h"
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+
+static __thread uint64_t* x21s_matrix;
+
+union _x21s_context_overlay
+{
+#if defined(__AES__)
+        hashState_echo          echo;
+        hashState_groestl       groestl;
+#else
+        sph_groestl512_context   groestl;
+        sph_echo512_context      echo;
+#endif
+        sph_blake512_context    blake;
+        sph_bmw512_context      bmw;
+        sph_skein512_context    skein;
+        sph_jh512_context       jh;
+        sph_keccak512_context   keccak;
+        hashState_luffa         luffa;
+        cubehashParam           cube;
+        sph_shavite512_context  shavite;
+        hashState_sd            simd;
+        sph_hamsi512_context    hamsi;
+        sph_fugue512_context    fugue;
+        sph_shabal512_context   shabal;
+        sph_whirlpool_context   whirlpool;
+        SHA512_CTX              sha512;
+        sph_haval256_5_context  haval;
+        sph_tiger_context       tiger;
+        sph_gost512_context     gost;
+        SHA256_CTX              sha256;
+};
+typedef union _x21s_context_overlay x21s_context_overlay;
+
+void x21s_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(128) hash[16];
+   x21s_context_overlay ctx;
+   void *in = (void*) input;
+   int size = 80;
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+         case BLAKE:
+            sph_blake512_init( &ctx.blake );
+            sph_blake512( &ctx.blake, in, size );
+            sph_blake512_close( &ctx.blake, hash );
+         break;
+         case BMW:
+            sph_bmw512_init( &ctx.bmw );
+            sph_bmw512(&ctx.bmw, in, size);
+            sph_bmw512_close(&ctx.bmw, hash);
+         break;
+         case GROESTL:
+#if defined(__AES__)
+            init_groestl( &ctx.groestl, 64 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                      (const char*)in, size<<3 );
+#else
+            sph_groestl512_init( &ctx.groestl );
+            sph_groestl512( &ctx.groestl, in, size );
+            sph_groestl512_close(&ctx.groestl, hash);
+#endif
+         break;
+         case SKEIN:
+            sph_skein512_init( &ctx.skein );
+            sph_skein512( &ctx.skein, in, size );
+            sph_skein512_close( &ctx.skein, hash );
+         break;
+         case JH:
+            sph_jh512_init( &ctx.jh );
+            sph_jh512(&ctx.jh, in, size );
+            sph_jh512_close(&ctx.jh, hash );
+         break;
+         case KECCAK:
+            sph_keccak512_init( &ctx.keccak );
+            sph_keccak512( &ctx.keccak, in, size );
+            sph_keccak512_close( &ctx.keccak, hash );
+         break;
+         case LUFFA:
+            init_luffa( &ctx.luffa, 512 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                    (const BitSequence*)in, size );
+         break;
+         case CUBEHASH:
+            cubehashInit( &ctx.cube, 512, 16, 32 );
+            cubehashUpdateDigest( &ctx.cube, (byte*) hash,
+                                  (const byte*)in, size );
+         break;
+         case SHAVITE:
+            sph_shavite512_init( &ctx.shavite );
+            sph_shavite512( &ctx.shavite, in, size );
+            sph_shavite512_close( &ctx.shavite, hash );
+         break;
+         case SIMD:
+             init_sd( &ctx.simd, 512 );
+             update_final_sd( &ctx.simd, (BitSequence *)hash,
+                              (const BitSequence*)in, size<<3 );
+         break;
+         case ECHO:
+#if defined(__AES__)
+             init_echo( &ctx.echo, 512 );
+             update_final_echo ( &ctx.echo, (BitSequence *)hash,
+                                (const BitSequence*)in, size<<3 );
+#else
+             sph_echo512_init( &ctx.echo );
+             sph_echo512( &ctx.echo, in, size );
+             sph_echo512_close( &ctx.echo, hash );
+#endif
+         break;
+         case HAMSI:
+             sph_hamsi512_init( &ctx.hamsi );
+             sph_hamsi512( &ctx.hamsi, in, size );
+             sph_hamsi512_close( &ctx.hamsi, hash );
+         break;
+         case FUGUE:
+             sph_fugue512_init( &ctx.fugue );
+             sph_fugue512( &ctx.fugue, in, size );
+             sph_fugue512_close( &ctx.fugue, hash );
+         break;
+         case SHABAL:
+             sph_shabal512_init( &ctx.shabal );
+             sph_shabal512( &ctx.shabal, in, size );
+             sph_shabal512_close( &ctx.shabal, hash );
+         break;
+         case WHIRLPOOL:
+             sph_whirlpool_init( &ctx.whirlpool );
+             sph_whirlpool( &ctx.whirlpool, in, size );
+             sph_whirlpool_close( &ctx.whirlpool, hash );
+         break;
+         case SHA_512:
+             SHA512_Init( &ctx.sha512 );
+             SHA512_Update( &ctx.sha512, in, size );
+             SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
+         break;
+      }
+      in = (void*) hash;
+      size = 64;
+   }
+
+   sph_haval256_5_init( &ctx.haval );
+   sph_haval256_5( &ctx.haval, (const void*) hash, 64) ;
+   sph_haval256_5_close( &ctx.haval, hash );
+
+   sph_tiger_init( &ctx.tiger );
+   sph_tiger ( &ctx.tiger, (const void*) hash, 64 );
+   sph_tiger_close( &ctx.tiger, (void*) hash );
+
+   LYRA2REV2( x21s_matrix, (void*) hash, 32, (const void*) hash, 32,
+               (const void*) hash, 32, 1, 4, 4);
+
+   sph_gost512_init( &ctx.gost );
+   sph_gost512 ( &ctx.gost, (const void*) hash, 64 );
+   sph_gost512_close( &ctx.gost, (void*) hash );
+
+   SHA256_Init( &ctx.sha256 );
+   SHA256_Update( &ctx.sha256, hash, 64 );
+   SHA256_Final( (unsigned char*)hash, &ctx.sha256 );
+
+   memcpy( output, hash, 32 );
+}
+
+int scanhash_x21s( struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done, struct thr_info *mythr )
+{
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   int thr_id = mythr->id;  // thr_id arg is deprecated
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   casti_m128i( endiandata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) );
+   casti_m128i( endiandata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) );
+   casti_m128i( endiandata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) );
+   casti_m128i( endiandata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) );
+   casti_m128i( endiandata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) );
+
+   if ( s_ntime != pdata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);
+      x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+              applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   do
+   {
+      be32enc( &endiandata[19], nonce );
+      x21s_hash( hash32, endiandata );
+
+      if ( hash32[7] <= Htarg )
+      if (fulltest( hash32, ptarget ) && !opt_benchmark )
+      {
+         pdata[19] = nonce;
+         submit_solution( work, hash32, mythr );
+      }
+      nonce++;
+   } while ( nonce < max_nonce && !(*restart) );
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
+}
+
+bool x21s_thread_init()
+{
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+
+   const int size = (int64_t)ROW_LEN_BYTES * 4; // nRows;
+   x21s_matrix = _mm_malloc( size, 64 );
+   return x21s_matrix;
+}
+
--- a/algo/x17/sonoa-4way.c
+++ b/algo/x17/sonoa-4way.c
@@ -23,7 +23,7 @@
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/haval-hash-4way.h"
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"

 union _sonoa_4way_context_overlay
 {
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -22,7 +22,7 @@
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/haval/haval-hash-4way.h"
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"

 union _x17_4way_context_overlay
 {
@@ -210,11 +210,11 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
     uint32_t lane_hash[8] __attribute__ ((aligned (32)));
     uint32_t *hash7 = &(hash[7<<2]);
     uint32_t *pdata = work->data;
-     uint32_t *ptarget = work->target;
-     uint32_t n = pdata[19];
+     const uint32_t *ptarget = work->target;
     const uint32_t first_nonce = pdata[19];
     __m256i  *noncev = (__m256i*)vdata + 9;   // aligned
-     int thr_id = mythr->id;  // thr_id arg is deprecated
+     uint32_t n = first_nonce;
+     const int thr_id = mythr->id;
     const uint32_t Htarg = ptarget[7];
     uint64_t htmax[] = {          0,        0xF,       0xFF,
                               0xFFF,     0xFFFF, 0x10000000  };
@@ -225,7 +225,7 @@ int scanhash_x17_4way( struct work *work, uint32_t max_nonce,
     mm256_bswap32_intrlv80_4x64( vdata, pdata );
     for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
     {
-        uint32_t mask = masks[ m ];
+        const uint32_t mask = masks[ m ];
        do
        {
           *noncev = mm256_intrlv_blend_32( mm256_bswap_32(
--- a/algo/x17/xevan-4way.c
+++ b/algo/x17/xevan-4way.c
@@ -22,7 +22,7 @@
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/sha/sha2-hash-4way.h"
+#include "algo/sha/sha-hash-4way.h"
 #include "algo/haval/haval-hash-4way.h"

 union _xevan_4way_context_overlay
--- a/algo/x17/xevan-gate.c
+++ b/algo/x17/xevan-gate.c
@@ -1,10 +1,5 @@
 #include "xevan-gate.h"

-void xevan_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
 bool register_xevan_algo( algo_gate_t* gate )
 {
 #if defined (XEVAN_4WAY)
@@ -17,8 +12,8 @@ bool register_xevan_algo( algo_gate_t* gate )
  gate->hash      = (void*)&xevan_hash;
 #endif
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
-  gate->set_target = (void*)&xevan_set_target;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  opt_target_factor = 256.0;
  return true;
 };

--- a/algo/x20/x20r-gate.c
+++ b/algo/x20/x20r-gate.c
@@ -26,9 +26,9 @@ bool register_x20r_algo( algo_gate_t* gate )
  gate->scanhash  = (void*)&scanhash_x20r;
  gate->hash      = (void*)&x20r_hash;
 #endif
-  gate->set_target = (void*)&alt_set_target;
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  x20_r_s_getAlgoString = (void*)&x20r_getAlgoString;
+  opt_target_factor = 256.;
  return true;
 };

--- a/algo/yescrypt/yescrypt.c
+++ b/algo/yescrypt/yescrypt.c
@@ -431,18 +431,38 @@ void yescrypt_gate_base(algo_gate_t *gate )
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yescrypt;
   gate->hash       = (void*)&yescrypt_hash;
-   gate->set_target = (void*)&scrypt_set_target;
+   opt_target_factor = 65536.0;
 }

 bool register_yescrypt_algo( algo_gate_t* gate )
 {
   yescrypt_gate_base( gate );
   gate->get_max64  = (void*)&yescrypt_get_max64;
-   yescrypt_client_key = NULL;
-   yescrypt_client_key_len = 0;
-   YESCRYPT_N = 2048;
-   YESCRYPT_R = 8;
+
+   if ( opt_param_n )  YESCRYPT_N = opt_param_n;
+   else                YESCRYPT_N = 2048;
+
+   if ( opt_param_r )  YESCRYPT_R = opt_param_r;
+   else                YESCRYPT_R = 8;
+ 
+   if ( opt_param_key ) 
+   {   
+     yescrypt_client_key = opt_param_key;
+     yescrypt_client_key_len = strlen( opt_param_key );
+   }
+   else
+   {   
+     yescrypt_client_key = NULL;
+     yescrypt_client_key_len = 0;
+   }
+
   YESCRYPT_P = 1;
+
+   applog( LOG_NOTICE,"Yescrypt parameters: N= %d, R= %d.", YESCRYPT_N,
+                                                            YESCRYPT_R );
+   if ( yescrypt_client_key )
+     applog( LOG_NOTICE,"Key= \"%s\"\n", yescrypt_client_key );
+
   return true;
 }

--- a/algo/yespower/yespower.c
+++ b/algo/yespower/yespower.c
@@ -78,15 +78,34 @@ int64_t yespower_get_max64()
 bool register_yespower_algo( algo_gate_t* gate )
 {
  yespower_params.version = YESPOWER_1_0;
-  yespower_params.N       = 2048;
-  yespower_params.r       = 32;
-  yespower_params.pers    = NULL;
-  yespower_params.perslen = 0;
+
+  if ( opt_param_n )  yespower_params.N = opt_param_n;
+  else                yespower_params.N = 2048;
+
+  if ( opt_param_r )  yespower_params.r = opt_param_r;
+  else                yespower_params.r = 32;
+
+  if ( opt_param_key )
+  {
+     yespower_params.pers = opt_param_key;
+     yespower_params.perslen = strlen( opt_param_key );
+  }
+  else
+  {
+     yespower_params.pers    = NULL;
+     yespower_params.perslen = 0;
+  }
+
+  applog( LOG_NOTICE,"Yespower parameters: N= %d, R= %d.", yespower_params.N,
+                                                           yespower_params.r );
+  if ( yespower_params.pers )
+     applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers );
+
  gate->optimizations = SSE2_OPT;
  gate->get_max64     = (void*)&yespower_get_max64;
  gate->scanhash      = (void*)&scanhash_yespower;
  gate->hash          = (void*)&yespower_hash;
-  gate->set_target    = (void*)&scrypt_set_target;
+  opt_target_factor = 65536.0;
  return true;
 };

@@ -101,7 +120,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate )
  gate->get_max64     = (void*)&yespower_get_max64;
  gate->scanhash      = (void*)&scanhash_yespower;
  gate->hash          = (void*)&yespower_hash;
-  gate->set_target    = (void*)&scrypt_set_target;
+  opt_target_factor = 65536.0;
  return true;
 };

@@ -120,13 +139,13 @@ bool register_yescrypt_05_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64  = (void*)&yescrypt_05_get_max64;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 2048;
   yespower_params.r       = 8;
   yespower_params.pers    = NULL;
   yespower_params.perslen = 0;
+   opt_target_factor = 65536.0;
   return true;
 }

@@ -134,13 +153,13 @@ bool register_yescryptr8_05_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64  = (void*)&yescrypt_05_get_max64;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 2048;
   yespower_params.r       = 8;
   yespower_params.pers    = "Client Key";
   yespower_params.perslen = 10;
+   opt_target_factor = 65536.0;
   return true;
 }

@@ -148,13 +167,13 @@ bool register_yescryptr16_05_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64  = (void*)&yescryptr16_05_get_max64;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 4096;
   yespower_params.r       = 16;
   yespower_params.pers    = NULL;
   yespower_params.perslen = 0;
+   opt_target_factor = 65536.0;
   return true;
 }

@@ -162,13 +181,13 @@ bool register_yescryptr32_05_algo( algo_gate_t* gate )
 {
   gate->optimizations = SSE2_OPT | SHA_OPT;
   gate->scanhash   = (void*)&scanhash_yespower;
-   gate->set_target = (void*)&scrypt_set_target;
   gate->get_max64  = (void*)&yescryptr16_05_get_max64;
   yespower_params.version = YESPOWER_0_5;
   yespower_params.N       = 4096;
   yespower_params.r       = 32;
   yespower_params.pers    = "WaviBanana";
   yespower_params.perslen = 10;
+   opt_target_factor = 65536.0;
   return true;
 }

--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.6.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.9.8.1.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.9.6'
-PACKAGE_STRING='cpuminer-opt 3.9.6'
+PACKAGE_VERSION='3.9.8.1'
+PACKAGE_STRING='cpuminer-opt 3.9.8.1'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1332,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.9.6 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.9.8.1 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1404,7 +1404,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.9.6:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.9.8.1:";;
   esac
  cat <<\_ACEOF

@@ -1509,7 +1509,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.9.6
+cpuminer-opt configure 3.9.8.1
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2012,7 +2012,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.9.6, which was
+It was created by cpuminer-opt $as_me 3.9.8.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2993,7 +2993,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.9.6'
+ VERSION='3.9.8.1'


 cat >>confdefs.h <<_ACEOF
@@ -6690,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.9.6, which was
+This file was extended by cpuminer-opt $as_me 3.9.8.1, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6756,7 +6756,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.9.6
+cpuminer-opt config.status 3.9.8.1
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.9.6])
+AC_INIT([cpuminer-opt], [3.9.8.1])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
--- a/miner.h
+++ b/miner.h
@@ -310,6 +310,7 @@ struct thr_api {
 #define CL_WHT  "\x1B[01;37m" /* white */

 void   applog(int prio, const char *fmt, ...);
+void   applog2(int prio, const char *fmt, ...);
 void   restart_threads(void);
 extern json_t *json_rpc_call( CURL *curl, const char *url, const char *userpass,
                	const char *rpc_req, int *curl_err, int flags );
@@ -331,6 +332,24 @@ extern void diff_to_target(uint32_t *target, double diff);
 double hash_target_ratio( uint32_t* hash, uint32_t* target );
 void   work_set_target_ratio( struct work* work, uint32_t* hash );

+struct thr_info {
+        int id;
+        pthread_t pth;
+        pthread_attr_t attr;
+        struct thread_q *q;
+        struct cpu_info cpu;
+};
+
+//struct thr_info *thr_info;
+
+bool   submit_solution( struct work *work, void *hash,
+                        struct thr_info *thr );
+bool   submit_lane_solution( struct work *work, void *hash,
+                             struct thr_info *thr, int lane );
+
+
+//bool submit_work( struct thr_info *thr, const struct work *work_in );
+

 void   get_currentalgo( char* buf, int sz );
 bool   has_sha();
@@ -355,7 +374,7 @@ struct work {
 	uint32_t target[8];

 	double targetdiff;
-	double shareratio;
+//	double shareratio;
 	double sharediff;

 	int height;
@@ -471,6 +490,9 @@ void print_hash_tests(void);

 void scale_hash_for_display ( double* hashrate, char* units );

+void report_summary_log( bool force );
+
+/*
 struct thr_info {
        int id;
        pthread_t pth;
@@ -478,6 +500,7 @@ struct thr_info {
        struct thread_q *q;
        struct cpu_info cpu;
 };
+*/

 struct work_restart {
        volatile uint8_t restart;
@@ -510,9 +533,9 @@ enum algos {
        ALGO_AXIOM,       
        ALGO_BASTION,
        ALGO_BLAKE,       
-        ALGO_BLAKECOIN,   
-//        ALGO_BLAKE2B,
+        ALGO_BLAKE2B,
        ALGO_BLAKE2S,     
+        ALGO_BLAKECOIN,
        ALGO_BMW,        
        ALGO_BMW512,
        ALGO_C11,         
@@ -526,6 +549,7 @@ enum algos {
        ALGO_FRESH,       
        ALGO_GROESTL,     
        ALGO_HEAVY,
+        ALGO_HEX,
        ALGO_HMQ1725,
        ALGO_HODL,
        ALGO_JHA,
@@ -577,10 +601,12 @@ enum algos {
        ALGO_X14,        
        ALGO_X15,       
        ALGO_X16R,
+        ALGO_X16RV2,
        ALGO_X16RT,
        ALGO_X16RT_VEIL,
        ALGO_X16S,
        ALGO_X17,
+        ALGO_X21S,
        ALGO_XEVAN,
        ALGO_YESCRYPT,
        ALGO_YESCRYPTR8,
@@ -602,9 +628,9 @@ static const char* const algo_names[] = {
        "axiom",
        "bastion",
        "blake",
-        "blakecoin",
-//        "blake2b",
+        "blake2b",
        "blake2s",
+        "blakecoin",
        "bmw",
        "bmw512",
        "c11",
@@ -618,6 +644,7 @@ static const char* const algo_names[] = {
        "fresh",
        "groestl",
        "heavy",
+        "hex",
        "hmq1725",
        "hodl",
        "jha",
@@ -669,10 +696,12 @@ static const char* const algo_names[] = {
        "x14",
        "x15",
        "x16r",
+        "x16rv2",
        "x16rt",
        "x16rt-veil",
        "x16s",
        "x17",
+        "x21s",
        "xevan",
        "yescrypt",
        "yescryptr8",
@@ -725,8 +754,11 @@ extern double stratum_diff;
 extern double net_diff;
 extern double net_hashrate;
 extern int opt_pluck_n;
-extern int opt_scrypt_n;
+extern int opt_param_n;
+extern int opt_param_r;
+extern char* opt_param_key;
 extern double opt_diff_factor;
+extern double opt_target_factor;
 extern bool opt_randomize;
 extern bool allow_mininginfo;
 extern time_t g_work_time;
@@ -757,8 +789,9 @@ Options:\n\
                          axiom         Shabal-256 MemoHash\n\
                          bastion\n\
                          blake         blake256r14 (SFR)\n\
-                          blakecoin     blake256r8\n\
+                          blake2b       Blake2b 256\n\
                          blake2s       Blake-2 S\n\
+                          blakecoin     blake256r8\n\
                          bmw           BMW 256\n\
                          bmw512        BMW 512\n\
                          c11           Chaincoin\n\
@@ -772,6 +805,7 @@ Options:\n\
                          fresh         Fresh\n\
                          groestl       Groestl coin\n\
                          heavy         Heavy\n\
+                          hex           x16r-hex\n\
                          hmq1725       Espers\n\
                          hodl          Hodlcoin\n\
                          jha           jackppot (Jackpotcoin)\n\
@@ -790,8 +824,8 @@ Options:\n\
                          neoscrypt     NeoScrypt(128, 2, 1)\n\
                          nist5         Nist5\n\
                          pentablake    5 x blake512\n\
-                          phi1612       phi, LUX coin (original algo)\n\
-                          phi2          LUX (new algo)\n\
+                          phi1612       phi\n\
+                          phi2\n\
 			                 pluck         Pluck:128 (Supcoin)\n\
                          polytimos\n\
                          quark         Quark\n\
@@ -823,11 +857,13 @@ Options:\n\
                          x13sm3        hsr (Hshare)\n\
                          x14           X14\n\
                          x15           X15\n\
-                          x16r          Ravencoin (RVN)\n\
+                          x16r\n\
+                          x16rv2        Ravencoin (RVN)\n\
                          x16rt         Gincoin (GIN)\n\
                          x16rt-veil    Veil (VEIL)\n\
                          x16s          Pigeoncoin (PGN)\n\
                          x17\n\
+                          x21s\n\
                          xevan         Bitsend (BSD)\n\
                          yescrypt      Globalboost-Y (BSTY)\n\
                          yescryptr8    BitZeny (ZNY)\n\
@@ -836,6 +872,9 @@ Options:\n\
                          yespower      Cryply\n\
                          yespowerr16   Yenten (YTN)\n\
                          zr5           Ziftr\n\
+  -N, --param-n         N parameter for scrypt based algos\n\
+  -R, --patam-r         R parameter for scrypt based algos\n\
+  -K, --param-key       Key parameter for algos that use it\n\
  -o, --url=URL         URL of mining server\n\
  -O, --userpass=U:P    username:password pair for mining server\n\
  -u, --user=USERNAME   username for mining server\n\
@@ -845,7 +884,7 @@ Options:\n\
  -t, --threads=N       number of miner threads (default: number of processors)\n\
  -r, --retries=N       number of times to retry if a network call fails\n\
                          (default: retry indefinitely)\n\
-  -R, --retry-pause=N   time to pause between retries, in seconds (default: 30)\n\
+      --retry-pause=N   time to pause between retries, in seconds (default: 30)\n\
      --time-limit=N    maximum time [s] to mine before exiting the program.\n\
  -T, --timeout=N       timeout for long poll and stratum (default: 300 seconds)\n\
  -s, --scantime=N      upper bound on time spent scanning current work when\n\
@@ -920,6 +959,7 @@ static struct option const options[] = {
        { "hash-meter", 0, NULL, 1014 },
        { "hide-diff", 0, NULL, 1013 },
        { "help", 0, NULL, 'h' },
+        { "key", 1, NULL, 'K' },
        { "no-gbt", 0, NULL, 1011 },
        { "no-getwork", 0, NULL, 1010 },
        { "no-longpoll", 0, NULL, 1003 },
@@ -929,13 +969,16 @@ static struct option const options[] = {
        { "max-temp", 1, NULL, 1060 },
        { "max-diff", 1, NULL, 1061 },
        { "max-rate", 1, NULL, 1062 },
+        { "param-key", 1, NULL, 'K' },
+        { "param-n", 1, NULL, 'N' },
+        { "param-r", 1, NULL, 'R' },
        { "pass", 1, NULL, 'p' },
        { "protocol", 0, NULL, 'P' },
        { "protocol-dump", 0, NULL, 'P' },
        { "proxy", 1, NULL, 'x' },
        { "quiet", 0, NULL, 'q' },
        { "retries", 1, NULL, 'r' },
-        { "retry-pause", 1, NULL, 'R' },
+        { "retry-pause", 1, NULL, 1025 },
        { "randomize", 0, NULL, 1024 },
        { "scantime", 1, NULL, 's' },
 #ifdef HAVE_SYSLOG_H
--- a/simd-utils.h
+++ b/simd-utils.h
@@ -175,7 +175,6 @@

 // 64 bit vectors
 #include "simd-utils/simd-64.h"
-//#include "simd-utils/intrlv-mmx.h"

 #if defined(__SSE2__)

@@ -189,6 +188,8 @@

 #if defined(__AVX2__)

+// Utilities that require AVX2 are defined in simd-256.h.
+
 // Skylake-X has all these
 #if defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)

--- a/simd-utils/intrlv.h
+++ b/simd-utils/intrlv.h
@@ -477,13 +477,15 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
   __m256i s0 = mm256_bswap_32( casti_m256i( src,0 ) );
   __m256i s1 = mm256_bswap_32( casti_m256i( src,1 ) );
   __m128i s2 = mm128_bswap_32( casti_m128i( src,4 ) );
-  const __m256i zero  = m256_zero;
+//  const __m256i zero  = m256_zero;
  const __m256i one   = m256_one_32;
  const __m256i two   = _mm256_add_epi32( one, one );
  const __m256i three = _mm256_add_epi32( two, one );
  const __m256i four  = _mm256_add_epi32( two, two );

-  casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, zero  );
+  casti_m256i( d, 0 ) = _mm256_broadcastd_epi32(
+                             _mm256_castsi256_si128( s0 ) );
+//  casti_m256i( d, 0 ) = _mm256_permutevar8x32_epi32( s0, m256_zero  );
  casti_m256i( d, 1 ) = _mm256_permutevar8x32_epi32( s0, one   );
  casti_m256i( d, 2 ) = _mm256_permutevar8x32_epi32( s0, two   );
  casti_m256i( d, 3 ) = _mm256_permutevar8x32_epi32( s0, three );
@@ -494,7 +496,9 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
                                       _mm256_add_epi32( four, two   ) );
  casti_m256i( d, 7 ) = _mm256_permutevar8x32_epi32( s0,
                                       _mm256_add_epi32( four, three ) );
-  casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, zero  );
+  casti_m256i( d, 8 ) = _mm256_broadcastd_epi32(
+                             _mm256_castsi256_si128( s1 ) );
+//  casti_m256i( d, 8 ) = _mm256_permutevar8x32_epi32( s1, m256_zero  );
  casti_m256i( d, 9 ) = _mm256_permutevar8x32_epi32( s1, one   );
  casti_m256i( d,10 ) = _mm256_permutevar8x32_epi32( s1, two   );
  casti_m256i( d,11 ) = _mm256_permutevar8x32_epi32( s1, three );
@@ -505,8 +509,9 @@ static inline void mm256_bswap32_intrlv80_8x32( void *d, void *src )
                                       _mm256_add_epi32( four, two   ) );
  casti_m256i( d,15 ) = _mm256_permutevar8x32_epi32( s1,
                                       _mm256_add_epi32( four, three ) );
-  casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32(
-                             _mm256_castsi128_si256( s2 ), zero  );
+  casti_m256i( d,16 ) = _mm256_broadcastd_epi32(     s2 );
+//  casti_m256i( d,16 ) = _mm256_permutevar8x32_epi32(
+//                             _mm256_castsi128_si256( s2 ), m256_zero  );
  casti_m256i( d,17 ) = _mm256_permutevar8x32_epi32(
                             _mm256_castsi128_si256( s2 ), one   );
  casti_m256i( d,18 ) = _mm256_permutevar8x32_epi32(
@@ -677,41 +682,43 @@ static inline void mm512_bswap32_intrlv80_16x32( void *d, void *src )
 {
  __m512i s0 = mm512_bswap_32( casti_m512i( src, 0 ) );
  __m128i s1 = mm128_bswap_32( casti_m128i( src, 4 ) );
-  const __m512i zero     = m512_zero;
-  const __m512i one      = m512_one_32;
-  const __m512i two      = _mm512_add_epi32( one,   one   );
-  const __m512i three    = _mm512_add_epi32( two,   one   );
-  const __m512i four     = _mm512_add_epi32( two,   two   );
-  const __m512i eight    = _mm512_add_epi32( four,  four  );
-  const __m512i eleven   = _mm512_add_epi32( eight, three );
+  const __m512i one   = m512_one_32;
+  const __m512i two   = _mm512_add_epi32( one,   one   );
+  const __m512i three = _mm512_add_epi32( two,   one   );
+        __m512i     x = _mm512_add_epi32( three, three );

-  casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, zero   );
-  casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one    );
-  casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two    );
-  casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, three  );
-  casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0, four   );
+  casti_m512i( d, 0 ) = _mm512_broadcastd_epi32(
+                          _mm512_castsi512_si128( s0 ) );
+//  casti_m512i( d, 0 ) = _mm512_permutexvar_epi32( s0, m512_zero );
+  casti_m512i( d, 1 ) = _mm512_permutexvar_epi32( s0, one   );
+  casti_m512i( d, 2 ) = _mm512_permutexvar_epi32( s0, two   );
+  casti_m512i( d, 3 ) = _mm512_permutexvar_epi32( s0, three );
+  casti_m512i( d, 4 ) = _mm512_permutexvar_epi32( s0,
+                                    _mm512_add_epi32( two,   two ) );
  casti_m512i( d, 5 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( four,   one   ) );
-  casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( four,   two   ) );
+                                    _mm512_add_epi32( three, two ) );
+  casti_m512i( d, 6 ) = _mm512_permutexvar_epi32( s0, x );
  casti_m512i( d, 7 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( four,   three ) );
-  casti_m512i( d, 8 ) = _mm512_permutexvar_epi32( s0, eight );
-  casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eight,  one   ) );
+                                    _mm512_add_epi32( x,      one   ) );
+  casti_m512i( d, 8 ) = _mm512_permutexvar_epi32( s0,
+                                    _mm512_add_epi32( x,      two   ) );
+  x = _mm512_add_epi32( x, three );
+  casti_m512i( d, 9 ) = _mm512_permutexvar_epi32( s0, x );
  casti_m512i( d,10 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( eight,  two   ) );
-  casti_m512i( d,11 ) = _mm512_permutexvar_epi32( s0, eleven ); 
-  casti_m512i( d,12 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, one   ) );
-  casti_m512i( d,13 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, two   ) );
-  casti_m512i( d,14 ) = _mm512_permutexvar_epi32( s0, 
-                                    _mm512_add_epi32( eleven, three ) );
+                                    _mm512_add_epi32( x,      one   ) );
+  casti_m512i( d,11 ) = _mm512_permutexvar_epi32( s0,
+                                    _mm512_add_epi32( x,      two   ) );
+  x = _mm512_add_epi32( x, three );
+  casti_m512i( d,12 ) = _mm512_permutexvar_epi32( s0, x );
+  casti_m512i( d,13 ) = _mm512_permutexvar_epi32( s0,
+                                    _mm512_add_epi32( x,      one   ) );
+  casti_m512i( d,14 ) = _mm512_permutexvar_epi32( s0,
+                                    _mm512_add_epi32( x,      two   ) );
  casti_m512i( d,15 ) = _mm512_permutexvar_epi32( s0,
-                                    _mm512_add_epi32( eleven, four  ) );
-  casti_m512i( d,16 ) = _mm512_permutexvar_epi32(
-                          _mm512_castsi128_si512( s1 ), zero );
+                                    _mm512_add_epi32( x,      three ) );
+  casti_m512i( d,16 ) = _mm512_broadcastd_epi32(  s1 );
+//  casti_m512i( d,16 ) = _mm512_permutexvar_epi32(
+//                          _mm512_castsi128_si512( s1 ), m512_zero );
  casti_m512i( d,17 ) = _mm512_permutexvar_epi32(
                          _mm512_castsi128_si512( s1 ), one  );
  casti_m512i( d,18 ) = _mm512_permutexvar_epi32(
@@ -769,14 +776,14 @@ static inline void dintrlv_2x64( void *dst0, void *dst1,

 // 4x64   (AVX2)

-static inline void intrlv_4x64( void *dst, const void *src0,
-           const void *src1, const void *src2, const void *src3, int bit_len )
+static inline void intrlv_4x64( void *dst, void *src0,
+           void *src1, void *src2, void *src3, int bit_len )
 {
   uint64_t *d = (uint64_t*)dst;
-   const uint64_t *s0 = (const uint64_t*)src0;
-   const uint64_t *s1 = (const uint64_t*)src1;
-   const uint64_t *s2 = (const uint64_t*)src2;
-   const uint64_t *s3 = (const uint64_t*)src3;
+   uint64_t *s0 = (uint64_t*)src0;
+   uint64_t *s1 = (uint64_t*)src1;
+   uint64_t *s2 = (uint64_t*)src2;
+   uint64_t *s3 = (uint64_t*)src3;
   d[  0] = s0[ 0];   d[  1] = s1[ 0];   d[  2] = s2[ 0];   d[  3] = s3[ 0];
   d[  4] = s0[ 1];   d[  5] = s1[ 1];   d[  6] = s2[ 1];   d[  7] = s3[ 1];
   d[  8] = s0[ 2];   d[  9] = s1[ 2];   d[ 10] = s2[ 2];   d[ 11] = s3[ 2];
@@ -870,19 +877,11 @@ static inline void extr_lane_4x64( void *d, const void *s,
   ((uint64_t*)d)[ 1] = ((uint64_t*)s)[ lane+ 4 ];
   ((uint64_t*)d)[ 2] = ((uint64_t*)s)[ lane+ 8 ];
   ((uint64_t*)d)[ 3] = ((uint64_t*)s)[ lane+12 ];
+   if ( bit_len <= 256 ) return;
   ((uint64_t*)d)[ 4] = ((uint64_t*)s)[ lane+16 ];
   ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+20 ];
   ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+24 ];
   ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+28 ];
-   if ( bit_len <= 256 ) return;
-   ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+32 ];
-   ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+36 ];
-   ((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+40 ];
-   ((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+44 ];
-   ((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+48 ];
-   ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+52 ];
-   ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+56 ];
-   ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+60 ];
 }

 #if defined(__AVX2__)
@@ -984,19 +983,11 @@ static inline void extr_lane_8x64( void *d, const void *s,
   ((uint64_t*)d)[ 1] = ((uint64_t*)s)[ lane+  8 ];
   ((uint64_t*)d)[ 2] = ((uint64_t*)s)[ lane+ 16 ];
   ((uint64_t*)d)[ 3] = ((uint64_t*)s)[ lane+ 24 ];
+   if ( bit_len <= 256 ) return;
   ((uint64_t*)d)[ 4] = ((uint64_t*)s)[ lane+ 32 ];
   ((uint64_t*)d)[ 5] = ((uint64_t*)s)[ lane+ 40 ];
   ((uint64_t*)d)[ 6] = ((uint64_t*)s)[ lane+ 48 ];
   ((uint64_t*)d)[ 7] = ((uint64_t*)s)[ lane+ 56 ];
-   if ( bit_len <= 256 ) return;
-   ((uint64_t*)d)[ 8] = ((uint64_t*)s)[ lane+ 64 ];
-   ((uint64_t*)d)[ 9] = ((uint64_t*)s)[ lane+ 72 ];
-   ((uint64_t*)d)[10] = ((uint64_t*)s)[ lane+ 80 ];
-   ((uint64_t*)d)[11] = ((uint64_t*)s)[ lane+ 88 ];
-   ((uint64_t*)d)[12] = ((uint64_t*)s)[ lane+ 96 ];
-   ((uint64_t*)d)[13] = ((uint64_t*)s)[ lane+104 ];
-   ((uint64_t*)d)[14] = ((uint64_t*)s)[ lane+112 ];
-   ((uint64_t*)d)[15] = ((uint64_t*)s)[ lane+120 ];
 }

 #if defined(__AVX512F__) && defined(__AVX512VL__)
@@ -1004,15 +995,17 @@ static inline void extr_lane_8x64( void *d, const void *s,
 static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src )
 {
   __m512i *d = (__m512i*)dst;
-   __m512i s0 = mm512_bswap_32( casti_m512i(src, 0 ) );
-   __m128i s1 = mm128_bswap_32( casti_m128i(src, 4 ) );
-  const __m512i zero   = m512_zero;
+   __m512i s0 = mm512_bswap_32( casti_m512i( src, 0 ) );
+   __m128i s1 = mm128_bswap_32( casti_m128i( src, 4 ) );
+//  const __m512i zero   = m512_zero;
  const __m512i one    = m512_one_64;
  const __m512i two    = _mm512_add_epi64( one, one );
  const __m512i three  = _mm512_add_epi64( two, one );
  const __m512i four   = _mm512_add_epi64( two, two );

-  d[0] = _mm512_permutexvar_epi64( s0, zero );
+  d[0] = _mm512_broadcastq_epi64(
+           _mm512_castsi512_si128( s0 ) );
+//  d[0] = _mm512_permutexvar_epi64( s0, m512_zero );
  d[1] = _mm512_permutexvar_epi64( s0, one  );
  d[2] = _mm512_permutexvar_epi64( s0, two  );
  d[3] = _mm512_permutexvar_epi64( s0, three  );
@@ -1020,8 +1013,9 @@ static inline void mm512_bswap32_intrlv80_8x64( void *dst, void *src )
  d[5] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, one   ) );
  d[6] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, two   ) );
  d[7] = _mm512_permutexvar_epi64( s0, _mm512_add_epi64( four, three ) );
-  d[8] = _mm512_permutexvar_epi64(
-           _mm512_castsi128_si512( s1 ), zero );
+  d[8] = _mm512_broadcastq_epi64(  s1 );
+//  d[8] = _mm512_permutexvar_epi64(
+//           _mm512_castsi128_si512( s1 ), m512_zero );
  d[9] = _mm512_permutexvar_epi64(
           _mm512_castsi128_si512( s1 ), one  );
 }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	789c8b70bc	v3.9.8.1	2019-10-01 14:17:36 -04:00
Jay D Dee	01550d94a2	v3.9.8	2019-09-26 22:37:26 -04:00
Jay D Dee	a042fb7612	v3.9.7	2019-08-03 10:39:54 -04:00
Jay D Dee	9d49e0be7a	v3.9.6.2	2019-07-30 10:16:43 -04:00
Jay D Dee	a51f59086b	v3.9.6.1	2019-07-18 19:46:57 -04:00