v3.7.10

v3.7.9
v3.7.8
2025-09-17 23:44:27 +00:00 · 2018-01-16 15:11:44 -05:00 · 2018-01-08 22:04:43 -05:00 · 2017-12-30 19:19:46 -05:00 · 2017-12-17 12:00:42 -05:00 · 2017-12-14 18:28:51 -05:00
150 changed files with 10987 additions and 2547 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -38,7 +38,6 @@ cpuminer_SOURCES = \
  algo/argon2/ar2/cores.c \
  algo/argon2/ar2/ar2-scrypt-jane.c \
  algo/argon2/ar2/blake2b.c \
-  algo/axiom.c \
  algo/blake/sph_blake.c \
  algo/blake/blake-hash-4way.c \
  algo/blake/blake-gate.c \
@@ -47,8 +46,10 @@ cpuminer_SOURCES = \
  algo/blake/sph_blake2b.c \
  algo/blake/blake2b.c \
  algo/blake/blake2s.c \
+  algo/blake/blakecoin-gate.c \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
+  algo/blake/blakecoin-4way.c \
  algo/blake/decred-gate.c \
  algo/blake/decred.c \
  algo/blake/decred-4way.c \
@@ -56,6 +57,7 @@ cpuminer_SOURCES = \
  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
  algo/bmw/sph_bmw.c \
+  algo/bmw/bmw-hash-4way.c \
  algo/bmw/bmw256.c \
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
@@ -63,10 +65,8 @@ cpuminer_SOURCES = \
  algo/cryptonight/cryptonight.c\
  algo/cubehash/sph_cubehash.c \
  algo/cubehash/sse2/cubehash_sse2.c\
-  algo/drop.c \
  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
-  algo/fresh.c \
  algo/gost/sph_gost.c \
  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
@@ -79,7 +79,6 @@ cpuminer_SOURCES = \
  algo/heavy/sph_hefty1.c \
  algo/heavy/heavy.c \
  algo/heavy/bastion.c \
-  algo/hmq1725.c \
  algo/hodl/aes.c \
  algo/hodl/hodl-gate.c \
  algo/hodl/hodl-wolf.c \
@@ -102,21 +101,27 @@ cpuminer_SOURCES = \
  algo/luffa/sse2/luffa_for_sse2.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
+  algo/lyra2/lyra2rev2-gate.c \
  algo/lyra2/lyra2rev2.c \
+  algo/lyra2/lyra2rev2-4way.c \
  algo/lyra2/lyra2re.c \
  algo/lyra2/lyra2z-gate.c \
  algo/lyra2/lyra2z.c \
  algo/lyra2/lyra2z-4way.c \
  algo/lyra2/lyra2z330.c \
+  algo/lyra2/lyra2h-gate.c \
+  algo/lyra2/lyra2h.c \
+  algo/lyra2/lyra2h-4way.c \
  algo/m7m.c \
-  algo/neoscrypt.c \
+  algo/neoscrypt/neoscrypt.c \
  algo/nist5/nist5-gate.c \
  algo/nist5/nist5-4way.c \
  algo/nist5/nist5.c \
+  algo/nist5/zr5.c \
  algo/pluck.c \
-  algo/polytimos/polytimos-gate.c \
-  algo/polytimos/polytimos.c \
+  algo/quark/quark-gate.c \
  algo/quark/quark.c \
+  algo/quark/quark-4way.c \
  algo/qubit/qubit.c \
  algo/qubit/deep.c \
  algo/ripemd/sph_ripemd.c \
@@ -127,7 +132,9 @@ cpuminer_SOURCES = \
  algo/sha/sha2.c \
  algo/sha/sha256t.c \
  algo/shabal/sph_shabal.c \
+  algo/shabal/shabal-hash-4way.c \
  algo/shavite/sph_shavite.c \
+  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite.c \
  algo/simd/sph_simd.c \
  algo/simd/sse2/nist.c \
@@ -136,41 +143,77 @@ cpuminer_SOURCES = \
  algo/skein/skein-hash-4way.c \
  algo/skein/skein.c \
  algo/skein/skein-4way.c \
-  algo/skein/skein-gate.c \  
+  algo/skein/skein-gate.c \
  algo/skein/skein2.c \
  algo/skein/skein2-4way.c \
  algo/skein/skein2-gate.c \
-  algo/skunk.c \
  algo/sm3/sm3.c \
+  algo/sm3/sm3-hash-4way.c \
  algo/tiger/sph_tiger.c \
-  algo/timetravel.c \
-  algo/timetravel10.c \
-  algo/tribus/tribus-gate.c \
-  algo/tribus/tribus.c \
-  algo/tribus/tribus-4way.c \
-  algo/veltor.c \
  algo/whirlpool/sph_whirlpool.c \
  algo/whirlpool/whirlpool-hash-4way.c \
  algo/whirlpool/whirlpool-gate.c \
  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
-  algo/x11/phi1612.c \
+  algo/x11/x11-gate.c \
  algo/x11/x11.c \
-  algo/x11/x11evo.c \
+  algo/x11/x11-4way.c \
+  algo/x11/x11gost-gate.c \
  algo/x11/x11gost.c \
+  algo/x11/x11gost-4way.c \
+  algo/x11/c11-gate.c \
  algo/x11/c11.c \
+  algo/x11/c11-4way.c \
+  algo/x11/tribus-gate.c \
+  algo/x11/tribus.c \
+  algo/x11/tribus-4way.c \
+  algo/x11/timetravel-gate.c \
+  algo/x11/timetravel.c \
+  algo/x11/timetravel-4way.c \
+  algo/x11/timetravel10-gate.c \
+  algo/x11/timetravel10.c \
+  algo/x11/timetravel10-4way.c \
+  algo/x11/fresh.c \
+  algo/x11/x11evo.c \
+  algo/x11/x11evo-4way.c \
+  algo/x11/x11evo-gate.c \
+  algo/x13/x13-gate.c \
  algo/x13/x13.c \
+  algo/x13/x13-4way.c \
+  algo/x13/x13sm3-gate.c \
  algo/x13/x13sm3.c \
+  algo/x13/x13sm3-4way.c \
+  algo/x13/phi1612-gate.c \
+  algo/x13/phi1612.c \
+  algo/x13/phi1612-4way.c \
+  algo/x13/skunk-gate.c \
+  algo/x13/skunk-4way.c \
+  algo/x13/skunk.c \
+  algo/x13/drop.c \
+  algo/x14/x14-gate.c \
  algo/x14/x14.c \
+  algo/x14/x14-4way.c \
+  algo/x14/veltor-gate.c \
+  algo/x14/veltor.c \
+  algo/x14/veltor-4way.c \
+  algo/x14/polytimos-gate.c \
+  algo/x14/polytimos.c \
+  algo/x14/polytimos-4way.c \
+  algo/x14/axiom.c \
+  algo/x15/x15-gate.c \
  algo/x15/x15.c \
+  algo/x15/x15-4way.c \
+  algo/x17/x17-gate.c \
  algo/x17/x17.c \
-  algo/xevan.c \
+  algo/x17/x17-4way.c \
+  algo/x17/xevan-gate.c \
+  algo/x17/xevan.c \
+  algo/x17/xevan-4way.c \
+  algo/x17/hmq1725.c \
  algo/yescrypt/yescrypt.c \
-  algo/yescrypt/sha256_Y.c\
-  algo/yescrypt/yescrypt-simd.c\
-  algo/zr5.c
-
+  algo/yescrypt/sha256_Y.c \
+  algo/yescrypt/yescrypt-simd.c

 disable_flags =

--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ Supported Algorithms
                          keccakc      Creative coin
                          lbry         LBC, LBRY Credits
                          luffa        Luffa
+                          lyra2h       Hppcoin
                          lyra2re      lyra2
                          lyra2rev2    lyra2v2, Vertcoin
                          lyra2z       Zcoin (XZC)
@@ -67,7 +68,7 @@ Supported Algorithms
                          timetravel10 Bitcore
                          tribus       Denarius (DNR)
                          vanilla      blake256r8vnl (VCash)
-                          veltor
+                          veltor       (VLT)
                          whirlpool
                          whirlpoolx
                          x11          Dash
@@ -80,6 +81,7 @@ Supported Algorithms
                          x17
                          xevan        Bitsend
                          yescrypt     Globalboost-Y (BSTY)
+                          yescryptr8   BitZeny (ZNY)
                          yescryptr16  Yenten (YTN)
                          zr5          Ziftr

@@ -95,13 +97,16 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
 Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
 performance.

+ARM CPUs are not supported.
+
 2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
 Centos are known to work and have all dependencies in their repositories.
 Others may work but may require more effort.
 64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

-3. Stratum pool, cpuminer-opt only supports stratum minning. Some algos
-may work wallet mining but there are no guarantees.
+macOS (OS X) is not supported.
+
+3. Stratum pool. Some algos may work with wallet mining using getwork.

 Errata
 ------
--- a/README.txt
+++ b/README.txt
@@ -17,17 +17,21 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.

-Exe name                  Compile opts         Arch name
+Exe name                Compile flags              Arch name

-cpuminer-sse2.exe         -march=core2         Core2   
-cpuminer-sse42.exe        -march=corei7        Nehalem
-cpuminer-aes-sse42.exe    -maes -msse4.2"      Westmere
-cpuminer-aes-avx.exe      -march=corei7-avx"   Sandybridge, Ivybridge
-cpuminer-aes-avx2.exe     "-march=core-avx2"   Haswell, Broadwell, Skylake, Kabylake
-cpuminer-4way.exe         "-march=core-avx2 -DFOUR_WAY"
+cpuminer-sse2.exe      "-march=core2"              Core2   
+cpuminer-sse42.exe     "-march=corei7"             Nehalem
+cpuminer-aes-sse42.exe "-maes -msse4.2"            Westmere
+cpuminer-avx.exe       "-march=corei7-avx"         Sandybridge, Ivybridge
+cpuminer-avx2.exe      "-march=core-avx2"          Haswell...
+cpuminer-avx-sha.exe   "-march=corei7-avx -msha"   Ryzen...
+cpuminer-4way.exe      "-march=core-avx2 -DFOUR_WAY"       same as avx2
+cpuminer-4way-sha.exe  "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha

 4way requires a CPU with AES and AVX2. It is still under development and
 only a few algos are supported. See change log in RELEASE_NOTES in source
 package for supported algos.

-There is no binary support available for SHA on AMD Ryzen CPUs.
+Ryzen CPUs perform better with AVX than AVX2, therefore an avx-sha build
+is provided. Four way still uses AVX2. 
+
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -27,8 +27,9 @@ Compile Instructions

 Requirements:

-Intel Core2 or newer, or AMD Steamroller or newer CPU.
-64 bit Linux or Windows operating system.
+Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
+supported.
+64 bit Linux or Windows operating system. Apple is not supported.

 Building on linux prerequisites:

@@ -164,6 +165,35 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.7.10
+
+4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10,
+   x11evo, blakecoin.
+Faster x13sm3 (hsr).
+Added share difficulty to accepted message.
+
+v3.7.9
+
+Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
+Additional 4way optimizations for X algos.
+New algo yescryptr8 for BitZeny, not to be confused with original
+yescrypt Globalboost-Y.
+
+v3.7.8
+
+Partial 4way optimization for most X algos including c11, xevan, phi, hsr.
+
+v3.7.7
+
+Fixed regression caused by 64 CPU support.
+Fixed lyra2h.
+
+v3.7.6
+
+Added lyra2h algo for Hppcoin.
+Added support for more than 64 CPUs.
+Optimized shavite512 with AES, improves x11 etc.
+
 v3.7.5

 New algo keccakc for Creative coin with 4way optimizations
@@ -171,7 +201,7 @@ New algo keccakc for Creative coin with 4way optimizations
 Rewrote some AVX/AVX2 code for more consistent implementation and some
 optimizing.

-Enhanced capabilities check to support 4way, mor eprecise reporting of
+Enhanced capabilities check to support 4way, more precise reporting of
 features (not all algos use SSE2), and better error messages when using
 an incompatible pre-built version (Windows users).

--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -138,6 +138,10 @@ void init_algo_gate( algo_gate_t* gate )
   gate->work_cmp_size           = STD_WORK_CMP_SIZE;
 }

+// Ignore warnings for not yet defined register functions
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
+
 // called by each thread that uses the gate
 bool register_algo_gate( int algo, algo_gate_t *gate )
 {
@@ -151,11 +155,6 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   switch (algo)
   {
-
-// Ignore warnings for not yet defined register fucntions
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
-
     case ALGO_ARGON2:       register_argon2_algo      ( gate ); break;
     case ALGO_AXIOM:        register_axiom_algo       ( gate ); break;
     case ALGO_BASTION:      register_bastion_algo     ( gate ); break;
@@ -180,6 +179,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_KECCAKC:      register_keccakc_algo     ( gate ); break;
     case ALGO_LBRY:         register_lbry_algo        ( gate ); break;
     case ALGO_LUFFA:        register_luffa_algo       ( gate ); break;
+     case ALGO_LYRA2H:       register_lyra2h_algo      ( gate ); break;
     case ALGO_LYRA2RE:      register_lyra2re_algo     ( gate ); break;
     case ALGO_LYRA2REV2:    register_lyra2rev2_algo   ( gate ); break;
     case ALGO_LYRA2Z:       register_lyra2z_algo      ( gate ); break;
@@ -211,7 +211,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_WHIRLPOOLX:   register_whirlpoolx_algo  ( gate ); break;
     case ALGO_X11:          register_x11_algo         ( gate ); break;
     case ALGO_X11EVO:       register_x11evo_algo      ( gate ); break;
-     case ALGO_X11GOST:      register_sib_algo         ( gate ); break;
+     case ALGO_X11GOST:      register_x11gost_algo     ( gate ); break;
     case ALGO_X13:          register_x13_algo         ( gate ); break;
     case ALGO_X13SM3:       register_x13sm3_algo      ( gate ); break;
     case ALGO_X14:          register_x14_algo         ( gate ); break;
@@ -219,12 +219,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_X17:          register_x17_algo         ( gate ); break;
     case ALGO_XEVAN:        register_xevan_algo       ( gate ); break;
     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
+     case ALGO_YESCRYPTR8:   register_yescryptr8_algo  ( gate ); break;
     case ALGO_YESCRYPTR16:  register_yescryptr16_algo ( gate ); break;
     case ALGO_ZR5:          register_zr5_algo         ( gate ); break;
-
-// restore warnings
-#pragma GCC diagnostic pop
-
    default:
        applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
        return false;
@@ -239,6 +236,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
  return true;
 }

+// restore warnings
+#pragma GCC diagnostic pop
+
 // override std defaults with jr2 defaults
 bool register_json_rpc2( algo_gate_t *gate )
 {
@@ -279,6 +279,7 @@ const char* const algo_alias_map[][2] =
 {
 //   alias                proper
  { "bitcore",           "timetravel10" },
+  { "bitzeny",           "yescryptr8"   },
  { "blake256r8",        "blakecoin"    },
  { "blake256r8vnl",     "vanilla"      },
  { "blake256r14",       "blake"        },
@@ -301,10 +302,9 @@ const char* const algo_alias_map[][2] =
 //  { "sia",               "blake2b"      },
  { "sib",               "x11gost"      },
  { "timetravel8",       "timetravel"   },
-  { "yes",               "yescrypt"     },
  { "ziftr",             "zr5"          },
  { "yenten",            "yescryptr16"  },
-  { "yescryptr8",        "yescrypt"     },
+  { "yescryptr8k",       "yescrypt"     },
  { "zcoin",             "lyra2z"       },
  { "zoin",              "lyra2z330"    },
  { NULL,                NULL           }   
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -1,107 +1,90 @@
 #include "blake-gate.h"
-#include "sph_blake.h"
+
+#if defined (__AVX__)
+
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>

-#if defined (BLAKE_4WAY)
+blake256r14_4way_context blake_ctx;

 void blakehash_4way(void *state, const void *input)
 {
-     uint32_t vhash[4*4] __attribute__ ((aligned (64)));
-     uint32_t hash0[4] __attribute__ ((aligned (32)));
-     uint32_t hash1[4] __attribute__ ((aligned (32)));
-     uint32_t hash2[4] __attribute__ ((aligned (32)));
-     uint32_t hash3[4] __attribute__ ((aligned (32)));
-     blake256_4way_context ctx;
-
-     blake256_4way_init( &ctx );
-     blake256_4way( &ctx, input, 16 );
-     blake256_4way_close( &ctx, vhash );
-
-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash1, 32 );
-     memcpy( state+96, hash1, 32 );
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256r14_4way_context ctx;
+     memcpy( &ctx, &blake_ctx, sizeof ctx );
+     blake256r14_4way( &ctx, input + (64<<2), 16 );
+     blake256r14_4way_close( &ctx, vhash );
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done )
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-   uint32_t hash[4*4] __attribute__ ((aligned (32)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
-//   uint32_t HTarget = ptarget[7];
-   uint32_t _ALIGN(32) endiandata[20];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;

-//   if (opt_benchmark)
-//      HTarget = 0x7f;
+   if (opt_benchmark)
+      HTarget = 0x7f;

   // we need big endian data...
-   swab32_array( endiandata, pdata, 20 );
+   swab32_array( edata, pdata, 20 );

-   mm_interleave_4x32( vdata, endiandata, endiandata, endiandata,
-                         endiandata, 640 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   blake256r14_4way_init( &blake_ctx );
+   blake256r14_4way( &blake_ctx, vdata, 64 );

   uint32_t *noncep = vdata + 76;   // 19*4
   do {
      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep,    n   );
-      be32enc( noncep +2, n+1 );
-      be32enc( noncep +4, n+2 );
-      be32enc( noncep +6, n+3 );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );

      blakehash_4way( hash, vdata );

-      if ( hash[7] == 0 )
+      if (  hash[7] <= HTarget && fulltest( hash, ptarget ) )
      {
-         if ( fulltest( hash, ptarget ) )
-         {
-             found[0] = true;
-             num_found++;
-             nonces[0] = n;
-             pdata[19] = n;
-         }
+          found[0] = true;
+          num_found++;
+          nonces[0] = n;
+          pdata[19] = n;
+          work_set_target_ratio( work, hash );
      }
-      if ( (hash+8)[7] == 0 ) 
+      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
      {
-         if ( fulltest( hash, ptarget ) ) 
-         {
-             found[1] = true;
-             num_found++;
-             nonces[1] = n+1;
-         }
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
      }
-      if ( (hash+16)[7] == 0 )
+      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
      {
-          if ( fulltest( hash, ptarget ) )
-          {
-              found[2] = true;
-              num_found++;
-              nonces[2] = n+2;
-          }
+           found[2] = true;
+           num_found++;
+           nonces[2] = n+2;
+           work_set_target_ratio( work, hash+16 );
      }
-      if ( (hash+24)[7] == 0 )
+      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
      {
-         if ( fulltest( hash, ptarget ) )
-         {
-              found[3] = true;
-              num_found++;
-              nonces[3] = n+3;
-         }
+           found[3] = true;
+           num_found++;
+           nonces[3] = n+3;
+           work_set_target_ratio( work, hash+24 );
      }
- 
      n += 4;
-      *hashes_done = n - first_nonce + 1;

   } while ( (num_found == 0) && (n < max_nonce) 
             && !work_restart[thr_id].restart );
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -17,7 +17,6 @@ bool register_blake_algo( algo_gate_t* gate )
  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_blake_4way;
  gate->hash      = (void*)&blakehash_4way;
-  four_way_not_tested();
 #else
  gate->scanhash  = (void*)&scanhash_blake;
  gate->hash      = (void*)&blakehash;
--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
@@ -36,7 +36,6 @@
 #include <string.h>
 #include <limits.h>

-//#include "sph_blake.h"
 #include "blake-hash-4way.h"

 #ifdef __cplusplus
@@ -98,18 +97,6 @@ static const unsigned sigma[16][16] = {
 	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
 };

-/*
-  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
- 14 10  4  8  9 15 13  6  1 12  0  2 11  7  5  3
- 11  8 12  0  5  2 15 13 10 14  3  6  7  1  9  4
-  7  9  3  1 13 12 11 14  2  6  5 10  4  0 15  8
-  9  0  5  7  2  4 10 15 14  1 11 12  6  8  3 13
-  2 12  6 10  0 11  8  3  4 13  7  5 15 14  1  9
- 12  5  1 15 14 13  4 10  0  7  6  3  9  2  8 11
- 13 11  7 14 12  1  3  9  5  0 15  4  8  6  2 10
-  6 15 14  9 11  3  0  8 12  2 13  7  1  4 10  5
- 10  2  8  4  7  6  1  5 15 11  9 14  3 12 13  0
-*/
 #endif

 #define Z00   0
@@ -504,14 +491,9 @@ do { \
 		(state)->T1 = T1; \
 	} while (0)

-//#define BLAKE32_ROUNDS 8
-#ifndef BLAKE32_ROUNDS
-#define BLAKE32_ROUNDS 14
-#endif
-
 #if SPH_COMPACT_BLAKE_32

-#define COMPRESS32_4WAY   do { \
+#define COMPRESS32_4WAY( rounds )   do { \
 	__m128i M[16]; \
 	__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
 	__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
@@ -524,18 +506,18 @@ do { \
 	V5 = H5; \
 	V6 = H6; \
 	V7 = H7; \
-        V8 = _mm_xor_si128( s0, _mmset_epi32( CS0, CS0, CS0, CS0 ) ); \
-        V9 = _mm_xor_si128( s1, _mmset_epi32( CS1, CS1, CS1, CS1 ) ); \
-        VA = _mm_xor_si128( s2, _mmset_epi32( CS2, CS2, CS2, CS2 ) ); \
-        VB = _mm_xor_si128( s3, _mmset_epi32( CS3, CS3, CS3, CS3 ) ); \
-        VC = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
-                            _mmset_epi32( CS4, CS4, CS4, CS4 ) ); \
-        VD = _mm_xor_si128( _mmset_epi32( T0, T0, T0, T0 ), \
-                            _mmset_epi32( CS5, CS5, CS5, CS5 ) ); \
-        VE = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ) \
-                          , _mmset_epi32( CS6, CS6, CS6, CS6 ) ); \
-        VF = _mm_xor_si128( _mmset_epi32( T1, T1, T1, T1 ), \
-                            _mmset_epi32( CS7, CS7, CS7, CS7 ) ); \
+        V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
+        V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
+        VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
+        VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
+        VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
+                            _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
+        VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
+                            _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
+        VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ) \
+                          , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
+        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
+                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
 	M[0x0] = mm_byteswap_32( *(buf +  0) ); \
 	M[0x1] = mm_byteswap_32( *(buf +  1) ); \
 	M[0x2] = mm_byteswap_32( *(buf +  2) ); \
@@ -552,7 +534,7 @@ do { \
 	M[0xD] = mm_byteswap_32( *(buf + 13) ); \
 	M[0xE] = mm_byteswap_32( *(buf + 14) ); \
 	M[0xF] = mm_byteswap_32( *(buf + 15) ); \
-	for (r = 0; r < BLAKE32_ROUNDS; r ++) \
+	for (r = 0; r < rounds; r ++) \
 		ROUND_S_4WAY(r); \
        H0 = _mm_xor_si128( _mm_xor_si128( \
                                   _mm_xor_si128( S0, V0 ), V8 ), H0 ); \
@@ -576,80 +558,70 @@ do { \

 // current impl

-#define COMPRESS32_4WAY   do { \
-	__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
-	__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
-	__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
-	__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
-	V0 = H0; \
-	V1 = H1; \
-	V2 = H2; \
-	V3 = H3; \
-	V4 = H4; \
-	V5 = H5; \
-	V6 = H6; \
-	V7 = H7; \
-        V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
-        V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
-        VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
-        VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
-        VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                            _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
-        VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                            _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
-        VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                            _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
-        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M0 = mm_byteswap_32( * buf ); \
-	M1 = mm_byteswap_32( *(buf+1) ); \
-	M2 = mm_byteswap_32( *(buf+2) ); \
-	M3 = mm_byteswap_32( *(buf+3) ); \
-	M4 = mm_byteswap_32( *(buf+4) ); \
-	M5 = mm_byteswap_32( *(buf+5) ); \
-	M6 = mm_byteswap_32( *(buf+6) ); \
-	M7 = mm_byteswap_32( *(buf+7) ); \
-	M8 = mm_byteswap_32( *(buf+8) ); \
-	M9 = mm_byteswap_32( *(buf+9) ); \
-	MA = mm_byteswap_32( *(buf+10) ); \
-	MB = mm_byteswap_32( *(buf+11) ); \
-	MC = mm_byteswap_32( *(buf+12) ); \
-	MD = mm_byteswap_32( *(buf+13) ); \
-	ME = mm_byteswap_32( *(buf+14) ); \
-	MF = mm_byteswap_32( *(buf+15) ); \
-	ROUND_S_4WAY(0); \
-	ROUND_S_4WAY(1); \
-	ROUND_S_4WAY(2); \
-	ROUND_S_4WAY(3); \
-	ROUND_S_4WAY(4); \
-	ROUND_S_4WAY(5); \
-	ROUND_S_4WAY(6); \
-	ROUND_S_4WAY(7); \
-	if (BLAKE32_ROUNDS == 14) { \
-	ROUND_S_4WAY(8); \
-	ROUND_S_4WAY(9); \
-	ROUND_S_4WAY(0); \
-	ROUND_S_4WAY(1); \
-	ROUND_S_4WAY(2); \
-	ROUND_S_4WAY(3); \
-	} \
-        H0 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
-        H1 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
-        H2 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
-        H3 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
-        H4 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
-        H5 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
-        H6 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
-        H7 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
-	} while (0)
+#define COMPRESS32_4WAY( rounds ) \
+do { \
+   __m128i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m128i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m128i V8, V9, VA, VB, VC, VD, VE, VF; \
+   V0 = H0; \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
+   V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
+   VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
+   VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
+   M0 = mm_byteswap_32( * buf ); \
+   M1 = mm_byteswap_32( *(buf+1) ); \
+   M2 = mm_byteswap_32( *(buf+2) ); \
+   M3 = mm_byteswap_32( *(buf+3) ); \
+   M4 = mm_byteswap_32( *(buf+4) ); \
+   M5 = mm_byteswap_32( *(buf+5) ); \
+   M6 = mm_byteswap_32( *(buf+6) ); \
+   M7 = mm_byteswap_32( *(buf+7) ); \
+   M8 = mm_byteswap_32( *(buf+8) ); \
+   M9 = mm_byteswap_32( *(buf+9) ); \
+   MA = mm_byteswap_32( *(buf+10) ); \
+   MB = mm_byteswap_32( *(buf+11) ); \
+   MC = mm_byteswap_32( *(buf+12) ); \
+   MD = mm_byteswap_32( *(buf+13) ); \
+   ME = mm_byteswap_32( *(buf+14) ); \
+   MF = mm_byteswap_32( *(buf+15) ); \
+   ROUND_S_4WAY(0); \
+   ROUND_S_4WAY(1); \
+   ROUND_S_4WAY(2); \
+   ROUND_S_4WAY(3); \
+   ROUND_S_4WAY(4); \
+   ROUND_S_4WAY(5); \
+   ROUND_S_4WAY(6); \
+   ROUND_S_4WAY(7); \
+   if (rounds == 14) \
+   { \
+      ROUND_S_4WAY(8); \
+      ROUND_S_4WAY(9); \
+      ROUND_S_4WAY(0); \
+      ROUND_S_4WAY(1); \
+      ROUND_S_4WAY(2); \
+      ROUND_S_4WAY(3); \
+   } \
+   H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
+   H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
+   H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
+   H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
+   H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
+   H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
+   H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
+   H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
+} while (0)

 #endif

@@ -710,18 +682,18 @@ do { \
 	V5 = H5; \
 	V6 = H6; \
 	V7 = H7; \
-        V8 = _mm256_xor_si256( S0, _mm256_set_epi64( CB0, CB0, CB0, CB0 ) ); \
-        V9 = _mm256_xor_si256( S1, _mm256_set_epi64( CB1, CB1, CB1, CB1 ) ); \
-        VA = _mm256_xor_si256( S2, _mm256_set_epi64( CB2, CB2, CB2, CB2 ) ); \
-        VB = _mm256_xor_si256( S3, _mm256_set_epi64( CB3, CB3, CB3, CB3 ) ); \
-        VC = _mm256_xor_si128( _mm256_set_epi64( T0, T0, T0, T0 ), \
-                               _mm256_set_epi64( CB4, CB4, CB4, CB4 ) ); \
-        VD = _mm256_xor_si256( _mm256_set_epi64( T0, T0, T0, T0 ), \
-                               _mm256_set_epi64( CB5, CB5, CB5, CB5 ) ); \
-        VE = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
-                               _mm256_set256_epi64( CB6, CB6, CB6, CB6 ) ); \
-        VF = _mm256_xor_si256( _mm256_set_epi64( T1, T1, T1, T1 ), \
-                              _mm256_set256_epi64( CB7, CB7, CB7, CB7 ) ); \
+        V8 = _mm256_xor_si256( S0, _mm256_set_epi64x( CB0, CB0, CB0, CB0 ) ); \
+        V9 = _mm256_xor_si256( S1, _mm256_set_epi64x( CB1, CB1, CB1, CB1 ) ); \
+        VA = _mm256_xor_si256( S2, _mm256_set_epi64x( CB2, CB2, CB2, CB2 ) ); \
+        VB = _mm256_xor_si256( S3, _mm256_set_epi64x( CB3, CB3, CB3, CB3 ) ); \
+        VC = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
+                               _mm256_set_epi64x( CB4, CB4, CB4, CB4 ) ); \
+        VD = _mm256_xor_si256( _mm256_set_epi64x( T0, T0, T0, T0 ), \
+                               _mm256_set_epi64x( CB5, CB5, CB5, CB5 ) ); \
+        VE = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
+                               _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
+        VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
+                               _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
 	M[0x0] = mm256_byteswap_64( *(buf+0) ); \
 	M[0x1] = mm256_byteswap_64( *(buf+1) ); \
 	M[0x2] = mm256_byteswap_64( *(buf+2) ); \
@@ -845,15 +817,16 @@ static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };

 static void
 blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
-                   const sph_u32 *salt)
+                   const sph_u32 *salt, int rounds )
 {
-        int i;
-        for ( i = 0; i < 8; i++ )
-           sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] );
-        for ( i = 0; i < 4; i++ )
-           sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] );
-	sc->T0 = sc->T1 = 0;
-	sc->ptr = 0;
+   int i;
+   for ( i = 0; i < 8; i++ )
+      sc->H[i] = _mm_set1_epi32( iv[i] );
+   for ( i = 0; i < 4; i++ )
+      sc->S[i] = _mm_set1_epi32( salt[i] );
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+   sc->rounds = rounds;
 }

 static void
@@ -867,7 +840,6 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )

 	buf = sc->buf;
 	ptr = sc->ptr;
-
 	if ( len < buf_size - ptr )
        {
 		memcpy_128( buf + (ptr>>2), vdata, len>>2 );
@@ -892,7 +864,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
           {
 		if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
 			T1 = SPH_T32(T1 + 1);
-		COMPRESS32_4WAY;
+                COMPRESS32_4WAY( sc->rounds );
 		ptr = 0;
 	   }
 	}
@@ -915,48 +887,44 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   unsigned z = 0x80 >> n;
-   unsigned zz = ((ub & -z) | z) & 0xFF;
-   u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
+   u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

   if ( ptr == 0 )
   {
-	sc->T0 = SPH_C32(0xFFFFFE00);
-	sc->T1 = SPH_C32(0xFFFFFFFF);
+	sc->T0 = SPH_C32(0xFFFFFE00UL);
+	sc->T1 = SPH_C32(0xFFFFFFFFUL);
   }
   else if ( sc->T0 == 0 )
   {
-	sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
+	sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
 	sc->T1 = SPH_T32(sc->T1 - 1);
   } 
   else
 	sc->T0 -= 512 - bit_len;

-   if ( ptr <= 48 )
+   if ( ptr <= 52 )
   {
-       memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
+       memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
       if (out_size_w32 == 8)
           u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
-                                    _mm_set_epi32( 0x010000000, 0x01000000,
-                                                   0x010000000, 0x01000000 ) );
-       *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
-       *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
+                                        _mm_set1_epi32( 0x01000000UL ) );
+       *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
+       *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
       blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
   }
   else
   {
 	memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
 	blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
-	sc->T0 = SPH_C32(0xFFFFFE00);
-	sc->T1 = SPH_C32(0xFFFFFFFF);
+	sc->T0 = SPH_C32(0xFFFFFE00UL);
+	sc->T1 = SPH_C32(0xFFFFFFFFUL);
 	memset_zero_128( u.buf, 56>>2 );
       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
-                                         0x010000000, 0x01000000 );
-        *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
-        *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
+           u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
+        *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
+        *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
 	blake32_4way( sc, u.buf, 64 );
   }
   out = (__m128i*)dst;
@@ -974,9 +942,9 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
 {
        int i;
        for ( i = 0; i < 8; i++ )
-           sc->H[i] = _mm256_set_epi64x( iv[i], iv[i], iv[i], iv[i] );
+           sc->H[i] = _mm256_set1_epi64x( iv[i] );
        for ( i = 0; i < 4; i++ )
-           sc->S[i] = _mm256_set_epi64x( salt[i], salt[i], salt[i], salt[i] );
+           sc->S[i] = _mm256_set1_epi64x( salt[i] );
        sc->T0 = sc->T1 = 0;
        sc->ptr = 0;
 }
@@ -1048,12 +1016,12 @@ blake64_4way_close( blake_4way_big_context *sc,
   th = sc->T1;
   if (ptr == 0 )
   {
-	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
-	sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
+	sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
   }
   else if ( sc->T0 == 0 )
   {
-	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
+	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
 	sc->T1 = SPH_T64(sc->T1 - 1);
   } 
   else
@@ -1065,10 +1033,7 @@ blake64_4way_close( blake_4way_big_context *sc,
       memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
          u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
-                                    _mm256_set_epi64x( 0x0100000000000000,
-                                                       0x0100000000000000,
-                                                       0x0100000000000000,
-                                                       0x0100000000000000 ) );
+                                 _mm256_set1_epi64x( 0x0100000000000000ULL ) );
       *(u.buf+(112>>3)) = mm256_byteswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
       *(u.buf+(120>>3)) = mm256_byteswap_64(
@@ -1081,15 +1046,11 @@ blake64_4way_close( blake_4way_big_context *sc,
       memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
-       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
-       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
+       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( u.buf, 112>>3 ); 
       if ( out_size_w64 == 8 )
-           u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
-                                              0x0100000000000000,
-                                              0x0100000000000000,
-                                              0x0100000000000000 );
-
+           u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
       *(u.buf+(112>>3)) = mm256_byteswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
       *(u.buf+(120>>3)) = mm256_byteswap_64(
@@ -1104,10 +1065,11 @@ blake64_4way_close( blake_4way_big_context *sc,

 #endif

+// default 14 rounds, backward compatibility
 void
 blake256_4way_init(void *cc)
 {
-	blake32_4way_init(cc, IV256, salt_zero_small);
+   blake32_4way_init( cc, IV256, salt_zero_small, 14 );
 }

 void
@@ -1119,13 +1081,43 @@ blake256_4way(void *cc, const void *data, size_t len)
 void
 blake256_4way_close(void *cc, void *dst)
 {
-	blake256_4way_addbits_and_close(cc, 0, 0, dst);
+        blake32_4way_close(cc, 0, 0, dst, 8);
+}
+
+// 14 rounds blake, decred
+void blake256r14_4way_init(void *cc)
+{
+   blake32_4way_init( cc, IV256, salt_zero_small, 14 );
 }

 void
-blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+blake256r14_4way(void *cc, const void *data, size_t len)
 {
-	blake32_4way_close(cc, ub, n, dst, 8);
+   blake32_4way(cc, data, len);
+}
+
+void
+blake256r14_4way_close(void *cc, void *dst)
+{
+   blake32_4way_close(cc, 0, 0, dst, 8);
+}
+
+// 8 rounds blakecoin, vanilla
+void blake256r8_4way_init(void *cc)
+{
+   blake32_4way_init( cc, IV256, salt_zero_small, 8 );
+}
+
+void
+blake256r8_4way(void *cc, const void *data, size_t len)
+{
+   blake32_4way(cc, data, len);
+}
+
+void
+blake256r8_4way_close(void *cc, void *dst)
+{
+   blake32_4way_close(cc, 0, 0, dst, 8);
 }

 #if defined (__AVX2__)
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -35,7 +35,9 @@
 */

 #ifndef __BLAKE_HASH_4WAY__
-#define __BLAKE_HASH_4WAY___
+#define __BLAKE_HASH_4WAY__
+
+#ifdef __AVX__

 #ifdef __cplusplus
 extern "C"{
@@ -45,38 +47,36 @@ extern "C"{
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"

-/**
- * Output size (in bits) for BLAKE-256.
- */
 #define SPH_SIZE_blake256   256

-#if SPH_64
-
-/**
- * Output size (in bits) for BLAKE-512.
- */
 #define SPH_SIZE_blake512   512

-#endif
-
-#ifdef __AVX__
 typedef struct {
-        __m128i buf[16] __attribute__ ((aligned (64)));
-        __m128i H[8];
-        __m128i S[4];    
-        size_t ptr;
-	sph_u32 T0, T1;
+   __m128i buf[16] __attribute__ ((aligned (64)));
+   __m128i H[8];
+   __m128i S[4];    
+   size_t ptr;
+   sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_4way_small_context;

+// Default 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
-
 void blake256_4way_init(void *cc);
 void blake256_4way(void *cc, const void *data, size_t len);
 void blake256_4way_close(void *cc, void *dst);
-void blake256_4way_addbits_and_close(
-        void *cc, unsigned ub, unsigned n, void *dst);

-#endif
+// 14 rounds, blake, decred
+typedef blake_4way_small_context blake256r14_4way_context;
+void blake256r14_4way_init(void *cc);
+void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_4way_small_context blake256r8_4way_context;
+void blake256r8_4way_init(void *cc);
+void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__

@@ -103,3 +103,5 @@ void blake512_4way_addbits_and_close(
 #endif

 #endif
+
+#endif
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -0,0 +1,106 @@
+#include "blakecoin-gate.h"
+
+#if defined (__AVX__)
+
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+blake256r8_4way_context blakecoin_ctx;
+
+void blakecoin_4way_hash(void *state, const void *input)
+{
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256r8_4way_context ctx;
+     memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
+     blake256r8_4way( &ctx, input + (64<<2), 16 );
+     blake256r8_4way_close( &ctx, vhash );
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+}
+
+int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+
+   if (opt_benchmark)
+      HTarget = 0x7f;
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   blake256r8_4way_init( &blakecoin_ctx );
+   blake256r8_4way( &blakecoin_ctx, vdata, 64 );
+
+   uint32_t *noncep = vdata + 76;   // 19*4
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+
+      blakecoin_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      if (  hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      {
+          found[0] = true;
+          num_found++;
+          nonces[0] = n;
+          work_set_target_ratio( work, hash );
+      }
+      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) ) 
+      {
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
+      }
+      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
+      {
+           found[2] = true;
+           num_found++;
+           nonces[2] = n+2;
+           work_set_target_ratio( work, hash+16 );
+      }
+      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
+      {
+           found[3] = true;
+           num_found++;
+           nonces[3] = n+3;
+           work_set_target_ratio( work, hash+24 );
+      }
+      n += 4;
+
+   } while ( (num_found == 0) && (n < max_nonce) 
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+
+   // workaround to prevent flood of hash reports when nonce range exhausted
+   // and thread is spinning waiting for new work
+   if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
+   {
+      *hashes_done = 0;
+      sleep(1);
+   }
+
+   return num_found;
+}
+
+#endif
+
--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -0,0 +1,71 @@
+#include "blakecoin-gate.h"
+#include <memory.h>
+
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blakecoin_get_max64 ()
+{
+  return 0x7ffffLL;
+//  return 0x3fffffLL;
+}
+
+// Blakecoin 4 way hashes so fast it runs out of nonces.
+// This is an attempt to solve this but the result may be
+// to rehash old nonces until new work is received.
+void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
+                     uint32_t *end_nonce_ptr, bool clean_job )
+{
+   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
+// 
+//   if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
+//      algo_gate.stratum_gen_work( &stratum, g_work );
+
+   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) 
+   || ( *nonceptr >= *end_nonce_ptr )
+   || (  work->job_id != g_work->job_id ) && clean_job  )
+/*
+   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
+      && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
+         || ( work->job_id != g_work->job_id ) ) )
+*/   
+   {
+     work_free( work );
+     work_copy( work, g_work );
+     *nonceptr = 0xffffffffU / opt_n_threads * thr_id;
+     if ( opt_randomize )
+       *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
+     *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; 
+// try incrementing the xnonce to change the data
+//     for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
+   }
+   else
+       ++(*nonceptr);
+}
+
+
+// vanilla uses default gen merkle root, otherwise identical to blakecoin
+bool register_vanilla_algo( algo_gate_t* gate )
+{
+#if defined(BLAKECOIN_4WAY)
+//  four_way_not_tested();
+  gate->optimizations = FOUR_WAY_OPT;
+  gate->scanhash  = (void*)&scanhash_blakecoin_4way;
+  gate->hash      = (void*)&blakecoin_4way_hash;
+//  gate->get_new_work = (void*)&bc4w_get_new_work;
+//  blakecoin_4way_init( &blake_4way_init_ctx );
+#else
+  gate->scanhash = (void*)&scanhash_blakecoin;
+  gate->hash     = (void*)&blakecoinhash;
+//  blakecoin_init( &blake_init_ctx );
+#endif
+  gate->optimizations = AVX2_OPT | FOUR_WAY_OPT;
+  gate->get_max64 = (void*)&blakecoin_get_max64;
+  return true;
+}
+
+bool register_blakecoin_algo( algo_gate_t* gate )
+{
+  register_vanilla_algo( gate );
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  return true;
+}
+
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -0,0 +1,21 @@
+#ifndef __BLAKECOIN_GATE_H__
+#define __BLAKECOIN_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define BLAKECOIN_4WAY
+#endif
+
+#if defined (BLAKECOIN_4WAY)
+void blakecoin_4way_hash(void *state, const void *input);
+int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
+void blakecoinhash( void *state, const void *input );
+int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "blakecoin-gate.h"
 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"

@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
 SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
 }
 */
-
+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blakecoin_get_max64 ()
 {
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  return true;
 }
-
+*/
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -1,5 +1,4 @@
 #include "decred-gate.h"
-#include "sph_blake.h"
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
@@ -9,110 +8,58 @@
 #if defined (DECRED_4WAY)

 static __thread blake256_4way_context blake_mid;
-static __thread bool ctx_midstate_done = false;

 void decred_hash_4way( void *state, const void *input )
 {
-     uint32_t vhash[4*4] __attribute__ ((aligned (64)));
-     uint32_t hash0[4] __attribute__ ((aligned (32)));
-     uint32_t hash1[4] __attribute__ ((aligned (32)));
-     uint32_t hash2[4] __attribute__ ((aligned (32)));
-     uint32_t hash3[4] __attribute__ ((aligned (32)));
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     uint32_t hash0[8] __attribute__ ((aligned (32)));
+     uint32_t hash1[8] __attribute__ ((aligned (32)));
+     uint32_t hash2[8] __attribute__ ((aligned (32)));
+     uint32_t hash3[8] __attribute__ ((aligned (32)));
+     void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
+     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
     blake256_4way_context ctx __attribute__ ((aligned (64)));

-     sph_blake256_context ctx2 __attribute__ ((aligned (64)));
-     uint32_t hash[16] __attribute__ ((aligned (64)));
-     uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
-     mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
-
-     void *tail = input + DECRED_MIDSTATE_LEN;
-     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
-//     #define MIDSTATE_LEN 128
-/*
-        uint8_t *ending = (uint8_t*) input;
-        ending += MIDSTATE_LEN;
-
-     if ( !ctx_midstate_done )
-     {
-          blake256_4way_init( &blake_mid );
-          blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
-          ctx_midstate_done = true;
-     }
     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
-
     blake256_4way( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
-*/
-
-
-     sph_blake256_init( &ctx2 );
-     sph_blake256( &ctx2, sin0, 180 );
-     sph_blake256_close( &ctx2, hash );
-
-     blake256_4way_init( &ctx );
-     blake256_4way( &ctx, input, 180 );
-     blake256_4way_close( &ctx, vhash );
-
-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-/*
-        for ( int i = 0; i < 8; i++ )
-          if ( hash[i] != hash0[i] )
-            printf(" hash mismatch, i = %u\n",i);
-
-printf("hash:  %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
-                             *(hash+2), *(hash+3) );
-printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
-                             *(hash0+2), *(hash0+3) );
-printf("\n");
-*/
-
-//     memcpy( state,    hash0, 32 );
-//     memcpy( state+32, hash1, 32 );
-//     memcpy( state+64, hash1, 32 );
-//     memcpy( state+96, hash1, 32 );
-
-     memcpy( state, hash, 32 );
-
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done)
 {
-   uint32_t vdata[45*4] __attribute__ ((aligned (64)));
-   uint32_t hash[4*4] __attribute__ ((aligned (32)));
-        uint32_t _ALIGN(64) endiandata[48];
-//        uint32_t _ALIGN(64) hash32[8];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
-        uint32_t n = first_nonce;
-        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
+   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t _ALIGN(64) edata[48];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
+   uint32_t n = first_nonce;
+   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;

-//        #define DCR_NONCE_OFT32 35
-
-        ctx_midstate_done = false;
-
-//        memcpy(endiandata, pdata, 180);
+   // copy to buffer guaranteed to be aligned.
+   memcpy( edata, pdata, 180 );

   // use the old way until  new way updated for size.
-   mm_interleave_4x32x( vdata, pdata, pdata, pdata, pdata, 180*8 );
+   mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
+
+   blake256_4way_init( &blake_mid );
+   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );

   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
   do {
      found[0] = found[1] = found[2] = found[3] = false;
      * noncep    = n;
-      *(noncep+2) = n+1;
-      *(noncep+4) = n+2;
-      *(noncep+6) = n+3;
+      *(noncep+1) = n+1;
+      *(noncep+2) = n+2;
+      *(noncep+3) = n+3;

      decred_hash_4way( hash, vdata );

-//                endiandata[DCR_NONCE_OFT32] = n;
-//                decred_hash(hash32, endiandata);
-
      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
      {
          work_set_target_ratio( work, hash );
@@ -121,28 +68,28 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
          nonces[0] = n;
          pdata[DECRED_NONCE_INDEX] = n;
      }
-/*      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
+      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
      {
          work_set_target_ratio( work, hash+8 );
          found[1] = true;
          num_found++;
-          nonces[1] = n;
+          nonces[1] = n+1;
      }
      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
      {
          work_set_target_ratio( work, hash+16 );
          found[2] = true;
          num_found++;
-          nonces[2] = n;
+          nonces[2] = n+2;
      }
+
      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
      {
          work_set_target_ratio( work, hash+24 );
          found[3] = true;
          num_found++;
-          nonces[3] = n;
+          nonces[3] = n+3;
      }
-*/
      n += 4;
  } while ( (num_found == 0) && (n < max_nonce) 
            && !work_restart[thr_id].restart );
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -1,4 +1,7 @@
 #include "pentablake-gate.h"
+
+#ifdef __AVX2__
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,8 +12,6 @@

 //#define DEBUG_ALGO

-#ifdef PENTABLAKE_4WAY
-
 extern void pentablakehash_4way( void *output, const void *input )
 {
 	unsigned char _ALIGN(32) hash[128];
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(FOUR_WAY) && defined(__AVX2__)
  #define PENTABLAKE_4WAY
 #endif

--- a/algo/blake/sph_blake.c
+++ b/algo/blake/sph_blake.c
@@ -872,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
 	} else {
 		sc->T0 -= 512 - bit_len;
 	}
+
 	if (bit_len <= 446) {
 		memset(u.buf + ptr + 1, 0, 55 - ptr);
 		if (out_size_w32 == 8)
--- a/algo/bmw/bmw-hash-4way.c
+++ b/algo/bmw/bmw-hash-4way.c
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -0,0 +1,95 @@
+/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * BMW interface. BMW (aka "Blue Midnight Wish") is a family of
+ * functions which differ by their output size; this implementation
+ * defines BMW for output sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_bmw.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef BMW_HASH_H__
+#define BMW_HASH_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#ifdef __AVX2__
+
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_bmw256   256
+
+#define SPH_SIZE_bmw512   512
+
+typedef struct {
+   __m128i buf[64];
+   __m128i H[16];
+   size_t ptr;
+   sph_u32 bit_count;  // assume bit_count fits in 32 bits
+} bmw_4way_small_context;
+
+typedef bmw_4way_small_context bmw256_4way_context;
+
+typedef struct {
+   __m256i buf[16];
+   __m256i H[16];
+   size_t ptr;
+   sph_u64 bit_count;
+} bmw_4way_big_context;
+
+typedef bmw_4way_big_context bmw512_4way_context;
+
+void bmw256_4way_init(void *cc);
+
+void bmw256_4way(void *cc, const void *data, size_t len);
+
+void bmw256_4way_close(void *cc, void *dst);
+
+void bmw256_4way_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+void bmw512_4way_init(void *cc);
+
+void bmw512_4way(void *cc, const void *data, size_t len);
+
+void bmw512_4way_close(void *cc, void *dst);
+
+void bmw512_4way_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -96,34 +96,18 @@ extern "C"{
 do { \
   __m256i cc = _mm256_set_epi64x( c, c, c, c ); \
    x3 = mm256_not( x3 ); \
-    x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
    x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
-    x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
    x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
-    x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
+    x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
    x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
    x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
    x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
    x2 = _mm256_xor_si256( x2, tmp ); \
 } while (0)

-/*
-#define Sb(x0, x1, x2, x3, c)   do { \
-		x3 = ~x3; \
-		x0 ^= (c) & ~x2; \
-		tmp = (c) ^ (x0 & x1); \
-		x0 ^= x2 & x3; \
-		x3 ^= ~x1 & x2; \
-		x1 ^= x0 & x2; \
-		x2 ^= x0 & ~x3; \
-		x0 ^= x1 | x3; \
-		x3 ^= x1 & x2; \
-		x1 ^= tmp & x0; \
-		x2 ^= tmp; \
-	} while (0)
-*/
-
 #define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
 do { \
    x4 = _mm256_xor_si256( x4, x1 ); \
@@ -136,20 +120,6 @@ do { \
    x3 = _mm256_xor_si256( x3, x4 ); \
 } while (0)

-
-/*
-#define Lb(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
-		x4 ^= x1; \
-		x5 ^= x2; \
-		x6 ^= x3 ^ x0; \
-		x7 ^= x0; \
-		x0 ^= x5; \
-		x1 ^= x6; \
-		x2 ^= x7 ^ x4; \
-		x3 ^= x4; \
-	} while (0)
-*/
-
 #if SPH_JH_64

 static const sph_u64 C[] = {
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
-    __m256i mask0, mask1;
-    __m256i* vh = (__m256i*)vhash;
-    __m256i* vh0 = (__m256i*)vhash0;
-    __m256i* vh1 = (__m256i*)vhash1;
+    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
+    __m256i* vh  = (__m256i*)vhash;
+    __m256i* vhA = (__m256i*)vhashA;
+    __m256i* vhB = (__m256i*)vhashB;
+    __m256i vh_mask;

    blake512_4way_context  ctx_blake;
    hashState_groestl      ctx_groestl;
@@ -40,122 +40,69 @@ void jha_hash_4way( void *out, const void *input )
    keccak512_4way( &ctx_keccak, input, 80 );
    keccak512_4way_close( &ctx_keccak, vhash );

-//    memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
-//    keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
-//    keccak512_4way_close( &ctx_keccak, vhash );
-
    // Heavy & Light Pair Loop
    for ( int round = 0; round < 3; round++ )
    {
-//       memset_zero_256( vh0, 20 );
-//       memset_zero_256( vh1, 20 );
-
-      // positive logic, if maski select vhi
-      // going from bit to mask reverses logic such that if the test bit is set
-      // zero will be put in mask0, meaning don't take vh0. mask1 is
-      // inverted so 1 will be put in mask1 meaning take it.
-      mask0 = mm256_negate_64(
-                     _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
-      mask1 = mm256_not( mask0 );
-
-//       mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
-//                     _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
-
-       // groestl (serial) v skein
+       vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
+               vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
-                     (char*)hash0, 512 );
-
+                                               (char*)hash0, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash1,
-                                          (char*)hash1, 512 );
-
+                                               (char*)hash1, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash2,
-                                          (char*)hash2, 512 );
+                                               (char*)hash2, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
-                                          (char*)hash3, 512 );
-
-       mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
-
-       // skein
+                                               (char*)hash3, 512 );
+       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

       skein512_4way_init( &ctx_skein );
       skein512_4way( &ctx_skein, vhash, 64 );
-       skein512_4way_close( &ctx_skein, vhash1 );
+       skein512_4way_close( &ctx_skein, vhashB );

-       // merge vectored hash
       for ( int i = 0; i < 8; i++ )
-       {
-          vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-                                   _mm256_and_si256( vh1[i], mask1 ) );
-/*
-          vha256[i] = _mm256_maskload_epi64( 
-                                      vhasha + i*4, mm256_not( mask ) );
-          vhb256[i] = _mm256_maskload_epi64(
-                                      vhashb + i*4, mask );
-          vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
-*/
-       }
-
-       // blake v jh
+          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

       blake512_4way_init( &ctx_blake );
       blake512_4way( &ctx_blake, vhash, 64 );
-       blake512_4way_close( &ctx_blake, vhash0 );
+       blake512_4way_close( &ctx_blake, vhashA );

       jh512_4way_init( &ctx_jh );
       jh512_4way( &ctx_jh, vhash, 64 );
-       jh512_4way_close( &ctx_jh, vhash1 );
+       jh512_4way_close( &ctx_jh, vhashB );

-       // merge hash
       for ( int i = 0; i < 8; i++ )
-       {
-          vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-                                   _mm256_and_si256( vh1[i], mask1 ) );
-/*
-          vha256[i] = _mm256_maskload_epi64(
-                                      vhasha + i*4, mm256_not( mask ) );
-          vhb256[i] = _mm256_maskload_epi64(
-                                      vhashb + i*4, mask );
-          vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
-*/
-       }
+          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    }

    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
-
-//    memcpy( output,       hash0, 32 );
-//    memcpy( output+32,    hash1, 32 );
-//    memcpy( output+64,    hash2, 32 );
-//    memcpy( output+96,    hash3, 32 );
-
 }

 int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done )
 {
-     uint32_t hash[8*4] __attribute__ ((aligned (64)));
-     uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t first_nonce = pdata[19];
-	const uint32_t Htarg = ptarget[7];
-	uint32_t n = pdata[19];
-     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
-     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t n = pdata[19];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+   uint32_t *noncep1 = vdata + 75;
+   uint32_t *noncep2 = vdata + 77;
+   uint32_t *noncep3 = vdata + 79;

-	uint64_t htmax[] = {
+   uint64_t htmax[] = {
 		0,
 		0xF,
 		0xFF,
@@ -163,7 +110,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
 		0xFFFF,
 		0x10000000
 	};
-	uint32_t masks[] = {
+   uint32_t masks[] = {
 		0xFFFFFFFF,
 		0xFFFFFFF0,
 		0xFFFFFF00,
@@ -172,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
 		0
 	};

-   // we need bigendian data...
   for ( int i=0; i < 19; i++ )
      be32enc( &endiandata[i], pdata[i] );

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

-   // precalc midstate for keccak
-//   keccak512_4way_init( &jha_kec_mid );
-//   keccak512_4way( &jha_kec_mid, vdata, 64 );
-
   for ( int m = 0; m < 6; m++ )
   {
      if ( Htarg <= htmax[m] )
@@ -196,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
              be32enc( noncep3, n+3 );

              jha_hash_4way( hash, vdata );
-
              pdata[19] = n;

              if ( ( !(hash[7] & mask) )
@@ -234,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
              n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
                     && !work_restart[thr_id].restart );
-
         break;
      }
   }
-
   *hashes_done = n - first_nonce + 1;
   return num_found;
 }
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -0,0 +1,128 @@
+#include "lyra2h-gate.h"
+
+#ifdef LYRA2H_4WAY
+
+#include <memory.h>
+#include <mm_malloc.h>
+#include "lyra2.h"
+#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake-hash-4way.h"
+
+__thread uint64_t* lyra2h_4way_matrix;
+
+bool lyra2h_4way_thread_init()   // allocates the calling thread's lyra2h_4way_matrix (a __thread pointer)
+{
+ return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );  // 64-byte aligned; false when _mm_malloc returns NULL
+}
+
+static __thread blake256_4way_context l2h_4way_blake_mid;
+
+void lyra2h_4way_midstate( const void* input )   // caches a partial blake256 state in the thread-local l2h_4way_blake_mid
+{
+       blake256_4way_init( &l2h_4way_blake_mid );
+       blake256_4way( &l2h_4way_blake_mid, input, 64 );   // length 64; lyra2h_4way_hash resumes at input + 64*4
+}
+
+void lyra2h_4way_hash( void *state, const void *input )
+{    // Writes four 32-byte hashes to state; requires lyra2h_4way_midstate to have run on the same input first.
+     uint32_t hash0[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));   // interleaved blake output for all four lanes
+     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
+
+     memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );   // start from the cached midstate
+     blake256_4way( &ctx_blake, input + (64*4), 16 );   // continue past the part the midstate already covered
+     blake256_4way_close( &ctx_blake, vhash );
+
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );   // one 256-bit digest per lane
+
+     LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );   // lanes run one after another,
+     LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );   // each in place on its own hash,
+     LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );   // all passing the same
+     LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );   // thread-local matrix
+
+     memcpy( state,    hash0, 32 );   // lane k is stored at byte offset 32*k
+     memcpy( state+32, hash1, 32 );
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+
+int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{  // Scans nonces four at a time starting at work->data[19]; returns how many of the four lanes met the target.
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));    // four consecutive 8-word hashes, one per lane
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));  // big-endian header words, interleaved 4x32
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): read before the opt_benchmark override below
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;     // per-lane winning nonce, valid where found[k] is true
+   bool *found = work->nfound;          // per-lane hit flag, cleared on every iteration
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 76; // 19*4: word 19 (the nonce) of lane 0
+   uint32_t *noncep1 = vdata + 77;
+   uint32_t *noncep2 = vdata + 78;
+   uint32_t *noncep3 = vdata + 79;
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+
+   for ( int i=0; i < 19; i++ )
+      be32enc( &edata[i], pdata[i] );
+
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );   // same header in all four lanes
+
+   lyra2h_4way_midstate( vdata );   // computed once per call, reused by every lyra2h_4way_hash below
+
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n   );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+
+      lyra2h_4way_hash( hash, vdata );
+      pdata[19] = n;   // record progress in the work data, as scanhash_lyra2rev2_4way does; edata is not read again
+
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      {
+          found[0] = true;
+          num_found++;
+          nonces[0] = pdata[19] = n;
+          work_set_target_ratio( work, hash );
+      }
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      {
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
+      }
+      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+      {
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+          work_set_target_ratio( work, hash+16 );
+      }
+      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+      {
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+          work_set_target_ratio( work, hash+24 );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)    // stop on a hit, near max_nonce,
+                   && !work_restart[thr_id].restart); // or when this thread is told to restart
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
--- a/algo/lyra2/lyra2h-gate.c
+++ b/algo/lyra2/lyra2h-gate.c
@@ -0,0 +1,25 @@
+#include "lyra2h-gate.h"
+#include "lyra2.h"
+
+void lyra2h_set_target( struct work* work, double job_diff )   // forwards a scaled difficulty to work_set_target
+{
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );   // job_diff is divided by 256 and by opt_diff_factor
+}
+
+bool register_lyra2h_algo( algo_gate_t* gate )
+{  // Fills the gate with the lyra2h entry points (4-way variants when LYRA2H_4WAY is defined); always returns true.
+#ifdef LYRA2H_4WAY
+  gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2h_4way;
+  gate->hash       = (void*)&lyra2h_4way_hash;
+#else
+  gate->miner_thread_init = (void*)&lyra2h_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2h;
+  gate->hash       = (void*)&lyra2h_hash;
+#endif
+  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;   // same flags for both builds
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  gate->set_target = (void*)&lyra2h_set_target;
+  return true;
+}
+
--- a/algo/lyra2/lyra2h-gate.h
+++ b/algo/lyra2/lyra2h-gate.h
@@ -0,0 +1,32 @@
+#ifndef LYRA2H_GATE_H__
+#define LYRA2H_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY)
+  #define LYRA2H_4WAY
+#endif
+
+#define LYRA2H_MATRIX_SIZE  ( BLOCK_LEN_INT64 * 16 * 16 * 8 )  // size passed to _mm_malloc; parenthesized so it is safe inside larger expressions
+
+#if defined(LYRA2H_4WAY)
+
+void lyra2h_4way_hash( void *state, const void *input );
+
+int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+bool lyra2h_4way_thread_init();
+
+#endif
+
+void lyra2h_hash( void *state, const void *input );
+
+int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+bool lyra2h_thread_init();
+
+#endif
+
--- a/algo/lyra2/lyra2h.c
+++ b/algo/lyra2/lyra2h.c
@@ -0,0 +1,75 @@
+#include "lyra2h-gate.h"
+#include <memory.h>
+#include <mm_malloc.h>
+#include "lyra2.h"
+#include "algo/blake/sph_blake.h"
+
+__thread uint64_t* lyra2h_matrix;
+
+bool lyra2h_thread_init()   // allocates the calling thread's lyra2h_matrix (a __thread pointer)
+{
+   lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );   // 64-byte aligned
+   return lyra2h_matrix;   // converts to false when the allocation returned NULL
+}
+
+static __thread sph_blake256_context lyra2h_blake_mid;
+
+void lyra2h_midstate( const void* input )   // caches the blake256 state after the first 64 bytes of input
+{
+       sph_blake256_init( &lyra2h_blake_mid );
+       sph_blake256( &lyra2h_blake_mid, input, 64 );   // kept in the thread-local lyra2h_blake_mid for lyra2h_hash
+}
+
+void lyra2h_hash( void *state, const void *input )
+{       // Writes one 32-byte hash to state; requires lyra2h_midstate to have run on the same input first.
+        uint32_t _ALIGN(64) hash[16];
+
+        sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
+
+        memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid );   // start from the cached midstate
+        sph_blake256( &ctx_blake, input + 64, 16 );   // absorb the 16 bytes that follow the cached 64
+        sph_blake256_close( &ctx_blake, hash );
+
+        LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 16, 16, 16 );   // in place on hash, using the thread-local matrix
+
+    memcpy(state, hash, 32);   // only the first 32 bytes of hash are returned
+}
+
+int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done )
+{	// Scans nonces one at a time from work->data[19]; returns 1 on the first hash that passes fulltest, else 0.
+	uint32_t _ALIGN(64) hash[8];
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t Htarg = ptarget[7];	// NOTE(review): read before the opt_benchmark override below
+	const uint32_t first_nonce = pdata[19];
+	uint32_t nonce = first_nonce;
+
+	if (opt_benchmark)
+		ptarget[7] = 0x0000ff;
+
+	for (int i=0; i < 19; i++) {
+		be32enc(&endiandata[i], pdata[i]);
+	}
+
+        lyra2h_midstate( endiandata );   // computed once per call, reused by every lyra2h_hash below
+
+	do {
+		be32enc(&endiandata[19], nonce);	// word 19 carries the nonce under test
+                lyra2h_hash( hash, endiandata );
+
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {	// cheap top-word filter before the full comparison
+			work_set_target_ratio(work, hash);
+			pdata[19] = nonce;	// winning nonce is handed back through the work data
+			*hashes_done = pdata[19] - first_nonce;
+			return 1;
+		}
+		nonce++;
+
+	} while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+	pdata[19] = nonce;	// nothing found: leave the next untested nonce in the work data
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
                   {
 			pdata[19] = nonce;
 			*hashes_done = pdata[19] - first_nonce;
+                        work_set_target_ratio( work, hash );
 			return 1;
                   }
 		}
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -0,0 +1,177 @@
+#include "lyra2rev2-gate.h"
+#include <memory.h>
+
+#ifdef __AVX2__	
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+
+#include "algo/cubehash/sph_cubehash.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+
+typedef struct {   // one context per hash stage used by lyra2rev2_4way_hash
+   blake256_4way_context     blake;    // holds the midstate written by scanhash_lyra2rev2_4way
+   keccak256_4way_context    keccak;
+   cubehashParam             cube;     // used lane by lane, re-copied from the template between uses
+   skein256_4way_context     skein;
+        sph_bmw256_context       bmw;  // used lane by lane, re-copied from the template between uses
+
+} lyra2v2_4way_ctx_holder;
+
+static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
+
+void init_lyra2rev2_4way_ctx()   // prepares the template contexts in the file-scope l2v2_4way_ctx
+{
+// blake is not initialized here; scanhash_lyra2rev2_4way initializes it on every call
+   keccak256_4way_init( &l2v2_4way_ctx.keccak );
+   cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
+   skein256_4way_init( &l2v2_4way_ctx.skein );
+        sph_bmw256_init( &l2v2_4way_ctx.bmw );
+}
+
+void lyra2rev2_4way_hash( void *state, const void *input )
+{  // Writes four 32-byte hashes to state. Stage order: blake, keccak, cubehash, LYRA2REV2, skein, cubehash, bmw.
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t vhash[8*4] __attribute__ ((aligned (64)));     // blake output, interleaved
+   uint64_t vhash64[4*4] __attribute__ ((aligned (64)));   // 4x64 interleaved buffer for keccak and skein
+   lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
+   memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );   // private copy of every template context, blake midstate included
+
+   blake256_4way( &ctx.blake, input + (64<<2), 16 );   // continue past the part covered by the midstate
+//   blake256_4way( &ctx.blake, input, 80 );
+   blake256_4way_close( &ctx.blake, vhash );
+
+   mm256_reinterleave_4x64( vhash64, vhash, 256 );   // move the blake output into the 4x64 buffer keccak reads
+   keccak256_4way( &ctx.keccak, vhash64, 32 );
+   keccak256_4way_close( &ctx.keccak, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );   // lane 0 uses the copy made above
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );                   // fresh template before each later lane
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
+
+   LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );   // lanes run one after another,
+   LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );   // all passing the same
+   LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );   // thread-local matrix
+   LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+
+   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );   // back to 4x64 for skein
+   skein256_4way( &ctx.skein, vhash64, 32 );
+   skein256_4way_close( &ctx.skein, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );   // second cubehash pass: every lane needs a fresh template
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
+
+
+	sph_bmw256( &ctx.bmw, hash0, 32 );	// lane 0 uses the bmw context copied at the top
+	sph_bmw256_close( &ctx.bmw, hash0 );
+        memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );   // fresh template before each later lane
+        sph_bmw256( &ctx.bmw, hash1, 32 );
+        sph_bmw256_close( &ctx.bmw, hash1 );
+        memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
+        sph_bmw256( &ctx.bmw, hash2, 32 );
+        sph_bmw256_close( &ctx.bmw, hash2 );
+        memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw );
+        sph_bmw256( &ctx.bmw, hash3, 32 );
+        sph_bmw256_close( &ctx.bmw, hash3 );
+
+
+   memcpy( state,    hash0, 32 );   // lane k is stored at byte offset 32*k
+   memcpy( state+32, hash1, 32 );
+   memcpy( state+64, hash2, 32 );
+   memcpy( state+96, hash3, 32 );
+}
+
+int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done )
+{  // Scans nonces four at a time starting at work->data[19]; returns how many of the four lanes met the target.
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));    // four consecutive 8-word hashes, one per lane
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));  // header words, interleaved 4x32
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t Htarg = ptarget[7];   // NOTE(review): read before the opt_benchmark override below
+   uint32_t *nonces = work->nonces;     // per-lane winning nonce, valid where found[k] is true
+   bool *found = work->nfound;          // per-lane hit flag, cleared on every iteration
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 76; // 19*4: word 19 (the nonce) of lane 0
+   uint32_t *noncep1 = vdata + 77;
+   uint32_t *noncep2 = vdata + 78;
+   uint32_t *noncep3 = vdata + 79;
+
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+
+   swab32_array( edata, pdata, 20 );
+
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );   // same header in all four lanes
+
+   blake256_4way_init( &l2v2_4way_ctx.blake );         // NOTE(review): l2v2_4way_ctx is a plain static, not __thread,
+   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );   // so this midstate is shared by every caller - confirm that is safe
+
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n   );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+
+      lyra2rev2_4way_hash( hash, vdata );
+      pdata[19] = n;   // record the group just hashed in the work data
+
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      {
+          found[0] = true;
+          num_found++;
+          nonces[0] = pdata[19] = n;
+          work_set_target_ratio( work, hash );
+      }
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      {
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
+      }
+      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+      {
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+          work_set_target_ratio( work, hash+16 );
+      }
+      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+      {
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+          work_set_target_ratio( work, hash+24 );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)    // stop on a hit, near max_nonce,
+                   && !work_restart[thr_id].restart); // or when this thread is told to restart
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/lyra2/lyra2rev2-gate.c
+++ b/algo/lyra2/lyra2rev2-gate.c
@@ -0,0 +1,38 @@
+#include "lyra2rev2-gate.h"
+
+__thread uint64_t* l2v2_wholeMatrix;
+
+void lyra2rev2_set_target( struct work* work, double job_diff )   // forwards a scaled difficulty to work_set_target
+{
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );   // job_diff is divided by 256 and by opt_diff_factor
+}
+
+bool lyra2rev2_thread_init()   // allocates the calling thread's l2v2_wholeMatrix (a __thread pointer)
+{
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols: 4 columns of BLOCK_LEN_INT64 words
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;   // 8 bytes per 64-bit word
+
+   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows: 4 rows; the 64-bit product is narrowed to int here
+   l2v2_wholeMatrix = _mm_malloc( i, 64 );   // 64-byte aligned
+
+   return l2v2_wholeMatrix;   // converts to false when the allocation returned NULL
+}
+
+bool register_lyra2rev2_algo( algo_gate_t* gate )
+{  // Initializes the template contexts and fills the gate (4-way variants when LYRA2REV2_4WAY is defined); always returns true.
+#if defined (LYRA2REV2_4WAY)
+  init_lyra2rev2_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
+  gate->hash      = (void*)&lyra2rev2_4way_hash;
+#else
+  init_lyra2rev2_ctx();
+  gate->scanhash  = (void*)&scanhash_lyra2rev2;
+  gate->hash      = (void*)&lyra2rev2_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;   // same flags for both builds
+  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;   // both builds share the same matrix allocator
+  gate->set_target        = (void*)&lyra2rev2_set_target;
+  return true;
+}
+
+
--- a/algo/lyra2/lyra2rev2-gate.h
+++ b/algo/lyra2/lyra2rev2-gate.h
@@ -0,0 +1,35 @@
+#ifndef LYRA2REV2_GATE_H__
+#define LYRA2REV2_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+#include "lyra2.h"
+
+#if defined(HASH_4WAY)
+  #define LYRA2REV2_4WAY
+#endif
+
+extern __thread uint64_t* l2v2_wholeMatrix;
+
+bool register_lyra2rev2_algo( algo_gate_t* gate );
+
+#if defined(LYRA2REV2_4WAY)
+
+void lyra2rev2_4way_hash( void *state, const void *input );
+
+int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_lyra2rev2_4way_ctx();
+
+#endif
+
+void lyra2rev2_hash( void *state, const void *input );
+
+int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_lyra2rev2_ctx();
+
+#endif
+
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -1,20 +1,12 @@
+#include "lyra2rev2-gate.h"
 #include <memory.h>
-
-#include "algo-gate-api.h"
-
 #include "algo/blake/sph_blake.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
-#include "lyra2.h"
-#include "avxdefs.h"
-
-// This gets allocated when miner_thread starts up and is never freed.
-// It's not a leak because the only way to allocate it again is to exit
-// the thread and that only occurs when the entire program exits.
-__thread uint64_t* l2v2_wholeMatrix;
+//#include "lyra2.h"

 typedef struct {
        cubehashParam           cube1;
@@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
                   if( fulltest(hash, ptarget) )
                   {
 			pdata[19] = nonce;
+                        work_set_target_ratio( work, hash );
 			*hashes_done = pdata[19] - first_nonce;
 		   	return 1;
 		   }
@@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
 	return 0;
 }

-void lyra2rev2_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool lyra2rev2_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
-   l2v2_wholeMatrix = _mm_malloc( i, 64 );
-
-   return l2v2_wholeMatrix;
-}
-
-bool register_lyra2rev2_algo( algo_gate_t* gate )
-{
-  init_lyra2rev2_ctx();
-  gate->optimizations = AVX_OPT | AVX2_OPT;
-  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
-  gate->scanhash          = (void*)&scanhash_lyra2rev2;
-  gate->hash              = (void*)&lyra2rev2_hash;
-  gate->set_target        = (void*)&lyra2rev2_set_target;
-  return true;
-};
-
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -4,13 +4,10 @@

 #include <memory.h>
 #include <mm_malloc.h>
-//#include "algo-gate-api.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
 #include "algo/blake/blake-hash-4way.h"
-//#include "avxdefs.h"

-// same size, only difference is the name, lyra2 is done serially
 __thread uint64_t* lyra2z_4way_matrix;

 bool lyra2z_4way_thread_init()
@@ -26,12 +23,8 @@ void lyra2z_4way_midstate( const void* input )
       blake256_4way( &l2z_4way_blake_mid, input, 64 );
 }

-// block 2050 new algo, blake plus new lyra parms. new input
-// is power of 2 so normal lyra can be used
-//void zcoin_hash(void *state, const void *input, uint32_t height)
 void lyra2z_4way_hash( void *state, const void *input )
 {
-//        uint32_t _ALIGN(64) hash[16];
     uint32_t hash0[8] __attribute__ ((aligned (64)));
     uint32_t hash1[8] __attribute__ ((aligned (64)));
     uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -39,27 +32,21 @@ void lyra2z_4way_hash( void *state, const void *input )
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

-//     memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
-//     blake256_4way( &ctx_blake, input + (64*4), 16 );
-//     blake256_4way_close( &ctx_blake, vhash );
-
-     blake256_4way_init( &ctx_blake );
-     blake256_4way( &ctx_blake, input, 80 );
+     memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
+     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

     LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
-//     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
     LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
-//     LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );

     memcpy( state,    hash0, 32 );
     memcpy( state+32, hash1, 32 );
     memcpy( state+64, hash2, 32 );
     memcpy( state+96, hash3, 32 );
-
-//    memcpy(state, hash, 32);
 }

 int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -67,7 +54,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-//	uint32_t _ALIGN(64) hash[8];
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -90,7 +76,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

-//   lyra2z_4way_midstate( vdata );
+   lyra2z_4way_midstate( vdata );

   do {
      found[0] = found[1] = found[2] = found[3] = false;
@@ -99,46 +85,38 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

-      be32enc( &edata[19], n );
      lyra2z_4way_hash( hash, vdata );
+      pdata[19] = n;

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
-printf("found 0\n");
          found[0] = true;
          num_found++;
          nonces[0] = pdata[19] = n;
          work_set_target_ratio( work, hash );
      }
-/*      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
-printf("found 1\n");          
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
          work_set_target_ratio( work, hash+8 );
      }
-*/
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
      {
-printf("found 2\n");          
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
          work_set_target_ratio( work, hash+16 );
      }
-/*
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
      {
-printf("found 3\n");          
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
          work_set_target_ratio( work, hash+24 );
      }
      n += 4;
-*/
-      n += 2;
   } while ( (num_found == 0) && (n < max_nonce-4)
                   && !work_restart[thr_id].restart);

@@ -148,21 +126,3 @@ printf("found 3\n");

 #endif

-/*
-
-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
-}
-*/
-
--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -9,18 +9,15 @@ void lyra2z_set_target( struct work* work, double job_diff )
 bool register_lyra2z_algo( algo_gate_t* gate )
 {
 #ifdef LYRA2Z_4WAY
-  four_way_not_tested();
-  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
  gate->hash       = (void*)&lyra2z_4way_hash;
 #else
-  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2z_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-
+  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&lyra2z_set_target;
  return true;
--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-/*
-//int64_t get_max64_0xffffLL() { return 0xffffLL; };
-
-void lyra2z_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
-{
-   work->height = sctx->bloc_height;
-   return false;
-}
-
-
-bool lyra2z_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
-   lyra2z_wholeMatrix = _mm_malloc( i, 64 );
-
-   return lyra2z_wholeMatrix;
-}
-
-bool register_lyra2z_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  gate->miner_thread_init = (void*)&lyra2z_thread_init;
-  gate->scanhash   = (void*)&scanhash_lyra2z;
-  gate->hash       = (void*)&lyra2z_hash;
-  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&lyra2z_set_target;
-//  gate->prevent_dupes = (void*)&zcoin_get_work_height;
-  return true;
-};
-*/
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -65,13 +65,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotl256_1x64( s1); \
+   s1 = mm256_rotr256_1x64( s1); \
   s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rotr256_1x64( s3 ); \
+   s3 = mm256_rotl256_1x64( s3 ); \
   G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotr256_1x64( s1 ); \
+   s1 = mm256_rotl256_1x64( s1 ); \
   s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rotl256_1x64( s3 );
+   s3 = mm256_rotr256_1x64( s3 );

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
                    hash_str,
                    target_str);
            }
+            work_set_target_ratio( work, hash );
            pdata[19] = data[19];
            goto out;
 	  }
--- a/algo/neoscrypt/neoscrypt.c
+++ b/algo/neoscrypt/neoscrypt.c
--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -2,7 +2,7 @@

 bool register_nist5_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
 #if defined (NIST5_4WAY)
    gate->scanhash = (void*)&scanhash_nist5_4way;
    gate->hash     = (void*)&nist5hash_4way;
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+#if defined(HASH_4WAY) && defined(__AES__)
  #define NIST5_4WAY
 #endif

--- a/algo/nist5/nist5.c
+++ b/algo/nist5/nist5.c
@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
 				if (!(hash64[7] & mask)) {
 					printf("[%d]",thr_id);
 					if (fulltest(hash64, ptarget)) {
+                                                work_set_target_ratio( work, hash64 );
 						*hashes_done = n - first_nonce + 1;
 						return true;
 					}
--- a/algo/nist5/zr5.c
+++ b/algo/nist5/zr5.c
@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
         pdata[0] = tmpdata[0];
         pdata[19] = nonce;
         *hashes_done = pdata[19] - first_nonce + 1;
+         work_set_target_ratio( work, hash );
         if (opt_debug)
           applog(LOG_INFO, "found nonce %x", nonce);
         return 1;
--- a/algo/polytimos/polytimos-gate.h
+++ b/algo/polytimos/polytimos-gate.h
@@ -1,12 +0,0 @@
-#ifndef __POLYTIMOS_GATE_H__
-#define __POLYTIMOS_GATE_H__
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-void polytimos_hash( void *state, const void *input );
-int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-void init_polytimos_context();
-
-#endif
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -0,0 +1,207 @@
+#include "cpuminer-config.h"
+#include "quark-gate.h"
+// Build this unit exactly when quark-gate.h selects the 4-way path, so it always matches the registration code.
+#if defined (QUARK_4WAY)
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+typedef struct {   // one hashing context per primitive used by quark_4way_hash
+    blake512_4way_context  blake;
+    bmw512_4way_context    bmw;
+    hashState_groestl      groestl;
+    jh512_4way_context     jh;
+    skein512_4way_context  skein;
+    keccak512_4way_context keccak;
+} quark_4way_ctx_holder;
+// Template filled in by init_quark_4way_ctx(); quark_4way_hash() copies it into a local context on every call.
+quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));
+
+void init_quark_4way_ctx( void )   // initialise every member of the shared template context quark_4way_ctx
+{
+     blake512_4way_init( &quark_4way_ctx.blake );
+     bmw512_4way_init( &quark_4way_ctx.bmw );
+     init_groestl( &quark_4way_ctx.groestl, 64 );   // 64: presumably the digest length in bytes -- confirm in hash-groestl.h
+     skein512_4way_init( &quark_4way_ctx.skein );
+     jh512_4way_init( &quark_4way_ctx.jh );
+     keccak512_4way_init( &quark_4way_ctx.keccak );
+}
+
+void quark_4way_hash( void *state, const void *input )   // Quark chain over four interleaved lanes; 80 bytes of input are hashed per lane
+{
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
+    __m256i* vh  = (__m256i*)vhash;
+    __m256i* vhA = (__m256i*)vhashA;
+    __m256i* vhB = (__m256i*)vhashB;
+    __m256i vh_mask;
+    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );   // value 8 (bit 3) in every 64-bit element
+    int i;
+    quark_4way_ctx_holder ctx;
+    memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );   // start from the pre-initialised template
+    // Unconditional stages: blake512 over the 80-byte input, then bmw512 over the 64-byte result.
+    blake512_4way( &ctx.blake, input, 80 );
+    blake512_4way_close( &ctx.blake, vhash );
+
+    bmw512_4way( &ctx.bmw, vhash, 64 );
+    bmw512_4way_close( &ctx.bmw, vhash );
+    // Mask element is all ones where (vh[0] & 8) == 0; vh[0] presumably holds word 0 of each lane (4x64 interleave).
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  mm256_zero );
+    // Both candidates are computed for all lanes: groestl into vhashA, skein into vhashB.
+       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );   // first use: context still as copied from the template
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_close( &ctx.skein, vhashB );
+    // blendv takes vhB (skein) where the mask is set, i.e. bit 3 clear, and vhA (groestl) where bit 3 is set.
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    // Unconditional groestl: the groestl context is single-lane, so deinterleave, hash each lane, re-interleave.
+    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+    jh512_4way( &ctx.jh, vhash, 64 );
+    jh512_4way_close( &ctx.jh, vhash );
+
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  mm256_zero );
+    // Second selection: blake into vhashA (taken where bit 3 is set), bmw into vhashB (taken where it is clear).
+       blake512_4way_init( &ctx.blake );   // contexts used earlier are re-initialised before reuse
+       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_close( &ctx.blake, vhashA );
+
+       bmw512_4way_init( &ctx.bmw );
+       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_close( &ctx.bmw, vhashB );
+
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    // Unconditional stages: keccak512 then skein512.
+    keccak512_4way( &ctx.keccak, vhash, 64 );
+    keccak512_4way_close( &ctx.keccak, vhash );
+
+    skein512_4way_init( &ctx.skein );
+    skein512_4way( &ctx.skein, vhash, 64 );
+    skein512_4way_close( &ctx.skein, vhash );
+
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  mm256_zero );
+    // Third selection: keccak into vhashA (taken where bit 3 is set), jh into vhashB (taken where it is clear).
+       keccak512_4way_init( &ctx.keccak );
+       keccak512_4way( &ctx.keccak, vhash, 64 );
+       keccak512_4way_close( &ctx.keccak, vhashA );
+
+       jh512_4way_init( &ctx.jh );
+       jh512_4way( &ctx.jh, vhash, 64 );
+       jh512_4way_close( &ctx.jh, vhashB );
+
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    // One output per lane at state + 0/32/64/96 (void* arithmetic: GNU extension); 256 is presumably the bit length.
+    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+}
+
+int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));      // four results, 8 words each
+    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    int num_found = 0;
+    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+    uint32_t *noncep1 = vdata + 75;
+    uint32_t *noncep2 = vdata + 77;
+    uint32_t *noncep3 = vdata + 79;
+    // Byte-swap the 20 header words, then replicate the same header into all four lanes of vdata.
+    swab32_array( endiandata, pdata, 20 );
+
+    uint64_t *edata = (uint64_t*)endiandata;
+    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+    // Scan four consecutive nonces per pass; returns the number of lanes that met the target.
+    do
+    {
+       found[0] = found[1] = found[2] = found[3] = false;
+       be32enc( noncep0, n   );
+       be32enc( noncep1, n+1 );
+       be32enc( noncep2, n+2 );
+       be32enc( noncep3, n+3 );
+       // hash receives lane k's result at hash + 8*k.
+       quark_4way_hash( hash, vdata );
+       pdata[19] = n;
+
+       if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) )
+       {
+          found[0] = true;
+          num_found++;
+          nonces[0] = n;
+          work_set_target_ratio( work, hash );
+       }
+       if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
+       {
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );    // ratio from the lane that passed, not lane 0
+       }
+       if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
+       {
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+          work_set_target_ratio( work, hash+16 );   // ratio from the lane that passed, not lane 0
+       }
+       if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
+       {
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+          work_set_target_ratio( work, hash+24 );   // ratio from the lane that passed, not lane 0
+       }
+       n += 4;
+    } while ( ( num_found == 0 ) && ( n < max_nonce )
+              && !work_restart[thr_id].restart );
+
+    *hashes_done = n - first_nonce + 1;
+    return num_found;
+}
+
+#endif
--- a/algo/quark/quark-gate.c
+++ b/algo/quark/quark-gate.c
@@ -0,0 +1,17 @@
+#include "quark-gate.h"
+
+bool register_quark_algo( algo_gate_t* gate )   // install the Quark scanhash/hash callbacks on gate; always returns true
+{
+#if defined (QUARK_4WAY)
+  init_quark_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_quark_4way;
+  gate->hash      = (void*)&quark_4way_hash;
+#else
+  init_quark_ctx();
+  gate->scanhash  = (void*)&scanhash_quark;
+  gate->hash      = (void*)&quark_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;   // same flag set for both build variants
+  return true;
+}
+
--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -0,0 +1,32 @@
+#ifndef QUARK_GATE_H__
+#define QUARK_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+// QUARK_4WAY selects the 4-way implementation; it needs both HASH_4WAY and AES support at compile time.
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define QUARK_4WAY
+#endif
+// Installs either the 4-way or the scalar Quark callbacks on the gate.
+bool register_quark_algo( algo_gate_t* gate );
+
+#if defined(QUARK_4WAY)
+
+void quark_4way_hash( void *state, const void *input );
+
+int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_quark_4way_ctx( void );
+
+#endif
+// The scalar implementation is declared unconditionally.
+void quark_hash( void *state, const void *input );
+
+int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_quark_ctx( void );
+
+#endif
+
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -1,5 +1,5 @@
 #include "cpuminer-config.h"
-#include "algo-gate-api.h"
+#include "quark-gate.h"

 #include <stdio.h>
 #include <string.h>
@@ -47,7 +47,7 @@ void init_quark_ctx()
 #endif
 }

-inline static void quarkhash(void *state, const void *input)
+void quark_hash(void *state, const void *input)
 {
    unsigned char hashbuf[128];
    size_t hashptr;
@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
 	do {
 		pdata[19] = ++n;
 		be32enc(&endiandata[19], n); 
-		quarkhash(hash64, &endiandata);
+		quark_hash(hash64, &endiandata);
                if ((hash64[7]&0xFFFFFF00)==0)
                {
                  if (fulltest(hash64, ptarget)) 
                  {
+                    work_set_target_ratio( work, hash64 );
                    *hashes_done = n - first_nonce + 1;
 		    return true;
                  }
@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-bool register_quark_algo( algo_gate_t* gate )
-{
-  init_quark_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->scanhash         = (void*)&scanhash_quark;
-  gate->hash             = (void*)&quarkhash;
-  return true;
-};
-
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -1,31 +1,20 @@
 #include "algo-gate-api.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-
 #include "algo/luffa/sse2/luffa_for_sse2.h" 
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
-#include "algo/simd/sse2/nist.h"
-#include "algo/shavite/sph_shavite.h"
-
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
+#else
+#include "algo/echo/sph_echo.h"
 #endif

 typedef struct
 {
        hashState_luffa         luffa;
        cubehashParam           cubehash;
-        sph_shavite512_context  shavite;
-        hashState_sd            simd;
 #ifdef NO_AES_NI
        sph_echo512_context echo;
 #else
@@ -133,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
 	        	if (!(hash64[7] & mask)) {
 		            printf("[%d]",thr_id);
 			    if (fulltest(hash64, ptarget)) {
+                             work_set_target_ratio( work, hash64 );
                             *hashes_done = n - first_nonce + 1;
 				return true;
 	                    }
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -1,23 +1,16 @@
 #include "algo-gate-api.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-
 #include "algo/luffa/sse2/luffa_for_sse2.h" 
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
 #include "algo/simd/sse2/nist.h"
 #include "algo/shavite/sph_shavite.h"
-
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
+#else
+#include "algo/echo/sph_echo.h"
 #endif

 typedef struct
@@ -141,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work,
 	        	if (!(hash64[7] & mask)) {
 		            printf("[%d]",thr_id);
 			    if (fulltest(hash64, ptarget)) {
+                             work_set_target_ratio( work, hash64 );
                             *hashes_done = n - first_nonce + 1;
 				return true;
 	                    }
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
 			if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
 				*hashes_done = n - pdata[19] + 1;
 				pdata[19] = data[i * 20 + 19];
+                                work_set_target_ratio( work, hash + i * 8 );   // ratio from the lane that passed fulltest
 				return 1;
 			}
 		}
--- a/algo/scryptjane/scrypt-jane-chacha.h
+++ b/algo/scryptjane/scrypt-jane-chacha.h
@@ -114,7 +114,7 @@ available_implementations() {
 	return flags;
 }
 #endif
-
+/*
 static int
 scrypt_test_mix() {
 	static const uint8_t expected[16] = {
@@ -145,4 +145,4 @@ scrypt_test_mix() {

 	return ret;
 }
-
+*/
--- a/algo/scryptjane/scrypt-jane-hash.h
+++ b/algo/scryptjane/scrypt-jane-hash.h
@@ -26,7 +26,7 @@
 #include "scrypt-jane-pbkdf2.h"

 #define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
-
+/*
 static int
 scrypt_test_hash() {
 	scrypt_hash_state st;
@@ -45,4 +45,4 @@ scrypt_test_hash() {
 	scrypt_hash_finish(&st, final);
 	return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
 }
-
+*/
--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input,  uint32_t len)
        memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );

        SHA256_Update( &ctx_sha256, input + midlen, tail );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );

        memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
        SHA256_Update( &ctx_sha256, hashA, 32 );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );

        memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
        SHA256_Update( &ctx_sha256, hashA, 32 );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
 #else
        sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
        memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -0,0 +1,618 @@
+/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
+/*
+ * Shabal implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#ifdef __AVX2__
+
+#include "shabal-hash-4way.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * Part of this code was automatically generated (the part between
+ * the "BEGIN" and "END" markers).
+ */
+
+#define sM    16
+
+#define C32   SPH_C32
+#define T32   SPH_T32
+
+#define O1   13
+#define O2    9
+#define O3    6
+
+/*
+ * We copy the state into local variables, so that the compiler knows
+ * that it can optimize them at will.
+ */
+
+/* BEGIN -- automatically generated code. */
+
+#define DECL_STATE   \
+	__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
+	        A08, A09, A0A, A0B; \
+	__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
+	        B8, B9, BA, BB, BC, BD, BE, BF; \
+	__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
+	        C8, C9, CA, CB, CC, CD, CE, CF; \
+	__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
+	        M8, M9, MA, MB, MC, MD, ME, MF; \
+	sph_u32 Wlow, Whigh;
+
+#define READ_STATE(state)   do { \
+		A00 = (state)->A[0]; \
+		A01 = (state)->A[1]; \
+		A02 = (state)->A[2]; \
+		A03 = (state)->A[3]; \
+		A04 = (state)->A[4]; \
+		A05 = (state)->A[5]; \
+		A06 = (state)->A[6]; \
+		A07 = (state)->A[7]; \
+		A08 = (state)->A[8]; \
+		A09 = (state)->A[9]; \
+		A0A = (state)->A[10]; \
+		A0B = (state)->A[11]; \
+		B0 = (state)->B[0]; \
+		B1 = (state)->B[1]; \
+		B2 = (state)->B[2]; \
+		B3 = (state)->B[3]; \
+		B4 = (state)->B[4]; \
+		B5 = (state)->B[5]; \
+		B6 = (state)->B[6]; \
+		B7 = (state)->B[7]; \
+		B8 = (state)->B[8]; \
+		B9 = (state)->B[9]; \
+		BA = (state)->B[10]; \
+		BB = (state)->B[11]; \
+		BC = (state)->B[12]; \
+		BD = (state)->B[13]; \
+		BE = (state)->B[14]; \
+		BF = (state)->B[15]; \
+		C0 = (state)->C[0]; \
+		C1 = (state)->C[1]; \
+		C2 = (state)->C[2]; \
+		C3 = (state)->C[3]; \
+		C4 = (state)->C[4]; \
+		C5 = (state)->C[5]; \
+		C6 = (state)->C[6]; \
+		C7 = (state)->C[7]; \
+		C8 = (state)->C[8]; \
+		C9 = (state)->C[9]; \
+		CA = (state)->C[10]; \
+		CB = (state)->C[11]; \
+		CC = (state)->C[12]; \
+		CD = (state)->C[13]; \
+		CE = (state)->C[14]; \
+		CF = (state)->C[15]; \
+		Wlow = (state)->Wlow; \
+		Whigh = (state)->Whigh; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { \
+		(state)->A[0] = A00; \
+		(state)->A[1] = A01; \
+		(state)->A[2] = A02; \
+		(state)->A[3] = A03; \
+		(state)->A[4] = A04; \
+		(state)->A[5] = A05; \
+		(state)->A[6] = A06; \
+		(state)->A[7] = A07; \
+		(state)->A[8] = A08; \
+		(state)->A[9] = A09; \
+		(state)->A[10] = A0A; \
+		(state)->A[11] = A0B; \
+		(state)->B[0] = B0; \
+		(state)->B[1] = B1; \
+		(state)->B[2] = B2; \
+		(state)->B[3] = B3; \
+		(state)->B[4] = B4; \
+		(state)->B[5] = B5; \
+		(state)->B[6] = B6; \
+		(state)->B[7] = B7; \
+		(state)->B[8] = B8; \
+		(state)->B[9] = B9; \
+		(state)->B[10] = BA; \
+		(state)->B[11] = BB; \
+		(state)->B[12] = BC; \
+		(state)->B[13] = BD; \
+		(state)->B[14] = BE; \
+		(state)->B[15] = BF; \
+		(state)->C[0] = C0; \
+		(state)->C[1] = C1; \
+		(state)->C[2] = C2; \
+		(state)->C[3] = C3; \
+		(state)->C[4] = C4; \
+		(state)->C[5] = C5; \
+		(state)->C[6] = C6; \
+		(state)->C[7] = C7; \
+		(state)->C[8] = C8; \
+		(state)->C[9] = C9; \
+		(state)->C[10] = CA; \
+		(state)->C[11] = CB; \
+		(state)->C[12] = CC; \
+		(state)->C[13] = CD; \
+		(state)->C[14] = CE; \
+		(state)->C[15] = CF; \
+		(state)->Wlow = Wlow; \
+		(state)->Whigh = Whigh; \
+	} while (0)
+
+#define DECODE_BLOCK \
+do { \
+   M0 = buf[ 0]; \
+   M1 = buf[ 1]; \
+   M2 = buf[ 2]; \
+   M3 = buf[ 3]; \
+   M4 = buf[ 4]; \
+   M5 = buf[ 5]; \
+   M6 = buf[ 6]; \
+   M7 = buf[ 7]; \
+   M8 = buf[ 8]; \
+   M9 = buf[ 9]; \
+   MA = buf[10]; \
+   MB = buf[11]; \
+   MC = buf[12]; \
+   MD = buf[13]; \
+   ME = buf[14]; \
+   MF = buf[15]; \
+} while (0)
+
+#define INPUT_BLOCK_ADD \
+do { \
+    B0 = _mm_add_epi32( B0, M0 );\
+    B1 = _mm_add_epi32( B1, M1 );\
+    B2 = _mm_add_epi32( B2, M2 );\
+    B3 = _mm_add_epi32( B3, M3 );\
+    B4 = _mm_add_epi32( B4, M4 );\
+    B5 = _mm_add_epi32( B5, M5 );\
+    B6 = _mm_add_epi32( B6, M6 );\
+    B7 = _mm_add_epi32( B7, M7 );\
+    B8 = _mm_add_epi32( B8, M8 );\
+    B9 = _mm_add_epi32( B9, M9 );\
+    BA = _mm_add_epi32( BA, MA );\
+    BB = _mm_add_epi32( BB, MB );\
+    BC = _mm_add_epi32( BC, MC );\
+    BD = _mm_add_epi32( BD, MD );\
+    BE = _mm_add_epi32( BE, ME );\
+    BF = _mm_add_epi32( BF, MF );\
+} while (0)
+
+#define INPUT_BLOCK_SUB \
+do { \
+    C0 = _mm_sub_epi32( C0, M0 ); \
+    C1 = _mm_sub_epi32( C1, M1 ); \
+    C2 = _mm_sub_epi32( C2, M2 ); \
+    C3 = _mm_sub_epi32( C3, M3 ); \
+    C4 = _mm_sub_epi32( C4, M4 ); \
+    C5 = _mm_sub_epi32( C5, M5 ); \
+    C6 = _mm_sub_epi32( C6, M6 ); \
+    C7 = _mm_sub_epi32( C7, M7 ); \
+    C8 = _mm_sub_epi32( C8, M8 ); \
+    C9 = _mm_sub_epi32( C9, M9 ); \
+    CA = _mm_sub_epi32( CA, MA ); \
+    CB = _mm_sub_epi32( CB, MB ); \
+    CC = _mm_sub_epi32( CC, MC ); \
+    CD = _mm_sub_epi32( CD, MD ); \
+    CE = _mm_sub_epi32( CE, ME ); \
+    CF = _mm_sub_epi32( CF, MF ); \
+} while (0)
+
+#define XOR_W /* xor the counter words Wlow / Whigh, broadcast to every 32-bit element, into A00 / A01 */ \
+do { \
+   A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
+   A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
+} while (0)
+/*
+#define SWAP(v1, v2)   do { \
+		sph_u32 tmp = (v1); \
+		(v1) = (v2); \
+		(v2) = tmp; \
+	} while (0)
+*/
+#define SWAP_BC \
+do { \
+    mm_swap_128( B0, C0 ); \
+    mm_swap_128( B1, C1 ); \
+    mm_swap_128( B2, C2 ); \
+    mm_swap_128( B3, C3 ); \
+    mm_swap_128( B4, C4 ); \
+    mm_swap_128( B5, C5 ); \
+    mm_swap_128( B6, C6 ); \
+    mm_swap_128( B7, C7 ); \
+    mm_swap_128( B8, C8 ); \
+    mm_swap_128( B9, C9 ); \
+    mm_swap_128( BA, CA ); \
+    mm_swap_128( BB, CB ); \
+    mm_swap_128( BC, CC ); \
+    mm_swap_128( BD, CD ); \
+    mm_swap_128( BE, CE ); \
+    mm_swap_128( BF, CF ); \
+} while (0)
+
+#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
+do { /* xa0 = xm ^ xb1 ^ (xb2 & ~xb3) ^ 3*(xa0 ^ xc ^ 5*rotl(xa1,15)); xb0 = ~(xa0 ^ rotl(xb0,1)), per 32-bit element; assumes mm_rotl_32 / mm_not are rotate-left / bitwise-not -- confirm in avxdefs.h */ \
+   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
+            _mm_andnot_si128( xb3, xb2 ), \
+            _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
+               _mm_mullo_epi32(  mm_rotl_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
+                   ) ), _mm_set1_epi32(3UL) ) ) ) ); \
+   xb0 = mm_not( _mm_xor_si128( xa0, mm_rotl_32( xb0, 1 ) ) ); \
+} while (0)
+
+#define PERM_STEP_0   do { \
+		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define PERM_STEP_1   do { \
+		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define PERM_STEP_2   do { \
+		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define APPLY_P \
+do { \
+    B0 = mm_rotr_32( B0, 15 ); \
+    B1 = mm_rotr_32( B1, 15 ); \
+    B2 = mm_rotr_32( B2, 15 ); \
+    B3 = mm_rotr_32( B3, 15 ); \
+    B4 = mm_rotr_32( B4, 15 ); \
+    B5 = mm_rotr_32( B5, 15 ); \
+    B6 = mm_rotr_32( B6, 15 ); \
+    B7 = mm_rotr_32( B7, 15 ); \
+    B8 = mm_rotr_32( B8, 15 ); \
+    B9 = mm_rotr_32( B9, 15 ); \
+    BA = mm_rotr_32( BA, 15 ); \
+    BB = mm_rotr_32( BB, 15 ); \
+    BC = mm_rotr_32( BC, 15 ); \
+    BD = mm_rotr_32( BD, 15 ); \
+    BE = mm_rotr_32( BE, 15 ); \
+    BF = mm_rotr_32( BF, 15 ); \
+    PERM_STEP_0; \
+    PERM_STEP_1; \
+    PERM_STEP_2; \
+    A0B = _mm_add_epi32( A0B, C6 ); \
+    A0A = _mm_add_epi32( A0A, C5 ); \
+    A09 = _mm_add_epi32( A09, C4 ); \
+    A08 = _mm_add_epi32( A08, C3 ); \
+    A07 = _mm_add_epi32( A07, C2 ); \
+    A06 = _mm_add_epi32( A06, C1 ); \
+    A05 = _mm_add_epi32( A05, C0 ); \
+    A04 = _mm_add_epi32( A04, CF ); \
+    A03 = _mm_add_epi32( A03, CE ); \
+    A02 = _mm_add_epi32( A02, CD ); \
+    A01 = _mm_add_epi32( A01, CC ); \
+    A00 = _mm_add_epi32( A00, CB ); \
+    A0B = _mm_add_epi32( A0B, CA ); \
+    A0A = _mm_add_epi32( A0A, C9 ); \
+    A09 = _mm_add_epi32( A09, C8 ); \
+    A08 = _mm_add_epi32( A08, C7 ); \
+    A07 = _mm_add_epi32( A07, C6 ); \
+    A06 = _mm_add_epi32( A06, C5 ); \
+    A05 = _mm_add_epi32( A05, C4 ); \
+    A04 = _mm_add_epi32( A04, C3 ); \
+    A03 = _mm_add_epi32( A03, C2 ); \
+    A02 = _mm_add_epi32( A02, C1 ); \
+    A01 = _mm_add_epi32( A01, C0 ); \
+    A00 = _mm_add_epi32( A00, CF ); \
+    A0B = _mm_add_epi32( A0B, CE ); \
+    A0A = _mm_add_epi32( A0A, CD ); \
+    A09 = _mm_add_epi32( A09, CC ); \
+    A08 = _mm_add_epi32( A08, CB ); \
+    A07 = _mm_add_epi32( A07, CA ); \
+    A06 = _mm_add_epi32( A06, C9 ); \
+    A05 = _mm_add_epi32( A05, C8 ); \
+    A04 = _mm_add_epi32( A04, C7 ); \
+    A03 = _mm_add_epi32( A03, C6 ); \
+    A02 = _mm_add_epi32( A02, C5 ); \
+    A01 = _mm_add_epi32( A01, C4 ); \
+    A00 = _mm_add_epi32( A00, C3 ); \
+} while (0)
+
+#define INCR_W   do { /* increment Wlow; when it wraps to 0, carry into Whigh */ \
+		if ((Wlow = T32(Wlow + 1)) == 0) \
+			Whigh = T32(Whigh + 1); \
+	} while (0)
+
+static const sph_u32 A_init_256[] = {
+	C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
+	C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
+	C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
+};
+
+static const sph_u32 B_init_256[] = {
+	C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
+	C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
+	C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
+	C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
+};
+
+static const sph_u32 C_init_256[] = {
+	C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
+	C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
+	C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
+	C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
+};
+
+static const sph_u32 A_init_512[] = {
+	C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
+	C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
+	C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
+};
+
+static const sph_u32 B_init_512[] = {
+	C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
+	C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
+	C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
+	C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
+};
+
+static const sph_u32 C_init_512[] = {
+	C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
+	C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
+	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
+	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
+};
+
+static void
+shabal_4way_init( void *cc, unsigned size )   // load the initial A/B/C state for the given output size and reset counters
+{
+   shabal_4way_context *sc = (shabal_4way_context*)cc;
+   int i;
+   // Each initial word is broadcast to all four 32-bit elements, so every lane starts from the same state.
+   if ( size == 512 )
+   {
+      for ( i = 0; i < 12; i++ )
+         sc->A[i] = _mm_set1_epi32( A_init_512[i] );
+      for ( i = 0; i < 16; i++ )
+      {
+         sc->B[i] = _mm_set1_epi32( B_init_512[i] );
+         sc->C[i] = _mm_set1_epi32( C_init_512[i] );
+      }
+   }
+   else   // any size other than 512 gets the 256-bit initial values
+   {
+      for ( i = 0; i < 12; i++ )
+         sc->A[i] = _mm_set1_epi32( A_init_256[i] );
+      for ( i = 0; i < 16; i++ )
+      {
+         sc->B[i] = _mm_set1_epi32( B_init_256[i] );
+         sc->C[i] = _mm_set1_epi32( C_init_256[i] );
+      }
+    }
+    sc->Wlow = 1;    // the counter (Whigh:Wlow) starts at 1
+    sc->Whigh = 0;
+    sc->ptr = 0;     // buffer is empty
+}
+
+static void
+shabal_4way_core( void *cc, const unsigned char *data, size_t len )   // absorb len bytes, processing each full 64-byte block
+{
+   shabal_4way_context *sc = (shabal_4way_context*)cc;
+    __m128i *buf;
+    __m128i *vdata = (__m128i*)data;
+   const int buf_size = 64;  // block size; ptr and len appear to count bytes per lane -- TODO confirm with callers
+   size_t ptr;
+   DECL_STATE
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   // Not enough data to fill the block: buffer it and return. >>2 turns a byte count into a vector index/count.
+   if ( len < (buf_size - ptr ) )
+   {
+      memcpy_128( buf + (ptr>>2), vdata, len>>2 );   // NOTE(review): a len that is not a multiple of 4 is truncated here while ptr advances by len
+      ptr += len;
+      sc->ptr = ptr;
+      return;
+   }
+   READ_STATE(sc);
+
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;   // room left in the block buffer
+      if ( clen > len )
+         clen = len;
+      memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
+
+      ptr += clen;
+      vdata += clen>>2;
+      len -= clen;
+      if ( ptr == buf_size )   // buffer full: run one compression step on it
+      {
+         DECODE_BLOCK;
+         INPUT_BLOCK_ADD;
+         XOR_W;
+         APPLY_P;
+         INPUT_BLOCK_SUB;
+         SWAP_BC;
+         INCR_W;
+         ptr = 0;
+      }
+   }
+   WRITE_STATE(sc);
+   sc->ptr = ptr;
+}
+
+static void
+shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
+                   unsigned size_words )   // pad the last block, run the final rounds, store size_words vectors of B to dst
+{
+   shabal_4way_context *sc = (shabal_4way_context*)cc;
+    __m128i *buf;
+   const int buf_size = 64;
+   size_t ptr;
+   int i;
+   unsigned z, zz;
+   DECL_STATE
+   // Padding: the top n bits of ub are kept, followed by a single 1 bit; the rest of the block is zero-filled.
+   buf = sc->buf;
+   ptr = sc->ptr;
+   z = 0x80 >> n;
+   zz = ((ub & -z) | z) & 0xFF;
+   buf[ptr>>2] = _mm_set1_epi32( zz );
+   memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
+   READ_STATE(sc);
+   DECODE_BLOCK;
+   INPUT_BLOCK_ADD;
+   XOR_W;
+   APPLY_P;
+   // Three more rounds on the same padded block; the counter is not incremented between them.
+   for ( i = 0; i < 3; i ++ )
+   {
+      SWAP_BC;
+      XOR_W;
+      APPLY_P;
+   }
+   // The result comes straight from the local B words; the context is not written back.
+   __m128i *d = (__m128i*)dst;
+   if ( size_words == 16 )   // 512
+   {
+      d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
+      d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
+      d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
+      d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
+   }
+   else    // 256
+   {
+      d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
+      d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
+   }
+}
+
+void
+shabal256_4way_init( void *cc )   // initialise cc for 256-bit output
+{
+	shabal_4way_init(cc, 256);
+}
+
+void
+shabal256_4way( void *cc, const void *data, size_t len )   // absorb len bytes of data
+{
+	shabal_4way_core( cc, data, len );
+}
+
+void
+shabal256_4way_close( void *cc, void *dst )   // finish with no extra bits; 8 vectors are written to dst
+{
+	shabal_4way_close(cc, 0, 0, dst, 8);
+}
+
+void
+shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                  void *dst )   // finish after appending the top n bits of ub
+{
+	shabal_4way_close(cc, ub, n, dst, 8);
+}
+
+void
+shabal512_4way_init(void *cc)   // initialise cc for 512-bit output
+{
+	shabal_4way_init(cc, 512);
+}
+
+void
+shabal512_4way(void *cc, const void *data, size_t len)   // absorb len bytes of data
+{
+	shabal_4way_core(cc, data, len);
+}
+
+void
+shabal512_4way_close(void *cc, void *dst)   // finish with no extra bits; 16 vectors are written to dst
+{
+	shabal_4way_close(cc, 0, 0, dst, 16);
+}
+
+void
+shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)   // finish after appending the top n bits of ub
+{
+	shabal_4way_close(cc, ub, n, dst, 16);
+}
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -0,0 +1,82 @@
+/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
+/**
+ * 4-way Shabal interface. Shabal is a family of functions which differ
+ * by their output size; this header declares the 4-way variants for
+ * output sizes 256 and 512 bits only.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     shabal-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SHABAL_HASH_4WAY_H__
+#define SHABAL_HASH_4WAY_H__ 1
+
+#ifdef __AVX2__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define SPH_SIZE_shabal256   256
+
+#define SPH_SIZE_shabal512   512
+
+typedef struct {
+	__m128i buf[16] __attribute__ ((aligned (64)));   /* buffered input block; each vector holds one 32-bit word per lane */
+	__m128i A[12], B[16], C[16];   /* presumably the Shabal A/B/C state, one word per lane -- TODO confirm */
+	sph_u32 Whigh, Wlow;   /* presumably the block counter W split in two 32-bit halves -- TODO confirm */
+        size_t ptr;   /* fill position in buf; the close function indexes buf with ptr>>2 */
+} shabal_4way_context;
+
+typedef shabal_4way_context shabal256_4way_context;
+typedef shabal_4way_context shabal512_4way_context;
+
+void shabal256_4way_init( void *cc );   /* cc points to a shabal256_4way_context */
+void shabal256_4way( void *cc, const void *data, size_t len );
+void shabal256_4way_close( void *cc, void *dst );   /* stores 8 __m128i vectors at dst */
+void shabal256_4way_addbits_and_close(	void *cc, unsigned ub, unsigned n,
+                                       void *dst );
+
+void shabal512_4way_init( void *cc );   /* cc points to a shabal512_4way_context */
+void shabal512_4way( void *cc, const void *data, size_t len );
+void shabal512_4way_close( void *cc, void *dst );   /* stores 16 __m128i vectors at dst */
+void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                       void *dst );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
+
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -0,0 +1,670 @@
+/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * SHAvite-3 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+
+#ifdef __AES__
+
+#include "sph_shavite.h"
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE
+#define SPH_SMALL_FOOTPRINT_SHAVITE   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#define C32   SPH_C32
+
+/*
+ * As of round 2 of the SHA-3 competition, the published reference
+ * implementation and test vectors are wrong, because they use
+ * big-endian AES tables while the internal decoding uses little-endian.
+ * The code below follows the specification. The original sphlib source
+ * carried a commented-out alternative (between '#define AES_BIG_ENDIAN...'
+ * and the definition of the AES_ROUND_NOKEY macro) which follows the
+ * reference implementation (the one called "BugFix" on the SHAvite-3
+ * web site, published on Nov 23rd, 2009); that alternative is not
+ * included in this file.
+ */
+
+#define AES_BIG_ENDIAN   0
+#include "algo/sha/aes_helper.c"
+
+static const sph_u32 IV512[] = {   /* initial chaining value; copied into sc->h by shavite_big_aesni_init */
+	C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
+	C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
+	C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
+	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
+};
+
+#define AES_ROUND_NOKEY(x0, x1, x2, x3)   do { /* in place: x0..x3 are read, then overwritten by AES_ROUND_NOKEY_LE */ \
+		sph_u32 t0 = (x0); \
+		sph_u32 t1 = (x1); \
+		sph_u32 t2 = (x2); \
+		sph_u32 t3 = (x3); \
+		AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
+	} while (0)
+
+  
+#define KEY_EXPAND_ELT(k0, k1, k2, k3)   do { /* AES round on (k1,k2,k3,k0), then rotate the four words left by one; not referenced elsewhere in this file */ \
+		sph_u32 kt; \
+		AES_ROUND_NOKEY(k1, k2, k3, k0); \
+		kt = (k0); \
+		(k0) = (k1); \
+		(k1) = (k2); \
+		(k2) = (k3); \
+		(k3) = kt; \
+	} while (0)
+
+
+#if SPH_SMALL_FOOTPRINT_SHAVITE
+
+/*
+ * This function assumes that "msg" is aligned for 32-bit access.
+ */
+static void
+c512(sph_shavite_big_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
+	sph_u32 rk[448];	/* expanded round keys: 14 rounds x 32 words */
+	size_t u;
+	int r, s;
+
+#if SPH_LITTLE_ENDIAN
+	memcpy(rk, msg, 128);	/* the first 32 key words are the message block itself */
+#else
+	for (u = 0; u < 32; u += 4) {
+		rk[u + 0] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  0);
+		rk[u + 1] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  4);
+		rk[u + 2] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  8);
+		rk[u + 3] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) + 12);
+	}
+#endif
+	u = 32;
+	for (;;) {	/* alternate 32 words of AES-based expansion with 32 words of XOR-only expansion */
+		for (s = 0; s < 4; s ++) {
+			sph_u32 x0, x1, x2, x3;
+
+			x0 = rk[u - 31];
+			x1 = rk[u - 30];
+			x2 = rk[u - 29];
+			x3 = rk[u - 32];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 32) {	/* the counter is mixed in at four fixed key positions: 32, 164, 316, 440 */
+				rk[ 32] ^= sc->count0;
+				rk[ 33] ^= sc->count1;
+				rk[ 34] ^= sc->count2;
+				rk[ 35] ^= SPH_T32(~sc->count3);
+			} else if (u == 440) {
+				rk[440] ^= sc->count1;
+				rk[441] ^= sc->count0;
+				rk[442] ^= sc->count3;
+				rk[443] ^= SPH_T32(~sc->count2);
+			}
+			u += 4;
+
+			x0 = rk[u - 31];
+			x1 = rk[u - 30];
+			x2 = rk[u - 29];
+			x3 = rk[u - 32];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 164) {
+				rk[164] ^= sc->count3;
+				rk[165] ^= sc->count2;
+				rk[166] ^= sc->count1;
+				rk[167] ^= SPH_T32(~sc->count0);
+			} else if (u == 316) {
+				rk[316] ^= sc->count2;
+				rk[317] ^= sc->count3;
+				rk[318] ^= sc->count0;
+				rk[319] ^= SPH_T32(~sc->count1);
+			}
+			u += 4;
+		}
+		if (u == 448)	/* rk is full */
+			break;
+		for (s = 0; s < 8; s ++) {
+			rk[u + 0] = rk[u - 32] ^ rk[u - 7];
+			rk[u + 1] = rk[u - 31] ^ rk[u - 6];
+			rk[u + 2] = rk[u - 30] ^ rk[u - 5];
+			rk[u + 3] = rk[u - 29] ^ rk[u - 4];
+			u += 4;
+		}
+	}
+
+	p0 = sc->h[0x0];
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	p8 = sc->h[0x8];
+	p9 = sc->h[0x9];
+	pA = sc->h[0xA];
+	pB = sc->h[0xB];
+	pC = sc->h[0xC];
+	pD = sc->h[0xD];
+	pE = sc->h[0xE];
+	pF = sc->h[0xF];
+	u = 0;
+	for (r = 0; r < 14; r ++) {	/* 14 rounds; each consumes 32 words of rk */
+#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3)   do { \
+		sph_u32 x0, x1, x2, x3; \
+		x0 = r0 ^ rk[u ++]; \
+		x1 = r1 ^ rk[u ++]; \
+		x2 = r2 ^ rk[u ++]; \
+		x3 = r3 ^ rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		l0 ^= x0; \
+		l1 ^= x1; \
+		l2 ^= x2; \
+		l3 ^= x3; \
+	} while (0)
+
+#define WROT(a, b, c, d)   do { \
+		sph_u32 t = d; \
+		d = c; \
+		c = b; \
+		b = a; \
+		a = t; \
+	} while (0)
+
+		C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
+		C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
+
+		WROT(p0, p4, p8, pC);
+		WROT(p1, p5, p9, pD);
+		WROT(p2, p6, pA, pE);
+		WROT(p3, p7, pB, pF);
+
+#undef C512_ELT
+#undef WROT
+	}
+	sc->h[0x0] ^= p0;	/* XOR the final state into the chaining value */
+	sc->h[0x1] ^= p1;
+	sc->h[0x2] ^= p2;
+	sc->h[0x3] ^= p3;
+	sc->h[0x4] ^= p4;
+	sc->h[0x5] ^= p5;
+	sc->h[0x6] ^= p6;
+	sc->h[0x7] ^= p7;
+	sc->h[0x8] ^= p8;
+	sc->h[0x9] ^= p9;
+	sc->h[0xA] ^= pA;
+	sc->h[0xB] ^= pB;
+	sc->h[0xC] ^= pC;
+	sc->h[0xD] ^= pD;
+	sc->h[0xE] ^= pE;
+	sc->h[0xF] ^= pF;
+}
+
+#else
+
+static void
+c512( sph_shavite_big_context *sc, const void *msg )
+{
+   __m128i p0, p1, p2, p3, x;   // p0..p3: the state, loaded from sc->h as four 128-bit quarters
+   __m128i k00, k01, k02, k03, k10, k11, k12, k13;   // the eight 128-bit round keys of the current round
+   __m128i *m = (__m128i*)msg;   // msg is dereferenced as __m128i, so it must be 16-byte aligned
+   __m128i *h = (__m128i*)sc->h;
+   int r;
+
+   p0 = h[0];
+   p1 = h[1];
+   p2 = h[2];
+   p3 = h[3];   
+
+   // round 0: the eight round keys are the message block itself
+   k00 = m[0];
+   x = _mm_xor_si128( p1, k00 );
+   x = _mm_aesenc_si128( x, mm_zero );
+  
+   k01 = m[1];
+   x = _mm_xor_si128( x, k01 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k02 = m[2];
+   x = _mm_xor_si128( x, k02 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k03 = m[3];
+   x = _mm_xor_si128( x, k03 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   p0 = _mm_xor_si128( p0, x );
+
+   k10 = m[4];
+   x = _mm_xor_si128( p3, k10 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   
+   k11 = m[5];
+   x = _mm_xor_si128( x, k11 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k12 = m[6];
+   x = _mm_xor_si128( x, k12 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k13 = m[7];
+   x = _mm_xor_si128( x, k13 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   p2 = _mm_xor_si128( p2, x );
+
+   for ( r = 0; r < 3; r ++ )   // each pass covers four rounds: AES-based and XOR-only key updates alternate
+   {
+      // round 1, 5, 9
+      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+      k00 = _mm_xor_si128( k00, k13 ); 
+
+      if ( r == 0 )   // counter injection; corresponds to rk[32..35] in the small-footprint variant
+         k00 = _mm_xor_si128( k00, _mm_set_epi32(
+                  ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); 
+
+      x = _mm_xor_si128( p0, k00 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+      k01 = _mm_xor_si128( k01, k00 );
+
+      if ( r == 1 )   // counter injection; corresponds to rk[164..167]
+         k01 = _mm_xor_si128( k01, _mm_set_epi32(
+                  ~sc->count0, sc->count1, sc->count2, sc->count3 ) );
+
+      x = _mm_xor_si128( x, k01 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+      k02 = _mm_xor_si128( k02, k01 );
+
+      x = _mm_xor_si128( x, k02 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+      k03 = _mm_xor_si128( k03, k02 );
+
+      x = _mm_xor_si128( x, k03 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p3 = _mm_xor_si128( p3, x );
+      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+      k10 = _mm_xor_si128( k10, k03 );
+
+      x = _mm_xor_si128( p2, k10 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+      k11 = _mm_xor_si128( k11, k10 );
+
+      x = _mm_xor_si128( x, k11 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+      k12 = _mm_xor_si128( k12, k11 );
+
+      x = _mm_xor_si128( x, k12 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+      k13 = _mm_xor_si128( k13, k12 );
+
+      if ( r == 2 )   // counter injection; corresponds to rk[316..319]
+         k13 = _mm_xor_si128( k13, _mm_set_epi32(
+                  ~sc->count1, sc->count0, sc->count3, sc->count2 ) );
+
+      x = _mm_xor_si128( x, k13 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p1 = _mm_xor_si128( p1, x );
+
+      // round 2, 6, 10
+
+      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
+      x = _mm_xor_si128( p3, k00 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
+      x = _mm_xor_si128( x, k01 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
+      x = _mm_xor_si128( x, k02 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
+      x = _mm_xor_si128( x, k03 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      p2 = _mm_xor_si128( p2, x );
+      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
+      x = _mm_xor_si128( p1, k10 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
+      x = _mm_xor_si128( x, k11 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
+      x = _mm_xor_si128( x, k12 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
+      x = _mm_xor_si128( x, k13 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p0 = _mm_xor_si128( p0, x );
+
+      // round 3, 7, 11
+
+      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+      k00 = _mm_xor_si128( k00, k13 );
+
+      x = _mm_xor_si128( p2, k00 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+      k01 = _mm_xor_si128( k01, k00 );
+
+      x = _mm_xor_si128( x, k01 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+      k02 = _mm_xor_si128( k02, k01 );
+
+      x = _mm_xor_si128( x, k02 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+      k03 = _mm_xor_si128( k03, k02 );
+
+      x = _mm_xor_si128( x, k03 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p1 = _mm_xor_si128( p1, x );
+      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+      k10 = _mm_xor_si128( k10, k03 );
+
+      x = _mm_xor_si128( p0, k10 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+      k11 = _mm_xor_si128( k11, k10 );
+
+      x = _mm_xor_si128( x, k11 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+      k12 = _mm_xor_si128( k12, k11 );
+
+      x = _mm_xor_si128( x, k12 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+      k13 = _mm_xor_si128( k13, k12 );
+
+      x = _mm_xor_si128( x, k13 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p3 = _mm_xor_si128( p3, x );
+
+      // round 4, 8, 12
+
+      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
+
+      x = _mm_xor_si128( p1, k00 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
+
+      x = _mm_xor_si128( x, k01 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
+
+      x = _mm_xor_si128( x, k02 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
+
+      x = _mm_xor_si128( x, k03 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p0 = _mm_xor_si128( p0, x );
+      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
+
+      x = _mm_xor_si128( p3, k10 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
+
+      x = _mm_xor_si128( x, k11 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
+
+      x = _mm_xor_si128( x, k12 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
+
+      x = _mm_xor_si128( x, k13 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p2 = _mm_xor_si128( p2, x );
+   }
+
+   // round 13
+
+   k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+   k00 = _mm_xor_si128( k00, k13 );
+
+   x = _mm_xor_si128( p0, k00 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); 
+   k01 = _mm_xor_si128( k01, k00 );
+
+   x = _mm_xor_si128( x, k01 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+   k02 = _mm_xor_si128( k02, k01 );
+
+   x = _mm_xor_si128( x, k02 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+   k03 = _mm_xor_si128( k03, k02 );
+
+   x = _mm_xor_si128( x, k03 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   p3 = _mm_xor_si128( p3, x );
+   k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+   k10 = _mm_xor_si128( k10, k03 );
+
+   x = _mm_xor_si128( p2, k10 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+   k11 = _mm_xor_si128( k11, k10 );
+
+   x = _mm_xor_si128( x, k11 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+   k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
+               ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );   // counter injection; corresponds to rk[440..443]
+
+   x = _mm_xor_si128( x, k12 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+   k13 = _mm_xor_si128( k13, k12 );
+
+   x = _mm_xor_si128( x, k13 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   p1 = _mm_xor_si128( p1, x );
+
+   h[0] = _mm_xor_si128( h[0], p2 );   // XOR the final state into the chaining value; note the quarter order p2, p3, p0, p1
+   h[1] = _mm_xor_si128( h[1], p3 );
+   h[2] = _mm_xor_si128( h[2], p0 );
+   h[3] = _mm_xor_si128( h[3], p1 );
+}
+
+#endif
+
+/*
+ * Load the initial chaining value iv; reset buffer position and counter.
+ */
+static void
+shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
+{
+   sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+   sc->ptr = 0;
+   memcpy( sc->h, iv, sizeof sc->h );
+}
+
+/*
+ * Absorb len bytes from data. Input is staged in sc->buf; each time the
+ * buffer fills, the bit counter count0..count3 advances by 1024 (with
+ * carry) and the block is compressed by c512.
+ */
+static void
+shavite_big_aesni_core( sph_shavite_big_context *sc, const void *data,
+                        size_t len )
+{
+   const unsigned char *src = (const unsigned char *)data;
+   const size_t block_len = sizeof sc->buf;
+   size_t fill = sc->ptr;
+
+   while ( len > 0 )
+   {
+      size_t take = block_len - fill;
+
+      if ( take > len )
+         take = len;
+      memcpy( sc->buf + fill, src, take );
+      src  += take;
+      fill += take;
+      len  -= take;
+      if ( fill != block_len )
+         continue;
+
+      sc->count0 = SPH_T32( sc->count0 + 1024 );
+      if ( sc->count0 == 0
+           && ( sc->count1 = SPH_T32( sc->count1 + 1 ) ) == 0
+           && ( sc->count2 = SPH_T32( sc->count2 + 1 ) ) == 0 )
+         sc->count3 = SPH_T32( sc->count3 + 1 );
+      c512( sc, sc->buf );
+      fill = 0;
+   }
+   sc->ptr = fill;
+}
+
+static void
+shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
+                         void *dst, size_t out_size_w32 )
+{
+	unsigned char *buf;
+	size_t ptr, u;
+	unsigned z;
+	sph_u32 count0, count1, count2, count3;
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	count0 = (sc->count0 += SPH_T32(ptr << 3) + n);	/* add the buffered bits (ptr*8 + n) to the bit count */
+	count1 = sc->count1;
+	count2 = sc->count2;
+	count3 = sc->count3;
+	z = 0x80 >> n;
+	z = ((ub & -z) | z) & 0xFF;	/* padding byte: top n bits of ub followed by a single 1 bit */
+	if (ptr == 0 && n == 0) {	/* final block holds no message bits: it is compressed with a zero counter */
+		buf[0] = 0x80;
+		memset(buf + 1, 0, 109);
+		sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+	} else if (ptr < 110) {	/* padding and the length fields fit in the current block */
+		buf[ptr ++] = z;
+		memset(buf + ptr, 0, 110 - ptr);
+	} else {	/* no room for the length fields: compress this block, then build a padding-only one */
+		buf[ptr ++] = z;
+		memset(buf + ptr, 0, 128 - ptr);
+		c512(sc, buf);
+		memset(buf, 0, 110);
+		sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+	}
+	sph_enc32le(buf + 110, count0);	/* saved bit count, four 32-bit words at offsets 110..125 */
+	sph_enc32le(buf + 114, count1);
+	sph_enc32le(buf + 118, count2);
+	sph_enc32le(buf + 122, count3);
+	buf[126] = (unsigned char) (out_size_w32 << 5);	/* out_size_w32 * 32, low byte then high byte */
+	buf[127] = (unsigned char) (out_size_w32 >> 3);
+	c512(sc, buf);
+	for (u = 0; u < out_size_w32; u ++)	/* emit the first out_size_w32 words of h via sph_enc32le */
+		sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
+}
+
+/* Start a SHAvite-512 computation: the context cc is seeded with IV512. */
+void sph_shavite512_aesni_init( void *cc )
+{
+   shavite_big_aesni_init( cc, IV512 );
+}
+
+/* Absorb len bytes from data into the SHAvite-512 context cc. */
+void sph_shavite512_aesni( void *cc, const void *data, size_t len )
+{
+   shavite_big_aesni_core( cc, data, len );
+}
+
+/* Finalise with no extra bits and write the 16-word (64-byte) digest to dst. */
+void sph_shavite512_aesni_close( void *cc, void *dst )
+{
+   shavite_big_aesni_close( cc, 0, 0, dst, 16 );
+}
+
+/* Append the top n bits of ub, then finalise; the 64-byte digest goes to dst. */
+void sph_shavite512_aesni_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                             void *dst )
+{
+   shavite_big_aesni_close( cc, ub, n, dst, 16 );
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/shavite/sph_shavite.c
+++ b/algo/shavite/sph_shavite.c
@@ -1731,21 +1731,21 @@ sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 /* see sph_shavite.h */
 void
-sph_shavite512_init(void *cc)
+sph_shavite512_sw_init(void *cc)	/* software variant; sph_shavite.h maps sph_shavite512_init here when __AES__ is undefined */
 {
 	shavite_big_init(cc, IV512);
 }

 /* see sph_shavite.h */
 void
-sph_shavite512(void *cc, const void *data, size_t len)
+sph_shavite512_sw(void *cc, const void *data, size_t len)	/* software variant; sph_shavite.h maps sph_shavite512 here when __AES__ is undefined */
 {
 	shavite_big_core(cc, data, len);
 }

 /* see sph_shavite.h */
 void
-sph_shavite512_close(void *cc, void *dst)
+sph_shavite512_sw_close(void *cc, void *dst)
 {
 	shavite_big_close(cc, 0, 0, dst, 16);
 //	shavite_big_init(cc, IV512);
@@ -1753,7 +1753,7 @@ sph_shavite512_close(void *cc, void *dst)

 /* see sph_shavite.h */
 void
-sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
 	shavite_big_close(cc, ub, n, dst, 16);
 //	shavite_big_init(cc, IV512);
--- a/algo/shavite/sph_shavite.h
+++ b/algo/shavite/sph_shavite.h
@@ -77,9 +77,9 @@ extern "C"{
 */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
-	unsigned char buf[64];    /* first field, for alignment */
+	unsigned char buf[64] __attribute__ ((aligned (64)));  /* presumably the input block buffer, as in the big context -- TODO confirm */
+        sph_u32 h[8] __attribute__ ((aligned (32)));  /* presumably the chaining value, as in the big context -- TODO confirm */
 	size_t ptr;
-	sph_u32 h[8];
 	sph_u32 count0, count1;
 #endif
 } sph_shavite_small_context;
@@ -108,9 +108,9 @@ typedef sph_shavite_small_context sph_shavite256_context;
 */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
-	unsigned char buf[128];    /* first field, for alignment */
+	unsigned char buf[128] __attribute__ ((aligned (64)));  /* input block; the AES-NI c512 reads it as __m128i */
+	sph_u32 h[16] __attribute__ ((aligned (32)));  /* chaining value; the AES-NI c512 accesses it as four __m128i */
 	size_t ptr;
-	sph_u32 h[16];
 	sph_u32 count0, count1, count2, count3;
 #endif
 } sph_shavite_big_context;
@@ -262,51 +262,37 @@ void sph_shavite384_close(void *cc, void *dst);
 void sph_shavite384_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

-/**
- * Initialize a SHAvite-512 context. This process performs no memory allocation.
- *
- * @param cc   the SHAvite-512 context (pointer to a
- *             <code>sph_shavite512_context</code>)
- */
-void sph_shavite512_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the SHAvite-512 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_shavite512(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current SHAvite-512 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (64 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the SHAvite-512 context
- * @param dst   the destination buffer
- */
-void sph_shavite512_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (64 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the SHAvite-512 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_shavite512_addbits_and_close(
+// The software (_sw) functions are always declared; the AES-NI (_aesni)
+// ones only when __AES__ is defined. The generic names are macros for one set.
+void sph_shavite512_sw_init(void *cc);
+void sph_shavite512_sw(void *cc, const void *data, size_t len);
+void sph_shavite512_sw_close(void *cc, void *dst);
+void sph_shavite512_sw_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);
-	
+
+#ifdef __AES__
+void sph_shavite512_aesni_init(void *cc);
+void sph_shavite512_aesni(void *cc, const void *data, size_t len);
+void sph_shavite512_aesni_close(void *cc, void *dst);
+void sph_shavite512_aesni_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+/* AES-NI build: the generic SHAvite-512 names resolve to the _aesni functions. */
+#define sph_shavite512_init  sph_shavite512_aesni_init
+#define sph_shavite512       sph_shavite512_aesni
+#define sph_shavite512_close sph_shavite512_aesni_close
+#define sph_shavite512_addbits_and_close \
+                             sph_shavite512_aesni_addbits_and_close
+
+#else   /* !__AES__ */
+/* No AES-NI: the generic SHAvite-512 names resolve to the _sw functions. */
+#define sph_shavite512_init  sph_shavite512_sw_init
+#define sph_shavite512       sph_shavite512_sw
+#define sph_shavite512_close sph_shavite512_sw_close
+#define sph_shavite512_addbits_and_close \
+                             sph_shavite512_sw_addbits_and_close
+
+#endif  /* __AES__ */
+
 #ifdef __cplusplus
 }
 #endif	
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -6,12 +6,11 @@ int64_t skein_get_max64() { return 0x7ffffLL; }

 bool register_skein_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AVX_OPT| AVX2_OPT | SHA_OPT;
+    gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
 #if defined (SKEIN_4WAY)
    gate->scanhash  = (void*)&scanhash_skein_4way;
    gate->hash      = (void*)&skeinhash_4way;
 #else
-    gate->optimizations = SSE2_OPT | SHA_OPT;
    gate->scanhash  = (void*)&scanhash_skein;
    gate->hash      = (void*)&skeinhash;
 #endif
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -342,17 +342,6 @@ do { \
 do { \
  sph_u64 t0, t1, t2; \
  __m256i h8; \
-/* can LE be assumed? \
-   dec64le does nothing when SPH_LITTLE endian is set, as it is. \
-  __m256i m0 = _mm256_dec64le( buf ); \
-  __m256i m1 = _mm256_dec64le( buf +  8*4 ); \
-  __m256i m2 = _mm256_dec64le( buf + 16*4 ); \
-  __m256i m3 = _mm256_dec64le( buf + 24*4 ); \
-  __m256i m4 = _mm256_dec64le( buf + 32*4 ); \
-  __m256i m5 = _mm256_dec64le( buf + 40*4 ); \
-  __m256i m6 = _mm256_dec64le( buf + 48*4 ); \
-  __m256i m7 = _mm256_dec64le( buf + 56*4 ); \
-*/ \
  __m256i m0 =  buf[0]; \
  __m256i m1 =  buf[1]; \
  __m256i m2 =  buf[2]; \
--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -39,7 +39,9 @@
 */

 #ifndef __SKEIN_HASH_4WAY_H__
-#define __SKEIN_HASH_4WAY_H__
+#define __SKEIN_HASH_4WAY_H__ 1
+
+#ifdef __AVX2__

 #ifdef __cplusplus
 extern "C"{
@@ -53,14 +55,15 @@ extern "C"{
 #define SPH_SIZE_skein256   256
 #define SPH_SIZE_skein512   512

-#ifdef __AVX2__
-
 typedef struct {
         __m256i buf[8] __attribute__ ((aligned (32)));
         __m256i h0, h1, h2, h3, h4, h5, h6, h7;
         size_t ptr;
  	sph_u64 bcount;
-} skein512_4way_context;
+} sph_skein_4way_big_context;   /* one layout, aliased below for both the 256- and 512-bit contexts */
+
+typedef sph_skein_4way_big_context skein512_4way_context;
+typedef sph_skein_4way_big_context skein256_4way_context;

 void skein512_4way_init(void *cc);
 void skein512_4way(void *cc, const void *data, size_t len);
@@ -68,26 +71,15 @@ void skein512_4way_close(void *cc, void *dst);
 //void sph_skein512_addbits_and_close(
 //        void *cc, unsigned ub, unsigned n, void *dst);

-#endif
-
-#ifdef __AVX__
-
-typedef struct {
-        __m128i buf[8] __attribute__ ((aligned (32)));
-        __m128i h0, h1, h2, h3, h4, h5, h6, h7;
-        size_t ptr;
-        sph_u64 bcount;
-} skein256_4way_context;
-
 void skein256_4way_init(void *cc);
 void skein256_4way(void *cc, const void *data, size_t len);
 void skein256_4way_close(void *cc, void *dst);
 //void sph_skein256_addbits_and_close(
 //	void *cc, unsigned ub, unsigned n, void *dst);

-#endif

 #ifdef __cplusplus
 }
 #endif
 #endif
+#endif
--- a/algo/sm3/sm3-hash-4way.c
+++ b/algo/sm3/sm3-hash-4way.c
@@ -0,0 +1,231 @@
+/* ====================================================================
+ * Copyright (c) 2014 - 2017 The GmSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the GmSSL Project.
+ *    (http://gmssl.org/)"
+ *
+ * 4. The name "GmSSL Project" must not be used to endorse or promote
+ *    products derived from this software without prior written
+ *    permission. For written permission, please contact
+ *    guanzhi1980@gmail.com.
+ *
+ * 5. Products derived from this software may not be called "GmSSL"
+ *    nor may "GmSSL" appear in their names without prior written
+ *    permission of the GmSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the GmSSL Project
+ *    (http://gmssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE GmSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <string.h>
+#include "sm3-hash-4way.h"
+
+#ifdef __AVX__
+
+void sm3_4way_init( sm3_4way_ctx_t *ctx )
+{
+	ctx->digest[0] = _mm_set1_epi32( 0x7380166F );
+	ctx->digest[1] = _mm_set1_epi32( 0x4914B2B9 );
+	ctx->digest[2] = _mm_set1_epi32( 0x172442D7 );
+	ctx->digest[3] = _mm_set1_epi32( 0xDA8A0600 );
+	ctx->digest[4] = _mm_set1_epi32( 0xA96F30BC );
+	ctx->digest[5] = _mm_set1_epi32( 0x163138AA );
+	ctx->digest[6] = _mm_set1_epi32( 0xE38DEE4D );
+	ctx->digest[7] = _mm_set1_epi32( 0xB0FB0E4E );
+	ctx->nblocks = 0;
+	ctx->num = 0;
+}
+
+void sm3_4way( void *cc, const void *data, size_t len )
+{
+   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
+   __m128i *block = (__m128i*)ctx->block;
+   __m128i *vdata = (__m128i*)data;
+
+   if ( ctx->num )
+   {
+      unsigned int left = SM3_BLOCK_SIZE - ctx->num;
+      if ( len < left )
+      {
+         memcpy_128( block + (ctx->num >> 2), vdata , len>>2 ); 
+         ctx->num += len;
+         return;
+      }
+      else
+      {
+         memcpy_128( block + (ctx->num >> 2), vdata , left>>2 );
+         sm3_4way_compress( ctx->digest, block );
+         ctx->nblocks++;
+         vdata += left>>2;
+         len -= left;
+      }
+   }
+   while ( len >= SM3_BLOCK_SIZE )
+   {
+      sm3_4way_compress( ctx->digest, vdata );
+      ctx->nblocks++;
+      vdata += SM3_BLOCK_SIZE>>2;
+      len -= SM3_BLOCK_SIZE;
+   }
+   ctx->num = len;
+   if ( len )
+      memcpy_128( block, vdata, len>>2 );
+}
+
+// Finish all 4 lanes: pad the buffered tail, append the bit count, emit digests.
+void sm3_4way_close( void *cc, void *dst )
+{
+   sm3_4way_ctx_t *ctx = (sm3_4way_ctx_t*)cc;
+   __m128i *hash = (__m128i*)dst;
+   __m128i *block = (__m128i*)ctx->block;
+   const uint32_t words = SM3_BLOCK_SIZE >> 2;  // 32-bit words per block
+   const uint32_t used = ctx->num >> 2;  // words buffered (num assumed 4-aligned)
+   __m128i *count = block + words - 2;           // last two words: bit count
+   int i;
+
+   // ctx->num is a byte count but block[] holds words, so pad at num/4.
+   block[used] = _mm_set1_epi32( 0x80 );
+   if ( used + 3 <= words )   // pad word and both count words fit here
+      memset_zero_128( block + used + 1, words - used - 3 );
+   else
+   {
+      // No room for the count: zero only up to the end of block[] (length in
+      // words, not bytes), flush that block, then start an empty one.
+      memset_zero_128( block + used + 1, words - used - 1 );
+      sm3_4way_compress( ctx->digest, block );
+      memset_zero_128( block, words - 2 );
+   }
+
+   count[0] = mm_byteswap_32( _mm_set1_epi32( ctx->nblocks >> 23 ) );
+   count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
+                                              ( ctx->num     << 3 ) ) );
+   sm3_4way_compress( ctx->digest, block );
+
+   for ( i = 0; i < 8 ; i++ )
+     hash[i] = mm_byteswap_32( ctx->digest[i] );
+}
+
+#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x,  9 ), \
+                                               mm_rotl_32( x, 17 ) ) ) 
+#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 15 ), \
+                                               mm_rotl_32( x, 23 ) ) ) 
+
+#define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
+#define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
+                                               _mm_and_si128( x, z ) ), \
+                                               _mm_and_si128( y, z ) )
+
+#define GG0(x,y,z) FF0(x,y,z)
+#define GG1(x,y,z) _mm_or_si128( _mm_and_si128( x, y ), \
+                                 _mm_andnot_si128( x, z ) )
+
+
+void sm3_4way_compress( __m128i *digest, __m128i *block )
+{
+   __m128i W[68], W1[64];
+   __m128i A = digest[ 0 ];
+   __m128i B = digest[ 1 ];
+   __m128i C = digest[ 2 ];
+   __m128i D = digest[ 3 ];
+   __m128i E = digest[ 4 ];
+   __m128i F = digest[ 5 ];
+   __m128i G = digest[ 6 ];
+   __m128i H = digest[ 7 ];
+   __m128i SS1, SS2, TT1, TT2, T;
+   int j;
+
+   for ( j = 0; j < 16; j++ )
+      W[j] = mm_byteswap_32( block[j] );
+
+   for ( j = 16; j < 68; j++ )
+      W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
+                                                              W[ j-9 ] ),
+                                               mm_rotl_32( W[ j-3 ], 15 ) ) ),
+                            _mm_xor_si128( mm_rotl_32( W[ j-13 ], 7 ),
+                                           W[ j-6 ] ) );
+
+   for( j = 0; j < 64; j++ )
+       W1[j] = _mm_xor_si128( W[j], W[j+4] );
+
+   T = _mm_set1_epi32( 0x79CC4519UL );
+   for( j =0; j < 16; j++ )
+   {
+      SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
+                                      mm_rotl_32( T, j ) ), 7 );
+      SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
+      TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
+                                          SS2 ), W1[j] );
+      TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
+                                          SS1 ), W[j] );
+      D = C;
+      C = mm_rotl_32( B, 9 );
+      B = A;
+      A = TT1;
+      H = G;
+      G = mm_rotl_32( F, 19 );
+      F = E;
+      E = P0( TT2 );
+   }
+
+   T = _mm_set1_epi32( 0x7A879D8AUL );
+   for( j =16; j < 64; j++ )
+   {
+      SS1 = mm_rotl_32( _mm_add_epi32( _mm_add_epi32( mm_rotl_32( A, 12 ), E ),
+                                      mm_rotl_32( T, j&31 ) ), 7 );
+      SS2 = _mm_xor_si128( SS1, mm_rotl_32( A, 12 ) );
+      TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ), 
+                                          SS2 ), W1[j] );
+      TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
+                                          SS1 ), W[j] );
+      D = C;
+      C = mm_rotl_32( B, 9 );
+      B = A;
+      A = TT1;
+      H = G;
+      G = mm_rotl_32( F, 19 );
+      F = E;
+      E = P0( TT2 );
+   }
+
+   digest[0] = _mm_xor_si128( digest[0], A );
+   digest[1] = _mm_xor_si128( digest[1], B );
+   digest[2] = _mm_xor_si128( digest[2], C );
+   digest[3] = _mm_xor_si128( digest[3], D );
+   digest[4] = _mm_xor_si128( digest[4], E );
+   digest[5] = _mm_xor_si128( digest[5], F );
+   digest[6] = _mm_xor_si128( digest[6], G );
+   digest[7] = _mm_xor_si128( digest[7], H );
+}
+
+#endif
+
--- a/algo/sm3/sm3-hash-4way.h
+++ b/algo/sm3/sm3-hash-4way.h
@@ -0,0 +1,89 @@
+/* ====================================================================
+ * Copyright (c) 2014 - 2016 The GmSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the GmSSL Project.
+ *    (http://gmssl.org/)"
+ *
+ * 4. The name "GmSSL Project" must not be used to endorse or promote
+ *    products derived from this software without prior written
+ *    permission. For written permission, please contact
+ *    guanzhi1980@gmail.com.
+ *
+ * 5. Products derived from this software may not be called "GmSSL"
+ *    nor may "GmSSL" appear in their names without prior written
+ *    permission of the GmSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the GmSSL Project
+ *    (http://gmssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE GmSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#ifndef SPH_SM3_HASH_4WAY_H
+#define SPH_SM3_HASH_4WAY_H
+
+#define SM3_DIGEST_LENGTH	32
+#define SM3_BLOCK_SIZE		64
+#define SM3_CBLOCK		(SM3_BLOCK_SIZE)
+#define SM3_HMAC_SIZE		(SM3_DIGEST_LENGTH)
+
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <string.h>
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct {
+   __m128i block[16] __attribute__ ((aligned (64)));
+   __m128i digest[8];
+   uint32_t nblocks;
+   uint32_t num;
+} sm3_4way_ctx_t;
+
+void sm3_4way_init( sm3_4way_ctx_t *ctx );
+//void sm3_4way_update( sm3_4way_ctx_t *ctx, const unsigned char* data,
+//                      size_t data_len );
+//void sm3_4way_final( sm3_4way_ctx_t *ctx,
+//                      unsigned char digest[SM3_DIGEST_LENGTH] );
+void sm3_4way_compress( __m128i *digest, __m128i *block );
+
+void sm3_4way(void *cc, const void *data, size_t len);
+void sm3_4way_close(void *cc, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/algo/sm3/sm3.c
+++ b/algo/sm3/sm3.c
@@ -189,7 +189,7 @@ void sm3_compress(uint32_t digest[8], const unsigned char block[64])
 	for(j =16; j < 64; j++) {

 		T[j] = 0x7A879D8A;
-		SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
+		SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j&31)), 7);
 		SS2 = SS1 ^ ROTATELEFT(A,12);
 		TT1 = FF1(A,B,C) + D + SS2 + W1[j];
 		TT2 = GG1(E,F,G) + H + SS1 + W[j];
--- a/algo/whirlpool/md_helper.c
+++ b/algo/whirlpool/md_helper.c
@@ -252,8 +252,8 @@ SPH_XCAT(HASH, _addbits_and_close)(void *cc,
 	current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
 #endif

-uint64_t *b= (uint64_t*)sc->buf;
-uint64_t *s= (uint64_t*)sc->state;
+//uint64_t *b= (uint64_t*)sc->buf;
+//uint64_t *s= (uint64_t*)sc->state;
 // printf("Sptr 1= %u\n",current);   
 // printf("SBuf %016llx %016llx %016llx %016llx\n", b[0], b[1], b[2], b[3] );
 // printf("SBuf %016llx %016llx %016llx %016llx\n", b[4], b[5], b[6], b[7] );
--- a/algo/whirlpool/sph_whirlpool.c
+++ b/algo/whirlpool/sph_whirlpool.c
@@ -3468,9 +3468,10 @@ sph_ ## name ## _close(void *cc, void *dst) \
 	for (i = 0; i < 8; i ++) \
 		sph_enc64le((unsigned char *)dst + 8 * i, sc->state[i]); \
 }
-//	sph_ ## name ## _init(cc); \
-//}
-
+/*
+	sph_ ## name ## _init(cc); \
+}
+*/
 MAKE_CLOSE(whirlpool)
 MAKE_CLOSE(whirlpool0)
 MAKE_CLOSE(whirlpool1)
--- a/algo/whirlpool/whirlpool-4way.c
+++ b/algo/whirlpool/whirlpool-4way.c
@@ -1,4 +1,7 @@
 #include "whirlpool-gate.h"
+
+#if defined(__AVX2__)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -6,8 +9,6 @@
 #include "sph_whirlpool.h"
 #include "whirlpool-hash-4way.h"

-#if defined(__AVX2__)
-
 static __thread whirlpool_4way_context whirl_mid;

 void whirlpool_hash_4way( void *state, const void *input )
@@ -50,7 +51,7 @@ void whirlpool_hash_4way( void *state, const void *input )
 }

 int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
-                             unsigned long *hashes_done )
+                             uint64_t *hashes_done )
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
@@ -67,8 +68,8 @@ int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
   uint32_t *noncep2 = vdata + 77;
   uint32_t *noncep3 = vdata + 79;

-//   if (opt_benchmark)
-//      ((uint32_t*)ptarget)[7] = 0x0000ff;
+   if (opt_benchmark)
+      ((uint32_t*)ptarget)[7] = 0x0000ff;

    for (int i=0; i < 19; i++)
      be32enc(&endiandata[i], pdata[i]);
--- a/algo/whirlpool/whirlpool-gate.c
+++ b/algo/whirlpool/whirlpool-gate.c
@@ -2,14 +2,16 @@

 bool register_whirlpool_algo( algo_gate_t* gate )
 {
-//#if defined (WHIRLPOOL_4WAY)
-//  gate->scanhash  = (void*)&scanhash_whirlpool_4way;
-//  gate->hash      = (void*)&whirlpool_hash_4way;
-//#else
+#if defined (WHIRLPOOL_4WAY)
+  four_way_not_tested();
+  gate->optimizations = FOUR_WAY_OPT;
+  gate->scanhash  = (void*)&scanhash_whirlpool_4way;
+  gate->hash      = (void*)&whirlpool_hash_4way;
+#else
  gate->scanhash  = (void*)&scanhash_whirlpool;
  gate->hash      = (void*)&whirlpool_hash;
  init_whirlpool_ctx();
-//#endif
+#endif
  return true;
 };

--- a/algo/whirlpool/whirlpool-gate.h
+++ b/algo/whirlpool/whirlpool-gate.h
@@ -4,21 +4,25 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

+/*
 #if defined(FOUR_WAY) && defined(__AVX2__)
  #define WHIRLPOOL_4WAY
 #endif
+*/

-//#if defined (WHIRLPOOL_4WAY) 
+#if defined (WHIRLPOOL_4WAY) 

-//void whirlpool_hash_4way(void *state, const void *input);
+void whirlpool_hash_4way(void *state, const void *input);

-//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
-//                              uint64_t *hashes_done );
-//#endif
+int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                              uint64_t *hashes_done );
+#else

 void whirlpool_hash( void *state, const void *input );

 int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
                        uint64_t *hashes_done );
+void init_whirlpool_ctx();
 #endif

+#endif
--- a/algo/whirlpool/whirlpool-hash-4way.c
+++ b/algo/whirlpool/whirlpool-hash-4way.c
@@ -3345,8 +3345,12 @@ do { \
 #define READ_STATE     MUL8(READ_STATE_W)
 #define ROUND0         MUL8(ROUND0_W)
 #define UPDATE_STATE   MUL8(UPDATE_STATE_W)
+/*
 #define BYTE(x, n) \
   _mm256_and_si256( _mm256_srli_epi64( x, n<<3 ), _mm256_set1_epi64x( 0xFF ) )
+*/
+#define BYTE(x, n)     ((unsigned)((x) >> (8 * (n))) & 0xFF)
+

 // A very complex, but structured, expression with a mix of scalar
 // and vector operations to retrieve specific 64 bit constants from
@@ -3357,21 +3361,51 @@ do { \
 // Extract 64 bit vector elements from "in" representing offsets. Unmask the
 // low byte of each and scale for use as vector indexes.
 // Pack the data in a vector and return it.
+
+/*
 #define t_row( inv, row ) \
   _mm256_and_si256( \
        _mm256_srli_epi64( inv, row << 3 ), _mm256_set1_epi64x( 0xFF ) )
-
-// Extract vector element from "lane" of vector "in[row]" and use it to index
-// scalar array of constants "table" and return referenced 64 bit entry.
-#define t_lane( table, inv, row, lane ) \
-   table[ _mm256_extract_epi64( t_row( inv, row ), lane ) ]
+*/

 // Build a vector from elements of non-contiguous 64 bit data extracted from
 // scalar "table".
+// reference scalar version 1480 kH/s
+/*
+// version 1, extract with gather
+// 955 kH/s
+#define t_lane( inv, row, lane ) \
+    BYTE( _mm256_extract_epi64( inv, lane ), row ) \
+
+
 #define t_vec( table, inv, row ) \
-    _mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
-                t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
-                t_lane( table, inv, row, 0 ) )
+   _mm256_i32gather_epi64( table, _mm_set_epi32( t_lane( inv, row, 3 ), \
+                              t_lane( inv, row, 2 ), t_lane( inv, row, 1 ), \
+                              t_lane( inv, row, 0) ), 1 )
+*/
+/*
+// version 2, extract with set
+// 1100 kH/s 
+#define t_lane( table, inv, row, lane ) \
+   table[ BYTE( _mm256_extract_epi64( inv, lane ), row ) ] \
+
+#define t_vec( table, inv, row ) \
+   _mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
+                 t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
+                 t_lane( table, inv, row, 0 ) )
+*/
+
+// version 3, vector indexing with set
+// 1105 kH/s
+#define t_lane( table, inv, row, lane ) \
+   table[ BYTE( inv[ lane ], row ) ] \
+
+#define t_vec( table, inv, row ) \
+   _mm256_set_epi64x( t_lane( table, inv, row, 3 ), \
+                 t_lane( table, inv, row, 2 ), t_lane( table, inv, row, 1 ), \
+                 t_lane( table, inv, row, 0 ) )
+
+
 
 #if SPH_SMALL_FOOTPRINT_WHIRLPOOL

--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -0,0 +1,252 @@
+#include "cpuminer-config.h"
+#include "c11-gate.h"
+
+#if defined (__AVX2__) && defined (__AES__)
+
+#include <string.h>
+#include <stdint.h>
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/sse2/nist.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+typedef struct {
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;    
+    keccak512_4way_context  keccak;    
+    hashState_luffa         luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    hashState_sd            simd;
+    hashState_echo          echo;
+} c11_4way_ctx_holder;
+
+c11_4way_ctx_holder c11_4way_ctx;
+
+void init_c11_4way_ctx()
+{
+     blake512_4way_init( &c11_4way_ctx.blake );
+     bmw512_4way_init( &c11_4way_ctx.bmw );
+     init_groestl( &c11_4way_ctx.groestl, 64 );
+     skein512_4way_init( &c11_4way_ctx.skein );
+     jh512_4way_init( &c11_4way_ctx.jh );
+     keccak512_4way_init( &c11_4way_ctx.keccak );
+     init_luffa( &c11_4way_ctx.luffa, 512 );
+     cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &c11_4way_ctx.shavite );
+     init_sd( &c11_4way_ctx.simd, 512 );
+     init_echo( &c11_4way_ctx.echo, 512 );
+}
+
+void c11_4way_hash( void *state, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     c11_4way_ctx_holder ctx;
+     memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) );
+
+     // 1 Blake 4way
+     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_close( &ctx.blake, vhash );
+
+     // 2 Bmw
+     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_close( &ctx.bmw, vhash );
+
+     // Serial
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // 3 Groestl
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &c11_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+     // 4way
+     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     // 4 JH
+     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_close( &ctx.jh, vhash );
+
+     // 5 Keccak
+     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_close( &ctx.keccak, vhash );
+
+     // 6 Skein
+     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_close( &ctx.skein, vhash );
+
+     // Serial
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // 7 Luffa
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                             (const BitSequence*)hash0, 64 );
+     memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                             (const BitSequence*)hash1, 64 );
+     memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                             (const BitSequence*)hash2, 64 );
+     memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                             (const BitSequence*)hash3, 64 );
+
+     // 8 Cubehash
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
+     memcpy( &ctx.cube, &c11_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+
+     // 9 Shavite
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &c11_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+
+     // 10 Simd
+     update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                      (const BitSequence *)hash0, 512 );
+     memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                      (const BitSequence *)hash1, 512 );
+     memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                      (const BitSequence *)hash2, 512 );
+     memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                      (const BitSequence *)hash3, 512 );
+
+     // 11 Echo
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &c11_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+
+     memcpy( state,    hash0, 32 );
+     memcpy( state+32, hash1, 32 );
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+
+int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+     const uint32_t Htarg = ptarget[7];
+     uint64_t htmax[] = {          0,        0xF,       0xFF,
+                               0xFFF,     0xFFFF, 0x10000000  };
+     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                          0xFFFFF000, 0xFFFF0000,          0  };
+
+     // big endian encode 0..18 uint32_t, 64 bits at a time
+     swab32_array( endiandata, pdata, 20 );
+
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+     for (int m=0; m < 6; m++) 
+       if (Htarg <= htmax[m])
+       {
+         uint32_t mask = masks[m];
+         do
+         {
+            found[0] = found[1] = found[2] = found[3] = false;
+            be32enc( noncep0, n   );
+            be32enc( noncep1, n+1 );
+            be32enc( noncep2, n+2 );
+            be32enc( noncep3, n+3 );
+
+            c11_4way_hash( hash, vdata );
+            pdata[19] = n;
+
+            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            {
+               found[0] = true;
+               num_found++;
+               nonces[0] = n;
+               work_set_target_ratio( work, hash );
+            }
+            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
+            {
+               found[1] = true;
+               num_found++;
+               nonces[1] = n+1;
+               work_set_target_ratio( work, hash+8 );
+            }
+            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
+            {
+               found[2] = true;
+               num_found++;
+               nonces[2] = n+2;
+               work_set_target_ratio( work, hash+16 );
+            }
+            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
+            {
+               found[3] = true;
+               num_found++;
+               nonces[3] = n+3;
+               work_set_target_ratio( work, hash+24 );
+            }
+            n += 4;
+         } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+         break;
+       }
+
+     *hashes_done = n - first_nonce + 1;
+     return num_found;
+}
+
+#endif
--- a/algo/x11/c11-gate.c
+++ b/algo/x11/c11-gate.c
@@ -0,0 +1,18 @@
+#include "c11-gate.h"
+
+// Select the 4-way or scalar C11 scanhash/hash pair and initialise its contexts.
+bool register_c11_algo( algo_gate_t* gate )
+{
+#if defined (C11_4WAY)
+  init_c11_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_c11_4way;
+  gate->hash      = (void*)&c11_4way_hash;
+#else
+  init_c11_ctx();
+  gate->scanhash  = (void*)&scanhash_c11;
+  gate->hash      = (void*)&c11_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->get_max64 = (void*)&get_max64_0x3ffff;
+  return true;
+}
--- a/algo/x11/c11-gate.h
+++ b/algo/x11/c11-gate.h
@@ -0,0 +1,32 @@
+#ifndef C11_GATE_H__
+#define C11_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define C11_4WAY
+#endif
+
+bool register_c11_algo( algo_gate_t* gate );
+
+#if defined(C11_4WAY)
+
+void c11_4way_hash( void *state, const void *input );
+
+int scanhash_c11_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_c11_4way_ctx();
+
+#endif
+
+void c11_hash( void *state, const void *input );
+
+int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_c11_ctx();
+
+#endif
+
--- a/algo/x11/c11.c
+++ b/algo/x11/c11.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "c11-gate.h"

 #include <stdlib.h>
 #include <stdint.h>
@@ -64,7 +64,7 @@ void init_c11_ctx()
 #endif
 }

-void c11hash( void *output, const void *input )
+void c11_hash( void *output, const void *input )
 {
        unsigned char hash[128] _ALIGN(64); // uint32_t hashA[16], hashB[16];
 //	uint32_t _ALIGN(64) hash[16];
@@ -157,12 +157,13 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
 	do
        {
 		be32enc( &endiandata[19], nonce );
-		c11hash( hash, endiandata );
+		c11_hash( hash, endiandata );
 		if ( hash[7] <= Htarg && fulltest(hash, ptarget) )
                {
 			pdata[19] = nonce;
 			*hashes_done = pdata[19] - first_nonce;
-			return 1;
+                        work_set_target_ratio( work, hash );
+ 			return 1;
 		}
 		nonce++;
 	} while ( nonce < max_nonce && !(*restart) );
@@ -171,13 +172,3 @@ int scanhash_c11( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-bool register_c11_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  init_c11_ctx();
-  gate->scanhash  = (void*)&scanhash_c11;
-  gate->hash      = (void*)&c11hash;
-  gate->get_max64 = (void*)&get_max64_0x3ffff;
-  return true;
-};
-
--- a/algo/x11/fresh.c
+++ b/algo/x11/fresh.c
--- a/algo/x11/timetravel-4way.c
+++ b/algo/x11/timetravel-4way.c
@@ -0,0 +1,274 @@
+#include "timetravel-gate.h"
+
+#if defined(__AVX2__) && defined(__AES__)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
+
+typedef struct {   // one context per hash function used by the 8-function timetravel chain
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;   // run once per lane; reinit_groestl() is called between lanes
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    hashState_luffa         luffa;     // run once per lane; restored from tt8_4way_ctx between lanes
+    cubehashParam           cube;      // run once per lane; restored from tt8_4way_ctx between lanes
+} tt8_4way_ctx_holder;
+
+tt8_4way_ctx_holder tt8_4way_ctx __attribute__ ((aligned (64)));   // filled by init_tt8_4way_ctx(), copied at the start of every timetravel_4way_hash() call
+
+// Populate the shared tt8_4way_ctx template with a freshly initialised
+// context for each of the eight hash functions in the chain.
+void init_tt8_4way_ctx()
+{
+    tt8_4way_ctx_holder *tmpl = &tt8_4way_ctx;
+
+    blake512_4way_init( &tmpl->blake );
+    bmw512_4way_init( &tmpl->bmw );
+    init_groestl( &tmpl->groestl, 64 );
+    skein512_4way_init( &tmpl->skein );
+    jh512_4way_init( &tmpl->jh );
+    keccak512_4way_init( &tmpl->keccak );
+    init_luffa( &tmpl->luffa, 512 );
+    cubehashInit( &tmpl->cube, 512, 16, 32 );
+}
+
+// Hash four block headers at once through the 8-function timetravel chain.
+//
+// input  : four 80-byte headers in 4x64-bit interleaved form.
+// output : four consecutive 32-byte digests, lane 0 first (128 bytes).
+//
+// The order of the functions is taken from the thread-local permutation[]
+// array, which the scanhash function derives from the header timestamp.
+// Blake, BMW, Skein, JH and Keccak work directly on interleaved data; Groestl,
+// Luffa and CubeHash are run lane by lane on deinterleaved buffers.
+void timetravel_4way_hash(void *output, const void *input)
+{
+   // Per-lane buffers. When a lane-by-lane function runs in round 0 these
+   // receive the full 80-byte header, so 64 bytes (one digest) is not enough.
+   // NOTE(review): sized to 128 bytes to leave headroom in case
+   // mm256_deinterleave_4x64 writes whole 256-bit vectors -- confirm.
+   uint64_t hash0[16] __attribute__ ((aligned (64)));
+   uint64_t hash1[16] __attribute__ ((aligned (64)));
+   uint64_t hash2[16] __attribute__ ((aligned (64)));
+   uint64_t hash3[16] __attribute__ ((aligned (64)));
+   // Ping-pong buffers for interleaved 64-byte intermediate digests.
+   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
+   uint64_t *vhashA, *vhashB;
+   tt8_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+   uint32_t dataLen = 64;
+   const int last = TT8_FUNC_COUNT - 1;   // index of the final round
+   const int digest_bits = 512;           // every function emits 64 bytes
+   int i;
+
+   memcpy( &ctx, &tt8_4way_ctx, sizeof(tt8_4way_ctx) );
+
+   for ( i = 0; i < TT8_FUNC_COUNT; i++ )
+   {
+      // Round 0 reads the 80-byte header; later rounds read the previous
+      // 64-byte digest, alternating between the two interleaved buffers.
+      if (i == 0)
+      {
+         dataLen = 80;
+         vhashA = (uint64_t*)input;
+         vhashB = vhashX;
+      }
+      else
+      {
+         dataLen = 64;
+         if ( i % 2 == 0 )
+         {
+           vhashA = vhashY;
+           vhashB = vhashX;
+         }
+         else
+         {
+           vhashA = vhashX;
+           vhashB = vhashY;
+         }
+      }
+      const uint32_t in_bits = dataLen << 3;
+
+      switch ( permutation[i] )
+      {
+        case 0:
+           blake512_4way( &ctx.blake, vhashA, dataLen );
+           blake512_4way_close( &ctx.blake, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 1:
+           bmw512_4way( &ctx.bmw, vhashA, dataLen );
+           bmw512_4way_close( &ctx.bmw, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 2:
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                    vhashA, in_bits );
+           update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                   (char*)hash0, in_bits );
+           reinit_groestl( &ctx.groestl );
+           update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                   (char*)hash1, in_bits );
+           reinit_groestl( &ctx.groestl );
+           update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                   (char*)hash2, in_bits );
+           reinit_groestl( &ctx.groestl );
+           update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                   (char*)hash3, in_bits );
+           // Re-interleave the 64-byte digests, not the input length: using
+           // 640 bits in round 0 would overrun the 256-byte vhash buffer.
+           if ( i != last )
+              mm256_interleave_4x64( vhashB,
+                                     hash0, hash1, hash2, hash3, digest_bits );
+        break;
+        case 3:
+           skein512_4way( &ctx.skein, vhashA, dataLen );
+           skein512_4way_close( &ctx.skein, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 4:
+           jh512_4way( &ctx.jh, vhashA, dataLen );
+           jh512_4way_close( &ctx.jh, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 5:
+           keccak512_4way( &ctx.keccak, vhashA, dataLen );
+           keccak512_4way_close( &ctx.keccak, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 6:
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                    vhashA, in_bits );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                                        (const BitSequence *)hash0, dataLen );
+           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                                         (const BitSequence*)hash1, dataLen );
+           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                                         (const BitSequence*)hash2, dataLen );
+           memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                                         (const BitSequence*)hash3, dataLen );
+           if ( i != last )
+              mm256_interleave_4x64( vhashB,
+                                     hash0, hash1, hash2, hash3, digest_bits );
+        break;
+        case 7:
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                    vhashA, in_bits );
+           cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
+                                      (const byte*)hash0, dataLen );
+           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
+           cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
+                                      (const byte*)hash1, dataLen );
+           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
+           cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
+                                      (const byte*)hash2, dataLen );
+           memcpy( &ctx.cube, &tt8_4way_ctx.cube, sizeof(cubehashParam) );
+           cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
+                                      (const byte*)hash3, dataLen );
+           if ( i != last )
+              mm256_interleave_4x64( vhashB,
+                                     hash0, hash1, hash2, hash3, digest_bits );
+        break;
+        default:
+           applog(LOG_ERR,"SWERR: timetravel invalid permutation");
+        break;
+      }
+   }
+
+   // Only the first 32 bytes of each lane's digest are returned. A byte
+   // pointer is used because arithmetic on void* is not standard C.
+   uint8_t *out = (uint8_t*)output;
+   memcpy( out,      hash0, 32 );
+   memcpy( out + 32, hash1, 32 );
+   memcpy( out + 64, hash2, 32 );
+   memcpy( out + 96, hash3, 32 );
+}
+
+// Scan nonces four at a time, starting at work->data[19], until at least one
+// lane produces a hash meeting work->target, n reaches max_nonce, or a
+// restart is signalled for this thread.
+//
+// On return work->nfound[lane] / work->nonces[lane] describe the last batch,
+// *hashes_done holds the hash count estimate, and the return value is the
+// number of lanes in the last batch that met the target (0 if none).
+int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                              uint64_t *hashes_done )
+{
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   // In the 4x64 interleaved layout the nonce (32-bit word 19 of a header)
+   // of lane L is at 32-bit index 73 + 2*L, i.e. 9*8 + 1 for lane 0.
+   uint32_t *noncep = vdata + 73;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   int i;
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   // Read the quick-reject threshold only after the benchmark override so it
+   // always agrees with the target that fulltest() checks against.
+   const uint32_t Htarg = ptarget[7];
+
+   for ( int k = 0; k < 19; k++ )
+      be32enc( &endiandata[k], pdata[k] );
+
+   // Recompute the function order only when the header timestamp changes.
+   const uint32_t timestamp = endiandata[17];
+   if ( timestamp != s_ntime )
+   {
+      const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
+                    % TT8_FUNC_COUNT_PERMUTATIONS;
+      for ( i = 0; i < TT8_FUNC_COUNT; i++ )
+         permutation[i] = i;
+      for ( i = 0; i < steps; i++ )
+         tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
+      s_ntime = timestamp;
+   }
+
+   // All four lanes start from the same header; only the nonce differs.
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do
+   {
+      for ( int lane = 0; lane < 4; lane++ )
+      {
+         found[lane] = false;
+         be32enc( noncep + 2*lane, n + lane );
+      }
+
+      timetravel_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int lane = 0; lane < 4; lane++ )
+      {
+         uint32_t *lane_hash = hash + 8*lane;
+         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+         {
+            found[lane] = true;
+            num_found++;
+            nonces[lane] = n + lane;
+            work_set_target_ratio( work, lane_hash );
+         }
+      }
+      n += 4;
+   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/x11/timetravel-gate.c
+++ b/algo/x11/timetravel-gate.c
@@ -0,0 +1,78 @@
+#include "timetravel-gate.h"
+
+// Set the work target from the job difficulty, scaled down by
+// 256 * opt_diff_factor.
+void tt8_set_target( struct work* work, double job_diff )
+{
+   const double divisor = 256.0 * opt_diff_factor;
+
+   work_set_target( work, job_diff / divisor );
+}
+
+// Install the timetravel entry points in the algo gate. The 4-way scanhash
+// and hash functions are used when TIMETRAVEL_4WAY is defined, the scalar
+// ones otherwise. Always returns true.
+bool register_timetravel_algo( algo_gate_t* gate )
+{
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->set_target = (void*)&tt8_set_target;
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+
+#if defined(TIMETRAVEL_4WAY)
+  init_tt8_4way_ctx();
+  gate->scanhash   = (void*)&scanhash_timetravel_4way;
+  gate->hash       = (void*)&timetravel_4way_hash;
+#else
+  init_tt8_ctx();
+  gate->scanhash   = (void*)&scanhash_timetravel;
+  gate->hash       = (void*)&timetravel_hash;
+#endif
+
+  return true;
+}
+
+// Exchange the two ints pointed to by a and b.
+//
+// File-local: a plain (non-static) `inline` definition provides no external
+// definition under C99 rules, which breaks the link whenever a call is not
+// inlined.
+static inline void tt_swap( int *a, int *b )
+{
+        int c = *a;
+        *a = *b;
+        *b = c;
+}
+
+// Reverse the half-open range [pbegin, pend) in place.
+//
+// File-local so it cannot clash with the identically named helper in
+// timetravel10-gate.c.
+static inline void reverse( int *pbegin, int *pend )
+{
+   while ( (pbegin != pend) && (pbegin != --pend) )
+   {
+      tt_swap( pbegin, pend );
+      pbegin++;
+   }
+}
+
+// Advance the int range [pbegin, pend) to its next permutation in
+// lexicographic order, in place. When the range is already the last
+// permutation it is reversed, wrapping around to the first one.
+// Ranges with fewer than two elements are left untouched.
+void tt8_next_permutation( int *pbegin, int *pend )
+{
+   if ( pbegin == pend || pbegin + 1 == pend )
+      return;
+
+   int *left = pend - 1;
+
+   for (;;)
+   {
+      int *right = left;
+      left--;
+
+      if ( *left < *right )
+      {
+         // Find the rightmost element greater than *left, swap it in, then
+         // put the tail back into ascending order.
+         int *bigger = pend - 1;
+         while ( *bigger <= *left )
+            bigger--;
+
+         tt_swap( left, bigger );
+         reverse( right, pend );
+         return;
+      }
+
+      if ( left == pbegin )
+      {
+         // Whole range is descending: wrap around.
+         reverse( pbegin, pend );
+         return;
+      }
+   }
+}
+
--- a/algo/x11/timetravel-gate.h
+++ b/algo/x11/timetravel-gate.h
@@ -0,0 +1,40 @@
+#ifndef TIMETRAVEL_GATE_H__
+#define TIMETRAVEL_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)   // NOTE(review): timetravel-4way.c is guarded by __AVX2__ && __AES__ instead -- confirm HASH_4WAY implies AVX2
+  #define TIMETRAVEL_4WAY
+#endif
+
+// Machinecoin Genesis Timestamp
+#define TT8_FUNC_BASE_TIMESTAMP 1389040865
+
+#define TT8_FUNC_COUNT 8   // number of chained hash functions
+#define TT8_FUNC_COUNT_PERMUTATIONS 40320   // 8! orderings; the timestamp offset is reduced modulo this
+
+void tt8_next_permutation( int *pbegin, int *pend );   // steps [pbegin, pend) to the next lexicographic permutation
+
+bool register_timetravel_algo( algo_gate_t* gate );   // installs the scalar or 4-way entry points in the gate
+
+#if defined(TIMETRAVEL_4WAY)
+
+void timetravel_4way_hash( void *state, const void *input );   // input: 4x64 interleaved headers; state: four 32-byte digests
+
+int scanhash_timetravel_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );   // returns the number of lanes that met the target
+
+void init_tt8_4way_ctx();   // must run before the 4-way hash; called by register_timetravel_algo()
+
+#endif
+
+void timetravel_hash( void *state, const void *input );
+
+int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_tt8_ctx();   // called by register_timetravel_algo() when the 4-way path is not built
+
+#endif
+
--- a/algo/x11/timetravel.c
+++ b/algo/x11/timetravel.c
@@ -1,11 +1,9 @@
-#include "algo-gate-api.h"
+#include "timetravel-gate.h"

 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "avxdefs.h"
-
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
@@ -13,75 +11,14 @@
 #include "algo/skein/sph_skein.h"
 #include "algo/luffa/sse2/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
-
 #ifdef NO_AES_NI
  #include "algo/groestl/sph_groestl.h"
 #else
  #include "algo/groestl/aes_ni/hash-groestl.h"
 #endif

-// Machinecoin Genesis Timestamp
-#define HASH_FUNC_BASE_TIMESTAMP 1389040865
-
-#define HASH_FUNC_COUNT 8
-#define HASH_FUNC_COUNT_PERMUTATIONS 40320
-
 static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
-
-inline void tt_swap( int *a, int *b )
-{
-	int c = *a;
-	*a = *b;
-	*b = c;
-}
-
-inline void reverse( int *pbegin, int *pend )
-{
-   while ( (pbegin != pend) && (pbegin != --pend) )
-   {
-      tt_swap( pbegin, pend );
-      pbegin++;
-   }
-}
-
-static void next_permutation( int *pbegin, int *pend )
-{
-   if ( pbegin == pend )
-	return;
-
-   int *i = pbegin;
-   ++i;
-   if ( i == pend )
-	return;
-
-   i = pend;
-   --i;
-
-   while (1)
-   {
-	int *j = i;
-	--i;
-
-	if ( *i < *j )
-        {
-           int *k = pend;
-
-	   while ( !(*i < *--k) ) /* do nothing */ ;
-
-	   tt_swap( i, k );
-	   reverse(j, pend);
-		return; // true
-	}
-
-	if ( i == pbegin )
-        {
-	   reverse(pbegin, pend);
-	   return; // false
-	}
-        // else?
-   }
-}
+static __thread int permutation[TT8_FUNC_COUNT] = { 0 };

 typedef struct {
        sph_blake512_context    blake;
@@ -101,7 +38,7 @@ typedef struct {
 tt_ctx_holder tt_ctx __attribute__ ((aligned (64)));
 __thread tt_ctx_holder tt_mid __attribute__ ((aligned (64)));

-void init_tt_ctx()
+void init_tt8_ctx()
 {
        sph_blake512_init( &tt_ctx.blake );
        sph_bmw512_init( &tt_ctx.bmw );
@@ -119,7 +56,7 @@ void init_tt_ctx()

 void timetravel_hash(void *output, const void *input)
 {
-   uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
+   uint32_t hash[ 16 * TT8_FUNC_COUNT ] __attribute__ ((aligned (64)));
   uint32_t *hashA, *hashB;
   tt_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
@@ -130,7 +67,7 @@ void timetravel_hash(void *output, const void *input)

   memcpy( &ctx, &tt_ctx, sizeof(tt_ctx) );

-   for ( i = 0; i < HASH_FUNC_COUNT; i++ )
+   for ( i = 0; i < TT8_FUNC_COUNT; i++ )
   {
        if (i == 0)
        {
@@ -270,7 +207,7 @@ void timetravel_hash(void *output, const void *input)
    }
  }

-	memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
+	memcpy(output, &hash[16 * (TT8_FUNC_COUNT - 1)], 32);
 }

 int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
@@ -296,12 +233,12 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t timestamp = endiandata[17];
   if ( timestamp != s_ntime )
   {
-      const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
-                    % HASH_FUNC_COUNT_PERMUTATIONS;
-      for ( i = 0; i < HASH_FUNC_COUNT; i++ )
+      const int steps = ( timestamp - TT8_FUNC_BASE_TIMESTAMP )
+                    % TT8_FUNC_COUNT_PERMUTATIONS;
+      for ( i = 0; i < TT8_FUNC_COUNT; i++ )
         permutation[i] = i;
      for ( i = 0; i < steps; i++ )
-         next_permutation( permutation, permutation + HASH_FUNC_COUNT );
+         tt8_next_permutation( permutation, permutation + TT8_FUNC_COUNT );
      s_ntime = timestamp;

      // do midstate precalc for first function
@@ -359,6 +296,7 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
              work_set_target_ratio( work, hash );
              pdata[19] = nonce;
              *hashes_done = pdata[19] - first_nonce;
+              work_set_target_ratio( work, hash );
              return 1;
         }
         nonce++;
@@ -370,19 +308,4 @@ int scanhash_timetravel( int thr_id, struct work *work, uint32_t max_nonce,
  return 0;
 }

-void timetravel_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool register_timetravel_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  init_tt_ctx();
-  gate->scanhash   = (void*)&scanhash_timetravel;
-  gate->hash       = (void*)&timetravel_hash;
-  gate->set_target = (void*)&timetravel_set_target;
-  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  return true;
-};

--- a/algo/x11/timetravel10-4way.c
+++ b/algo/x11/timetravel10-4way.c
@@ -0,0 +1,316 @@
+#include "timetravel10-gate.h"
+
+#if defined(__AVX2__) && defined(__AES__)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/sse2/nist.h"
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread int permutation[TT10_FUNC_COUNT] = { 0 };
+
+typedef struct {   // one context per hash function used by the 10-function timetravel chain
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;   // run once per lane; reinit_groestl() is called between lanes
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    hashState_luffa         luffa;     // run once per lane; restored from tt10_4way_ctx between lanes
+    cubehashParam           cube;      // run once per lane; restored from tt10_4way_ctx between lanes
+    sph_shavite512_context  shavite;   // run once per lane; restored from tt10_4way_ctx between lanes
+    hashState_sd            simd;      // run once per lane; restored from tt10_4way_ctx between lanes
+} tt10_4way_ctx_holder;
+
+tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64)));   // filled by init_tt10_4way_ctx(), copied at the start of every timetravel10_4way_hash() call
+
+// Populate the shared tt10_4way_ctx template with a freshly initialised
+// context for each of the ten hash functions in the chain.
+void init_tt10_4way_ctx()
+{
+    tt10_4way_ctx_holder *tmpl = &tt10_4way_ctx;
+
+    blake512_4way_init( &tmpl->blake );
+    bmw512_4way_init( &tmpl->bmw );
+    init_groestl( &tmpl->groestl, 64 );
+    skein512_4way_init( &tmpl->skein );
+    jh512_4way_init( &tmpl->jh );
+    keccak512_4way_init( &tmpl->keccak );
+    init_luffa( &tmpl->luffa, 512 );
+    cubehashInit( &tmpl->cube, 512, 16, 32 );
+    sph_shavite512_init( &tmpl->shavite );
+    init_sd( &tmpl->simd, 512 );
+}
+
+// Hash four block headers at once through the 10-function timetravel chain.
+//
+// input  : four 80-byte headers in 4x64-bit interleaved form.
+// output : four consecutive 32-byte digests, lane 0 first (128 bytes).
+//
+// The order of the functions is taken from the thread-local permutation[]
+// array, which the scanhash function derives from the header timestamp.
+// Blake, BMW, Skein, JH and Keccak work directly on interleaved data; Groestl,
+// Luffa, CubeHash, Shavite and SIMD are run lane by lane on deinterleaved
+// buffers.
+void timetravel10_4way_hash(void *output, const void *input)
+{
+   // Per-lane buffers. When a lane-by-lane function runs in round 0 these
+   // receive the full 80-byte header, so 64 bytes (one digest) is not enough.
+   // NOTE(review): sized to 128 bytes to leave headroom in case
+   // mm256_deinterleave_4x64 writes whole 256-bit vectors -- confirm.
+   uint64_t hash0[16] __attribute__ ((aligned (64)));
+   uint64_t hash1[16] __attribute__ ((aligned (64)));
+   uint64_t hash2[16] __attribute__ ((aligned (64)));
+   uint64_t hash3[16] __attribute__ ((aligned (64)));
+   // Ping-pong buffers for interleaved 64-byte intermediate digests.
+   uint64_t vhashX[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhashY[8*4] __attribute__ ((aligned (64)));
+   uint64_t *vhashA, *vhashB;
+   tt10_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+   uint32_t dataLen = 64;
+   const int last = TT10_FUNC_COUNT - 1;  // index of the final round
+   const int digest_bits = 512;           // every function emits 64 bytes
+   int i;
+
+   memcpy( &ctx, &tt10_4way_ctx, sizeof(tt10_4way_ctx) );
+
+   for ( i = 0; i < TT10_FUNC_COUNT; i++ )
+   {
+      // Round 0 reads the 80-byte header; later rounds read the previous
+      // 64-byte digest, alternating between the two interleaved buffers.
+      if (i == 0)
+      {
+         dataLen = 80;
+         vhashA = (uint64_t*)input;
+         vhashB = vhashX;
+      }
+      else
+      {
+         dataLen = 64;
+         if ( i % 2 == 0 )
+         {
+           vhashA = vhashY;
+           vhashB = vhashX;
+         }
+         else
+         {
+           vhashA = vhashX;
+           vhashB = vhashY;
+         }
+      }
+      const uint32_t in_bits = dataLen << 3;
+
+      switch ( permutation[i] )
+      {
+        case 0:
+           blake512_4way( &ctx.blake, vhashA, dataLen );
+           blake512_4way_close( &ctx.blake, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 1:
+           bmw512_4way( &ctx.bmw, vhashA, dataLen );
+           bmw512_4way_close( &ctx.bmw, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 2:
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                    vhashA, in_bits );
+           update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                   (char*)hash0, in_bits );
+           reinit_groestl( &ctx.groestl );
+           update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                   (char*)hash1, in_bits );
+           reinit_groestl( &ctx.groestl );
+           update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                   (char*)hash2, in_bits );
+           reinit_groestl( &ctx.groestl );
+           update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                   (char*)hash3, in_bits );
+           // Re-interleave the 64-byte digests, not the input length: using
+           // 640 bits in round 0 would overrun the 256-byte vhash buffer.
+           if ( i != last )
+              mm256_interleave_4x64( vhashB,
+                                     hash0, hash1, hash2, hash3, digest_bits );
+        break;
+        case 3:
+           skein512_4way( &ctx.skein, vhashA, dataLen );
+           skein512_4way_close( &ctx.skein, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 4:
+           jh512_4way( &ctx.jh, vhashA, dataLen );
+           jh512_4way_close( &ctx.jh, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 5:
+           keccak512_4way( &ctx.keccak, vhashA, dataLen );
+           keccak512_4way_close( &ctx.keccak, vhashB );
+           if ( i == last )
+              mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                       vhashB, digest_bits );
+        break;
+        case 6:
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                    vhashA, in_bits );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                                        (const BitSequence *)hash0, dataLen );
+           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                                         (const BitSequence*)hash1, dataLen );
+           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                                         (const BitSequence*)hash2, dataLen );
+           memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                                         (const BitSequence*)hash3, dataLen );
+           if ( i != last )
+              mm256_interleave_4x64( vhashB,
+                                     hash0, hash1, hash2, hash3, digest_bits );
+        break;
+        case 7:
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                    vhashA, in_bits );
+           cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
+                                      (const byte*)hash0, dataLen );
+           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
+           cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
+                                      (const byte*)hash1, dataLen );
+           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
+           cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
+                                      (const byte*)hash2, dataLen );
+           memcpy( &ctx.cube, &tt10_4way_ctx.cube, sizeof(cubehashParam) );
+           cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
+                                      (const byte*)hash3, dataLen );
+           if ( i != last )
+              mm256_interleave_4x64( vhashB,
+                                     hash0, hash1, hash2, hash3, digest_bits );
+        break;
+        case 8:
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                    vhashA, in_bits );
+           sph_shavite512( &ctx.shavite, hash0, dataLen );
+           sph_shavite512_close( &ctx.shavite, hash0 );
+           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
+           sph_shavite512( &ctx.shavite, hash1, dataLen );
+           sph_shavite512_close( &ctx.shavite, hash1 );
+           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
+           sph_shavite512( &ctx.shavite, hash2, dataLen );
+           sph_shavite512_close( &ctx.shavite, hash2 );
+           memcpy( &ctx.shavite, &tt10_4way_ctx.shavite, sizeof ctx.shavite );
+           sph_shavite512( &ctx.shavite, hash3, dataLen );
+           sph_shavite512_close( &ctx.shavite, hash3 );
+           if ( i != last )
+              mm256_interleave_4x64( vhashB,
+                                     hash0, hash1, hash2, hash3, digest_bits );
+        break;
+        case 9:
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                    vhashA, in_bits );
+           update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                            (const BitSequence *)hash0, in_bits );
+           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
+           update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                            (const BitSequence *)hash1, in_bits );
+           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
+           update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                            (const BitSequence *)hash2, in_bits );
+           memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd );
+           update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                            (const BitSequence *)hash3, in_bits );
+           if ( i != last )
+              mm256_interleave_4x64( vhashB,
+                                     hash0, hash1, hash2, hash3, digest_bits );
+        break;
+        default:
+           applog(LOG_ERR,"SWERR: timetravel invalid permutation");
+        break;
+      }
+   }
+
+   // Only the first 32 bytes of each lane's digest are returned. A byte
+   // pointer is used because arithmetic on void* is not standard C.
+   uint8_t *out = (uint8_t*)output;
+   memcpy( out,      hash0, 32 );
+   memcpy( out + 32, hash1, 32 );
+   memcpy( out + 64, hash2, 32 );
+   memcpy( out + 96, hash3, 32 );
+}
+
+// Scan nonces four at a time, starting at work->data[19], until at least one
+// lane produces a hash meeting work->target, n reaches max_nonce, or a
+// restart is signalled for this thread.
+//
+// On return work->nfound[lane] / work->nonces[lane] describe the last batch,
+// *hashes_done holds the hash count estimate, and the return value is the
+// number of lanes in the last batch that met the target (0 if none).
+int scanhash_timetravel10_4way( int thr_id, struct work *work,
+                                uint32_t max_nonce, uint64_t *hashes_done )
+{
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   // In the 4x64 interleaved layout the nonce (32-bit word 19 of a header)
+   // of lane L is at 32-bit index 73 + 2*L, i.e. 9*8 + 1 for lane 0.
+   uint32_t *noncep = vdata + 73;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+   int i;
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   // Read the quick-reject threshold only after the benchmark override so it
+   // always agrees with the target that fulltest() checks against.
+   const uint32_t Htarg = ptarget[7];
+
+   for ( int k = 0; k < 19; k++ )
+      be32enc( &endiandata[k], pdata[k] );
+
+   // Recompute the function order only when the header timestamp changes.
+   const uint32_t timestamp = endiandata[17];
+   if ( timestamp != s_ntime )
+   {
+      const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
+                    % TT10_FUNC_COUNT_PERMUTATIONS;
+      for ( i = 0; i < TT10_FUNC_COUNT; i++ )
+         permutation[i] = i;
+      for ( i = 0; i < steps; i++ )
+         tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
+      s_ntime = timestamp;
+   }
+
+   // All four lanes start from the same header; only the nonce differs.
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do
+   {
+      for ( int lane = 0; lane < 4; lane++ )
+      {
+         found[lane] = false;
+         be32enc( noncep + 2*lane, n + lane );
+      }
+
+      timetravel10_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int lane = 0; lane < 4; lane++ )
+      {
+         uint32_t *lane_hash = hash + 8*lane;
+         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+         {
+            found[lane] = true;
+            num_found++;
+            nonces[lane] = n + lane;
+            work_set_target_ratio( work, lane_hash );
+         }
+      }
+      n += 4;
+   } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) );
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/x11/timetravel10-gate.c
+++ b/algo/x11/timetravel10-gate.c
@@ -0,0 +1,78 @@
+#include "timetravel10-gate.h"
+
+// Set the work target from the job difficulty, scaled down by
+// 256 * opt_diff_factor.
+void tt10_set_target( struct work* work, double job_diff )
+{
+   const double divisor = 256.0 * opt_diff_factor;
+
+   work_set_target( work, job_diff / divisor );
+}
+
+// Install the timetravel10 entry points in the algo gate. The 4-way scanhash
+// and hash functions are used when TIMETRAVEL10_4WAY is defined, the scalar
+// ones otherwise. Always returns true.
+bool register_timetravel10_algo( algo_gate_t* gate )
+{
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->set_target = (void*)&tt10_set_target;
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+
+#if defined(TIMETRAVEL10_4WAY)
+  init_tt10_4way_ctx();
+  gate->scanhash   = (void*)&scanhash_timetravel10_4way;
+  gate->hash       = (void*)&timetravel10_4way_hash;
+#else
+  init_tt10_ctx();
+  gate->scanhash   = (void*)&scanhash_timetravel10;
+  gate->hash       = (void*)&timetravel10_hash;
+#endif
+
+  return true;
+}
+
+// Exchange the two ints pointed to by a and b.
+//
+// File-local: a plain (non-static) `inline` definition provides no external
+// definition under C99 rules, which breaks the link whenever a call is not
+// inlined.
+static inline void tt10_swap( int *a, int *b )
+{
+        int c = *a;
+        *a = *b;
+        *b = c;
+}
+
+// Reverse the half-open range [pbegin, pend) in place.
+//
+// File-local so it cannot clash with the identically named helper in
+// timetravel-gate.c.
+static inline void reverse( int *pbegin, int *pend )
+{
+   while ( (pbegin != pend) && (pbegin != --pend) )
+   {
+      tt10_swap( pbegin, pend );
+      pbegin++;
+   }
+}
+
+// Advance the int range [pbegin, pend) to its next permutation in
+// lexicographic order, in place. When the range is already the last
+// permutation it is reversed, wrapping around to the first one.
+// Ranges with fewer than two elements are left untouched.
+void tt10_next_permutation( int *pbegin, int *pend )
+{
+   if ( pbegin == pend || pbegin + 1 == pend )
+      return;
+
+   int *left = pend - 1;
+
+   for (;;)
+   {
+      int *right = left;
+      left--;
+
+      if ( *left < *right )
+      {
+         // Find the rightmost element greater than *left, swap it in, then
+         // put the tail back into ascending order.
+         int *bigger = pend - 1;
+         while ( *bigger <= *left )
+            bigger--;
+
+         tt10_swap( left, bigger );
+         reverse( right, pend );
+         return;
+      }
+
+      if ( left == pbegin )
+      {
+         // Whole range is descending: wrap around.
+         reverse( pbegin, pend );
+         return;
+      }
+   }
+}
+
--- a/algo/x11/timetravel10-gate.h
+++ b/algo/x11/timetravel10-gate.h
@@ -0,0 +1,39 @@
+#ifndef TIMETRAVEL10_GATE_H__
+#define TIMETRAVEL10_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define TIMETRAVEL10_4WAY
+#endif
+
+// BitCore Genesis Timestamp
+#define TT10_FUNC_BASE_TIMESTAMP 1492973331U
+#define TT10_FUNC_COUNT 10
+#define TT10_FUNC_COUNT_PERMUTATIONS 40320
+
+void tt10_next_permutation( int *pbegin, int *pend );
+
+bool register_timetravel10_algo( algo_gate_t* gate );
+
+#if defined(TIMETRAVEL10_4WAY)
+
+void timetravel10_4way_hash( void *state, const void *input );
+
+int scanhash_timetravel10_4way( int thr_id, struct work *work,
+                                uint32_t max_nonce, uint64_t *hashes_done );
+
+void init_tt10_4way_ctx();
+
+#endif
+
+void timetravel10_hash( void *state, const void *input );
+
+int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_tt10_ctx();
+
+#endif
+
--- a/algo/x11/timetravel10.c
+++ b/algo/x11/timetravel10.c
@@ -1,11 +1,8 @@
-#include "algo-gate-api.h"
-
+#include "timetravel10-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "avxdefs.h"
-
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/jh/sph_jh.h"
@@ -22,68 +19,8 @@
  #include "algo/groestl/aes_ni/hash-groestl.h"
 #endif

-// BitCore Genesis Timestamp
-#define HASH_FUNC_BASE_TIMESTAMP 1492973331U
-
-#define HASH_FUNC_COUNT 10
-#define HASH_FUNC_COUNT_PERMUTATIONS 40320
-
 static __thread uint32_t s_ntime = UINT32_MAX;
-static __thread int permutation[HASH_FUNC_COUNT] = { 0 };
-
-inline void tt10_swap( int *a, int *b )
-{
-	int c = *a;
-	*a = *b;
-	*b = c;
-}
-
-inline void reverse( int *pbegin, int *pend )
-{
-   while ( (pbegin != pend) && (pbegin != --pend) )
-   {
-      tt10_swap( pbegin, pend );
-      pbegin++;
-   }
-}
-
-static void next_permutation( int *pbegin, int *pend )
-{
-   if ( pbegin == pend )
-	return;
-
-   int *i = pbegin;
-   ++i;
-   if ( i == pend )
-	return;
-
-   i = pend;
-   --i;
-
-   while (1)
-   {
-	int *j = i;
-	--i;
-
-	if ( *i < *j )
-        {
-           int *k = pend;
-
-	   while ( !(*i < *--k) ) /* do nothing */ ;
-
-	   tt10_swap( i, k );
-	   reverse(j, pend);
-		return; // true
-	}
-
-	if ( i == pbegin )
-        {
-	   reverse(pbegin, pend);
-	   return; // false
-	}
-        // else?
-   }
-}
+static __thread int permutation[TT10_FUNC_COUNT] = { 0 };

 typedef struct {
        sph_blake512_context    blake;
@@ -125,7 +62,7 @@ void init_tt10_ctx()

 void timetravel10_hash(void *output, const void *input)
 {
-   uint32_t hash[ 16 * HASH_FUNC_COUNT ] __attribute__ ((aligned (64)));
+   uint32_t hash[ 16 * TT10_FUNC_COUNT ] __attribute__ ((aligned (64)));
   uint32_t *hashA, *hashB;
   tt10_ctx_holder ctx __attribute__ ((aligned (64)));
   uint32_t dataLen = 64;
@@ -136,7 +73,7 @@ void timetravel10_hash(void *output, const void *input)

   memcpy( &ctx, &tt10_ctx, sizeof(tt10_ctx) );

-   for ( i = 0; i < HASH_FUNC_COUNT; i++ )
+   for ( i = 0; i < TT10_FUNC_COUNT; i++ )
   {
        if (i == 0)
        {
@@ -302,7 +239,7 @@ void timetravel10_hash(void *output, const void *input)
    }
  }

-	memcpy(output, &hash[16 * (HASH_FUNC_COUNT - 1)], 32);
+	memcpy(output, &hash[16 * (TT10_FUNC_COUNT - 1)], 32);
 }

 int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
@@ -328,12 +265,12 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t timestamp = endiandata[17];
   if ( timestamp != s_ntime )
   {
-      const int steps = ( timestamp - HASH_FUNC_BASE_TIMESTAMP )
-                    % HASH_FUNC_COUNT_PERMUTATIONS;
-      for ( i = 0; i < HASH_FUNC_COUNT; i++ )
+      const int steps = ( timestamp - TT10_FUNC_BASE_TIMESTAMP )
+                    % TT10_FUNC_COUNT_PERMUTATIONS;
+      for ( i = 0; i < TT10_FUNC_COUNT; i++ )
         permutation[i] = i;
      for ( i = 0; i < steps; i++ )
-         next_permutation( permutation, permutation + HASH_FUNC_COUNT );
+         tt10_next_permutation( permutation, permutation + TT10_FUNC_COUNT );
      s_ntime = timestamp;

      // do midstate precalc for first function
@@ -398,6 +335,7 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
        {
              work_set_target_ratio( work, hash );
              pdata[19] = nonce;
+              work_set_target_ratio( work, hash );
              *hashes_done = pdata[19] - first_nonce;
              return 1;
         }
@@ -409,20 +347,3 @@ int scanhash_timetravel10( int thr_id, struct work *work, uint32_t max_nonce,
        *hashes_done = pdata[19] - first_nonce + 1;
  return 0;
 }
-
-void timetravel10_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool register_timetravel10_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  init_tt10_ctx();
-  gate->scanhash   = (void*)&scanhash_timetravel10;
-  gate->hash       = (void*)&timetravel10_hash;
-  gate->set_target = (void*)&timetravel10_set_target;
-  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  return true;
-};
-
--- a/algo/tribus/tribus-4way.c
+++ b/algo/tribus/tribus-4way.c
@@ -10,8 +10,14 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/echo/aes_ni/hash_api.h"

+//hashState_echo tribus_4way_ctx __attribute__ ((aligned (64)));
 static __thread jh512_4way_context ctx_mid;
-
+/*
+void init_tribus_4way_ctx()
+{
+     init_echo( &tribus_4way_ctx, 512 );
+}
+*/
 void tribus_hash_4way(void *state, const void *input)
 {
     uint64_t hash0[8] __attribute__ ((aligned (64)));
--- a/algo/tribus/tribus-gate.c
+++ b/algo/tribus/tribus-gate.c
@@ -1,22 +1,11 @@
 #include "tribus-gate.h"
-/*
-bool tribus_thread_init()
-{
-   sph_jh512_init( &tribus_ctx.jh );
-   sph_keccak512_init( &tribus_ctx.keccak );
-#ifdef NO_AES_NI
-   sph_echo512_init( &tribus_ctx.echo );
-#else
-   init_echo( &tribus_ctx.echo, 512 );
-#endif
-  return true;
-}
-*/
+
 bool register_tribus_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->get_max64     = (void*)&get_max64_0x1ffff;
 #if defined (TRIBUS_4WAY)
+//  init_tribus_4way_ctx();
  gate->scanhash      = (void*)&scanhash_tribus_4way;
  gate->hash          = (void*)&tribus_hash_4way;
 #else
--- a/algo/tribus/tribus-gate.h
+++ b/algo/tribus/tribus-gate.h
@@ -4,12 +4,14 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+#if defined(HASH_4WAY) && defined(__AES__)
  #define TRIBUS_4WAY
 #endif

 #if defined(TRIBUS_4WAY)

+//void init_tribus_4way_ctx();
+
 void tribus_hash_4way( void *state, const void *input );

 int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,
--- a/algo/tribus/tribus.c
+++ b/algo/tribus/tribus.c
--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -0,0 +1,252 @@
+#include "cpuminer-config.h"
+#include "x11-gate.h"
+
+#if defined (__AVX2__) && defined (__AES__)
+
+#include <string.h>
+#include <stdint.h>
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/sse2/nist.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+typedef struct {   // one context per X11 stage, in the order x11_4way_hash runs them
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    hashState_luffa         luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    hashState_sd            simd;
+    hashState_echo          echo;
+} x11_4way_ctx_holder;
+// Initialised contexts: filled by init_x11_4way_ctx(), copied by x11_4way_hash().
+x11_4way_ctx_holder x11_4way_ctx;
+
+void init_x11_4way_ctx()
+{    // Initialise every stage context in the global x11_4way_ctx template.
+     blake512_4way_init( &x11_4way_ctx.blake );
+     bmw512_4way_init( &x11_4way_ctx.bmw );
+     init_groestl( &x11_4way_ctx.groestl, 64 );  // NOTE(review): 64 here vs 512 elsewhere - presumably bytes vs bits; confirm
+     skein512_4way_init( &x11_4way_ctx.skein );
+     jh512_4way_init( &x11_4way_ctx.jh );
+     keccak512_4way_init( &x11_4way_ctx.keccak );
+     init_luffa( &x11_4way_ctx.luffa, 512 );
+     cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &x11_4way_ctx.shavite );
+     init_sd( &x11_4way_ctx.simd, 512 );
+     init_echo( &x11_4way_ctx.echo, 512 );
+}
+
+void x11_4way_hash( void *state, const void *input )
+{    // X11 over four lanes: input is the 4-way interleaved 80-byte data, state gets 4 x 32 bytes.
+     uint64_t hash0[8] __attribute__ ((aligned (64)));   // per-lane 64-byte state, lane 0
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64))); // the four lanes interleaved
+     x11_4way_ctx_holder ctx;
+     memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); // work on a copy of the initialised template
+
+     // 1 Blake, 4-way: the only stage that reads the 80-byte input
+     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_close( &ctx.blake, vhash );
+
+     // 2 Bmw, 4-way
+     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_close( &ctx.bmw, vhash );
+
+     // Split the lanes apart: the next stage runs one lane at a time
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // 3 Groestl, per lane; the context is restored from the template between lanes
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x11_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+     // Re-interleave the lanes for the next three 4-way stages
+     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     // 4 Skein, 4-way
+     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_close( &ctx.skein, vhash );
+
+     // 5 JH, 4-way
+     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_close( &ctx.jh, vhash );
+
+     // 6 Keccak, 4-way
+     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_close( &ctx.keccak, vhash );
+
+     // Split the lanes again: every remaining stage runs one lane at a time
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // 7 Luffa, per lane, in place
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                             (const BitSequence*)hash0, 64 );
+     memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                             (const BitSequence*)hash1, 64 );
+     memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                             (const BitSequence*)hash2, 64 );
+     memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                             (const BitSequence*)hash3, 64 );
+
+     // 8 Cubehash, per lane, in place
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
+     memcpy( &ctx.cube, &x11_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+
+     // 9 Shavite, per lane, in place
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x11_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+
+     // 10 Simd, per lane, in place
+     update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                      (const BitSequence *)hash0, 512 );
+     memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                      (const BitSequence *)hash1, 512 );
+     memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                      (const BitSequence *)hash2, 512 );
+     memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                      (const BitSequence *)hash3, 512 );
+
+     // 11 Echo, per lane, in place
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x11_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     // Output: the first 32 bytes of each lane, lanes 0..3 at offsets 0/32/64/96
+     memcpy( state,    hash0, 32 );
+     memcpy( state+32, hash1, 32 );
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+
+int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done )
+{    // Scan nonces four per pass; returns how many of the four lanes met the target.
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));    // 4 results of 8 words each
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));  // 4-way interleaved input
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1: lane 0 nonce word inside vdata
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+     const uint32_t Htarg = ptarget[7];
+     uint64_t htmax[] = {          0,        0xF,       0xFF,
+                               0xFFF,     0xFFFF, 0x10000000  };
+     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                          0xFFFFF000, 0xFFFF0000,          0  };
+     // masks[m] is the hash[7] pre-filter applied when Htarg <= htmax[m].
+     // endian-convert all 20 words of pdata into endiandata (swab32_array)
+     swab32_array( endiandata, pdata, 20 );
+     // replicate the same 640-bit block into all four lanes of vdata
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+     // NOTE(review): if Htarg > 0x10000000 no htmax entry matches and nothing is hashed.
+     for (int m=0; m < 6; m++)
+       if (Htarg <= htmax[m])
+       {
+         uint32_t mask = masks[m];
+         do
+         {
+            found[0] = found[1] = found[2] = found[3] = false;
+            be32enc( noncep0, n   );
+            be32enc( noncep1, n+1 );
+            be32enc( noncep2, n+2 );
+            be32enc( noncep3, n+3 );
+
+            x11_4way_hash( hash, vdata );
+            pdata[19] = n;
+            // lane k's result is hash + 8*k; its nonce is n + k
+            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            {
+               found[0] = true;
+               num_found++;
+               nonces[0] = n;
+               work_set_target_ratio( work, hash );
+            }
+            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
+            {
+               found[1] = true;
+               num_found++;
+               nonces[1] = n+1;
+               work_set_target_ratio( work, hash+8 );
+            }
+            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
+            {
+               found[2] = true;
+               num_found++;
+               nonces[2] = n+2;
+               work_set_target_ratio( work, hash+16 );
+            }
+            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
+            {
+               found[3] = true;
+               num_found++;
+               nonces[3] = n+3;
+               work_set_target_ratio( work, hash+24 );
+            }
+            n += 4;
+         } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+         break;   // only the first matching htmax entry is used
+       }
+     // NOTE(review): n - first_nonce already equals the nonces tried; the +1 looks like an overcount.
+     *hashes_done = n - first_nonce + 1;
+     return num_found;
+}
+
+#endif
--- a/algo/x11/x11-gate.c
+++ b/algo/x11/x11-gate.c
@@ -0,0 +1,18 @@
+#include "x11-gate.h"
+
+bool register_x11_algo( algo_gate_t* gate )
+{  // Wire the X11 entry points into the gate; always returns true.
+#ifdef X11_4WAY
+  init_x11_4way_ctx();
+  gate->hash          = (void*)&x11_4way_hash;
+  gate->scanhash      = (void*)&scanhash_x11_4way;
+#else
+  init_x11_ctx();
+  gate->hash          = (void*)&x11_hash;
+  gate->scanhash      = (void*)&scanhash_x11;
+#endif
+  gate->get_max64     = (void*)&get_max64_0x3ffff;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  return true;
+}
+
--- a/algo/x11/x11-gate.h
+++ b/algo/x11/x11-gate.h
@@ -0,0 +1,32 @@
+#ifndef X11_GATE_H__
+#define X11_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define X11_4WAY
+#endif
+
+bool register_x11_algo( algo_gate_t* gate );
+
+#if defined(X11_4WAY)
+
+void x11_4way_hash( void *state, const void *input );
+
+int scanhash_x11_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_x11_4way_ctx();
+
+#endif
+
+void x11_hash( void *state, const void *input );
+
+int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_x11_ctx();
+
+#endif
+
--- a/algo/x11/x11.c
+++ b/algo/x11/x11.c
@@ -1,5 +1,5 @@
 #include "cpuminer-config.h"
-#include "algo-gate-api.h"
+#include "x11-gate.h"

 #include <string.h>
 #include <stdint.h>
@@ -61,7 +61,7 @@ void init_x11_ctx()
 #endif
 }

-static void x11_hash( void *state, const void *input )
+void x11_hash( void *state, const void *input )
 {
     unsigned char hash[128] __attribute__ ((aligned (32)));
     unsigned char hashbuf[128] __attribute__ ((aligned (16)));
@@ -179,6 +179,7 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
                 if ( fulltest( hash64, ptarget ) )
                 {
                    *hashes_done = n - first_nonce + 1;
+                    work_set_target_ratio( work, hash64 );
                    return true;
                 }
              }
@@ -189,14 +190,3 @@ int scanhash_x11( int thr_id, struct work *work, uint32_t max_nonce,
        pdata[19] = n;
        return 0;
 }
-
-bool register_x11_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  init_x11_ctx();
-  gate->scanhash  = (void*)&scanhash_x11;
-  gate->hash      = (void*)&x11_hash;
-  gate->get_max64 = (void*)&get_max64_0x3ffff;
-  return true;
-};
-
--- a/algo/x11/x11evo-4way.c
+++ b/algo/x11/x11evo-4way.c
@@ -0,0 +1,340 @@
+#include "cpuminer-config.h"
+#include "x11evo-gate.h"
+
+#if defined(__AVX2__) && defined(__AES__)
+
+#include <string.h>
+#include <stdint.h>
+#include <compat/portable_endian.h>
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/luffa/sph_luffa.h"
+#include "algo/cubehash/sph_cubehash.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/sph_simd.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/simd/sse2/nist.h"
+
+typedef struct {   // one context per hash function used by x11evo_4way_hash
+    blake512_4way_context   blake;
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;
+    keccak512_4way_context  keccak;
+    hashState_luffa         luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    hashState_sd            simd;
+    hashState_echo          echo;
+} x11evo_4way_ctx_holder;
+// Initialised contexts: filled by init_x11evo_4way_ctx(), copied by x11evo_4way_hash().
+static x11evo_4way_ctx_holder x11evo_4way_ctx __attribute__ ((aligned (64)));
+
+void init_x11evo_4way_ctx()
+{    // Initialise every context in the file-scope x11evo_4way_ctx template.
+     blake512_4way_init( &x11evo_4way_ctx.blake );
+     bmw512_4way_init( &x11evo_4way_ctx.bmw );
+     init_groestl( &x11evo_4way_ctx.groestl, 64 );  // NOTE(review): 64 here vs 512 elsewhere - presumably bytes vs bits; confirm
+     skein512_4way_init( &x11evo_4way_ctx.skein );
+     jh512_4way_init( &x11evo_4way_ctx.jh );
+     keccak512_4way_init( &x11evo_4way_ctx.keccak );
+     init_luffa( &x11evo_4way_ctx.luffa, 512 );
+     cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 );
+     sph_shavite512_init( &x11evo_4way_ctx.shavite );
+     init_sd( &x11evo_4way_ctx.simd, 512 );
+     init_echo( &x11evo_4way_ctx.echo, 512 );
+}
+
+static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
+static __thread uint32_t s_ntime = UINT32_MAX;
+
+void x11evo_4way_hash( void *state, const void *input )
+{  // Run the functions named in hashOrder over four lanes; state gets 4 x 32 bytes.
+   uint32_t hash0[16] __attribute__ ((aligned (64)));   // per-lane 64-byte state, lane 0
+   uint32_t hash1[16] __attribute__ ((aligned (64)));
+   uint32_t hash2[16] __attribute__ ((aligned (64)));
+   uint32_t hash3[16] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*4] __attribute__ ((aligned (64))); // the four lanes interleaved
+   x11evo_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+   memcpy( &ctx, &x11evo_4way_ctx, sizeof(x11evo_4way_ctx) );
+   // s_seq == -1 means no order has been generated yet: derive it from word 17 of input.
+   if ( s_seq == -1 )
+   {
+       uint32_t *data = (uint32_t*) input;
+       const uint32_t ntime = data[17];
+       evo_twisted_code( ntime, hashOrder );
+    }
+   // hashOrder holds one character per stage: '0'-'9' map to 0-9, 'A' and up to 10+.
+   int i;
+   int len = strlen( hashOrder );
+   for ( i = 0; i < len; i++ )
+   {
+      char elem = hashOrder[i];
+      uint8_t idx;
+      if ( elem >= 'A' )
+         idx = elem - 'A' + 10;
+      else
+         idx = elem - '0';
+      // 4-way stages (0,1,3,4,5) work on vhash; the others split the lanes,
+      // hash each lane in place, then re-interleave unless this is the last stage.
+      // NOTE(review): only case 0 reads input, so this relies on blake coming first.
+      switch ( idx )
+      {
+         case 0:   // blake, 4-way, reads the 80-byte input
+            blake512_4way( &ctx.blake, input, 80 );
+            blake512_4way_close( &ctx.blake, vhash );
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                        vhash, 64<<3 );
+         break;
+         case 1:   // bmw, 4-way
+            bmw512_4way( &ctx.bmw, vhash, 64 );
+            bmw512_4way_close( &ctx.bmw, vhash );
+            if ( i >= len-1 )   // last stage: leave the result de-interleaved
+               mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                        vhash, 64<<3 );
+         break;
+         case 2:   // groestl, per lane
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhash, 64<<3 );
+            update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                                    (char*)hash0, 512 );
+            reinit_groestl( &ctx.groestl );
+            update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                                    (char*)hash1, 512 );
+            reinit_groestl( &ctx.groestl );
+            update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                                      (char*)hash2, 512 );
+            reinit_groestl( &ctx.groestl );
+            update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                                      (char*)hash3, 512 );
+            if ( i < len-1 )   // more stages follow: re-interleave
+               mm256_interleave_4x64( vhash,
+                                      hash0, hash1, hash2, hash3, 64<<3 );
+         break;
+         case 3:   // skein, 4-way
+            skein512_4way( &ctx.skein, vhash, 64 );
+            skein512_4way_close( &ctx.skein, vhash );
+            if ( i >= len-1 )
+               mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                        vhash, 64<<3 );
+         break;
+         case 4:   // jh, 4-way
+            jh512_4way( &ctx.jh, vhash, 64 );
+            jh512_4way_close( &ctx.jh, vhash );
+            if ( i >= len-1 )
+               mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                        vhash, 64<<3 );
+         break;
+         case 5:   // keccak, 4-way
+            keccak512_4way( &ctx.keccak, vhash, 64 );
+            keccak512_4way_close( &ctx.keccak, vhash );
+            if ( i >= len-1 )
+               mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                        vhash, 64<<3 );
+         break;
+         case 6:   // luffa, per lane; context restored from the template between lanes
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhash, 64<<3 );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                                          (const BitSequence*)hash0, 64 );
+            memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
+                    sizeof(hashState_luffa) );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                                          (const BitSequence*)hash1, 64 );
+            memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
+                    sizeof(hashState_luffa) );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                                          (const BitSequence*)hash2, 64 );
+            memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa,
+                    sizeof(hashState_luffa) );
+            update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                                          (const BitSequence*)hash3, 64 );
+            if ( i < len-1 )
+               mm256_interleave_4x64( vhash,
+                                      hash0, hash1, hash2, hash3, 64<<3 );
+         break;
+         case 7:   // cubehash, per lane
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhash, 64<<3 );
+            cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
+                                      (const byte*) hash0, 64 );
+            memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
+            cubehashUpdateDigest( &ctx.cube, (byte*)hash1,
+                                      (const byte*) hash1, 64 );
+            memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
+            cubehashUpdateDigest( &ctx.cube, (byte*)hash2,
+                                      (const byte*) hash2, 64 );
+            memcpy( &ctx.cube, &x11evo_4way_ctx.cube, sizeof(cubehashParam) );
+            cubehashUpdateDigest( &ctx.cube, (byte*)hash3,
+                                      (const byte*) hash3, 64 );
+            if ( i < len-1 )
+               mm256_interleave_4x64( vhash,
+                                      hash0, hash1, hash2, hash3, 64<<3 );
+         break;
+         case 8:   // shavite, per lane
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhash, 64<<3 );
+            sph_shavite512( &ctx.shavite, hash0, 64 );
+            sph_shavite512_close( &ctx.shavite, hash0 );
+            memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
+                    sizeof(sph_shavite512_context) );
+            sph_shavite512( &ctx.shavite, hash1, 64 );
+            sph_shavite512_close( &ctx.shavite, hash1 );
+            memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
+                    sizeof(sph_shavite512_context) );
+            sph_shavite512( &ctx.shavite, hash2, 64 );
+            sph_shavite512_close( &ctx.shavite, hash2 );
+            memcpy( &ctx.shavite, &x11evo_4way_ctx.shavite,
+                    sizeof(sph_shavite512_context) );
+            sph_shavite512( &ctx.shavite, hash3, 64 );
+            sph_shavite512_close( &ctx.shavite, hash3 );
+            if ( i < len-1 )
+               mm256_interleave_4x64( vhash,
+                                      hash0, hash1, hash2, hash3, 64<<3 );
+         break;
+         case 9:   // simd, per lane
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhash, 64<<3 );
+            update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                                  (const BitSequence *)hash0, 512 );
+            memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
+            update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                                  (const BitSequence *)hash1, 512 );
+            memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
+            update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                                  (const BitSequence *)hash2, 512 );
+            memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) );
+            update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                                  (const BitSequence *)hash3, 512 );
+            if ( i < len-1 )
+               mm256_interleave_4x64( vhash,
+                                      hash0, hash1, hash2, hash3, 64<<3 );
+         break;
+         case 10:  // echo, per lane
+            mm256_deinterleave_4x64( hash0, hash1, hash2, hash3,
+                                     vhash, 64<<3 );
+            update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                                   (const BitSequence *) hash0, 512 );
+            memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
+            update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                                   (const BitSequence *) hash1, 512 );
+            memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
+            update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                                   (const BitSequence *) hash2, 512 );
+            memcpy( &ctx.echo, &x11evo_4way_ctx.echo, sizeof(hashState_echo) );
+            update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                                   (const BitSequence *) hash3, 512 );
+            if ( i < len-1 )
+               mm256_interleave_4x64( vhash,
+                                      hash0, hash1, hash2, hash3, 64<<3 );
+         break;
+      }
+   }
+   // Output: the first 32 bytes of each lane, lanes 0..3 at offsets 0/32/64/96.
+   memcpy( state,    hash0, 32 );
+   memcpy( state+32, hash1, 32 );
+   memcpy( state+64, hash2, 32 );
+   memcpy( state+96, hash3, 32 );
+}
+
+//static const uint32_t diff1targ = 0x0000ffff;
+
+int scanhash_x11evo_4way( int thr_id, struct work* work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{    // Scan nonces four per pass; returns how many of the four lanes met the target.
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));    // 4 results of 8 words each
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));  // 4-way interleaved input
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1: lane 0 nonce word inside vdata
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+     const uint32_t Htarg = ptarget[7];
+
+     swab32_array( endiandata, pdata, 20 );
+     // Refresh the hash order when the time word changes or none exists yet.
+     const uint32_t ntime = endiandata[17];
+     if ( ntime != s_ntime  ||  s_seq == -1 )
+     {
+         evo_twisted_code( ntime, hashOrder );
+         s_ntime = ntime;
+     }
+
+     // Cheap pre-filter on the top hash word: hmask selects the bits that must
+     // be zero for hash[7] <= Htarg to be possible; fulltest() then decides
+     // exactly. The mask must never reject a word <= Htarg, so it covers only
+     // the bits above Htarg's range and is 0 (no filtering) for large targets.
+     // The last three values were 0xFFFF000, 0xFFFF000 and 0xFFFFFFFF.
+     uint32_t hmask;
+     if      ( Htarg == 0 )      hmask = 0xFFFFFFFF;
+     else if ( Htarg <= 0xF )    hmask = 0xFFFFFFF0;
+     else if ( Htarg <= 0xFF )   hmask = 0xFFFFFF00;
+     else if ( Htarg <= 0xFFF )  hmask = 0xFFFFF000;
+     else if ( Htarg <= 0xFFFF ) hmask = 0xFFFF0000;
+     else                        hmask = 0;
+
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+     do
+     {
+         found[0] = found[1] = found[2] = found[3] = false;
+         be32enc( noncep0, n   );
+         be32enc( noncep1, n+1 );
+         be32enc( noncep2, n+2 );
+         be32enc( noncep3, n+3 );
+
+         x11evo_4way_hash( hash, vdata );
+         pdata[19] = n;
+         // lane k's result is hash + 8*k; its nonce is n + k
+         if ( ( hash[7] & hmask ) == 0 && fulltest( hash, ptarget ) )
+         {
+            found[0] = true;
+            num_found++;
+            nonces[0] = n;
+            work_set_target_ratio( work, hash );
+         }
+         if ( ( (hash+8)[7] & hmask ) == 0 && fulltest( hash+8, ptarget ) )
+         {
+            found[1] = true;
+            num_found++;
+            nonces[1] = n+1;
+            work_set_target_ratio( work, hash+8 );
+         }
+         if ( ( (hash+16)[7] & hmask ) == 0 && fulltest( hash+16, ptarget ) )
+         {
+            found[2] = true;
+            num_found++;
+            nonces[2] = n+2;
+            work_set_target_ratio( work, hash+16 );
+         }
+         if ( ( (hash+24)[7] & hmask ) == 0 && fulltest( hash+24, ptarget ) )
+         {
+            found[3] = true;
+            num_found++;
+            nonces[3] = n+3;
+            work_set_target_ratio( work, hash+24 );
+         }
+         n += 4;
+     } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+
+     *hashes_done = n - first_nonce + 1;
+     return num_found;
+}
+
+#endif
--- a/algo/x11/x11evo-gate.c
+++ b/algo/x11/x11evo-gate.c
@@ -0,0 +1,95 @@
+#include "x11evo-gate.h"
+
+int s_seq = -1;   // sequence number last passed to getAlgoString() by evo_twisted_code(); starts at -1 (nothing cached yet)
+
+// Map a timestamp to the permutation sequence number, which advances once
+// per day counted from X11EVO_INITIAL_DATE.  The unsigned difference is
+// converted to int first and only then divided, as in the cast expression
+// this replaces.
+static inline int getCurrentAlgoSeq( uint32_t current_time )
+{
+   const int seconds_per_day = 60 * 60 * 24;
+   const int elapsed = (int) ( current_time - X11EVO_INITIAL_DATE );
+
+   return elapsed / seconds_per_day;
+}
+
+// Exchange the two bytes that a and b point to.
+// (swap_vars doesn't work here, hence this dedicated helper.)
+void evo_swap( uint8_t *a, uint8_t *b )
+{
+   const uint8_t saved_b = *b;
+
+   *b = *a;
+   *a = saved_b;
+}
+
+// Fill n[0 .. count-1] with the identity ordering 0, 1, ..., count-1.
+void initPerm( uint8_t n[], uint8_t count )
+{
+   int idx = 0;
+
+   while ( idx < count )
+   {
+      n[idx] = idx;
+      idx++;
+   }
+}
+
+// Advance n[0 .. count-1] to the next permutation in lexicographic order.
+//
+// Returns 1 when a successor exists.  Returns 0 when count <= 1, or when n
+// was already the last (non-increasing) arrangement; in the latter case n is
+// reversed back into non-decreasing order.
+int nextPerm( uint8_t n[], uint32_t count )
+{
+   uint32_t tail = 0, i = 0, j = 0;
+
+   if (unlikely( count <= 1 ))
+      return 0;
+
+   // Find where the longest non-increasing suffix begins.
+   for ( i = count - 1; i > 0 && n[i - 1] >= n[i]; i-- )
+      ;
+   tail = i;
+
+   // The pivot n[tail - 1] exists only when tail > 0.  The swap has to live
+   // inside this guard: with tail == 0 the unsigned index tail - 1 wraps
+   // around and the swap would touch memory far outside the array.
+   if ( tail > 0 )
+   {
+      // Rightmost suffix element that is larger than the pivot.
+      for ( j = count - 1; j > tail && n[j] <= n[tail - 1]; j-- )
+         ;
+      evo_swap( &n[tail - 1], &n[j] );
+   }
+
+   // Reverse the suffix so that it becomes its smallest arrangement.
+   for ( i = tail, j = count - 1; i < j; i++, j-- )
+      evo_swap( &n[i], &n[j] );
+
+   return ( tail != 0 );
+}
+
+// Build the algorithm-order string for permutation number count.
+//
+// Starting from the identity ordering, nextPerm() is applied count times and
+// the result is written to str as X11EVO_FUNC_COUNT characters ('0'-'9' for
+// indices below 10, 'A' upward from 10) followed by a terminating NUL, so
+// str must hold at least X11EVO_FUNC_COUNT + 1 bytes.
+void getAlgoString( char *str, uint32_t count )
+{
+   uint8_t order[X11EVO_FUNC_COUNT];
+   int step;
+   int pos;
+
+   initPerm( order, X11EVO_FUNC_COUNT );
+
+   step = 0;
+   while ( step < count )
+   {
+      nextPerm( order, X11EVO_FUNC_COUNT );
+      step++;
+   }
+
+   for ( pos = 0; pos < X11EVO_FUNC_COUNT; pos++ )
+   {
+      char *out = str + pos;
+
+      if ( order[pos] < 10 )
+          sprintf( out, "%u", order[pos] );
+      else
+          sprintf( out, "%c", 'A' + (order[pos] - 10) );
+   }
+   str[X11EVO_FUNC_COUNT] = 0;
+}
+
+// Refresh permstr with the algorithm order that belongs to ntime.
+// The string is rebuilt only when ntime's sequence number differs from the
+// one remembered in s_seq; otherwise permstr is left untouched.
+void evo_twisted_code( uint32_t ntime, char *permstr )
+{
+   const int current = getCurrentAlgoSeq( ntime );
+
+   if ( current == s_seq )
+      return;
+
+   getAlgoString( permstr, current );
+   s_seq = current;
+}
+
+// Install the x11evo entry points into gate.  The 4-way context, scanhash
+// and hash are used when X11EVO_4WAY is defined, the scalar ones otherwise.
+// Always returns true.
+bool register_x11evo_algo( algo_gate_t* gate )
+{
+#ifdef X11EVO_4WAY
+  init_x11evo_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_x11evo_4way;
+  gate->hash      = (void*)&x11evo_4way_hash;
+#else
+  init_x11evo_ctx();
+  gate->scanhash  = (void*)&scanhash_x11evo;
+  gate->hash      = (void*)&x11evo_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  return true;
+}
+
--- a/algo/x11/x11evo-gate.h
+++ b/algo/x11/x11evo-gate.h
@@ -0,0 +1,39 @@
+#ifndef X11EVO_GATE_H__
+#define X11EVO_GATE_H__ 1
+// Shared declarations for x11evo: gate registration, scalar and 4-way entry points, and the permutation-string helper.
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define X11EVO_4WAY   // 4-way path is enabled only when both HASH_4WAY and __AES__ are defined
+#endif
+
+#define X11EVO_INITIAL_DATE 1462060800   // base timestamp getCurrentAlgoSeq() counts days from; presumably Unix time (1462060800 = 2016-05-01 00:00:00 UTC)
+#define X11EVO_FUNC_COUNT 11             // number of entries in the permutation built by getAlgoString()
+
+extern int s_seq;   // defined in x11evo-gate.c; sequence number last used by evo_twisted_code()
+
+bool register_x11evo_algo( algo_gate_t* gate );
+
+#if defined(X11EVO_4WAY)
+// 4-way entry points, declared only when X11EVO_4WAY is enabled.
+void x11evo_4way_hash( void *state, const void *input );
+
+int scanhash_x11evo_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_x11evo_4way_ctx();
+
+#endif
+// Scalar entry points, always declared.
+void x11evo_hash( void *state, const void *input );
+
+int scanhash_x11evo( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_x11evo_ctx();
+// Rewrites permstr via getAlgoString() when ntime's day sequence differs from s_seq.
+void evo_twisted_code( uint32_t ntime, char *permstr );
+
+#endif
+
--- a/algo/x11/x11evo.c
+++ b/algo/x11/x11evo.c
@@ -1,5 +1,5 @@
 #include "cpuminer-config.h"
-#include "algo-gate-api.h"
+#include "x11evo-gate.h"

 #include <string.h>
 #include <stdint.h>
@@ -26,9 +26,6 @@
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #include "algo/simd/sse2/nist.h"

-#define INITIAL_DATE 1462060800
-#define HASH_FUNC_COUNT 11
-
 typedef struct {
 #ifdef NO_AES_NI
    sph_groestl512_context  groestl;
@@ -70,94 +67,10 @@ void init_x11evo_ctx()
     sph_shavite512_init( &x11evo_ctx.shavite );
 }

-/*
-uint32_t getCurrentAlgoSeq(uint32_t current_time, uint32_t base_time)
-{
-	return (current_time - base_time) / (60 * 60 * 24);
-}
-*/
-
-static inline int getCurrentAlgoSeq( uint32_t current_time )
-{
-        // change once per day
-        return (int) (current_time - INITIAL_DATE) / (60 * 60 * 24);
-}
-
-// swap_vars doesn't work here
-void evo_swap( uint8_t *a, uint8_t *b )
-{
-	uint8_t __tmp = *a;
-	*a = *b;
-	*b = __tmp;
-}
-
-void initPerm( uint8_t n[], uint8_t count )
-{
-	int i;
-	for ( i = 0; i<count; i++ )
-		n[i] = i;
-}
-
-int nextPerm( uint8_t n[], uint32_t count )
-{
-	uint32_t tail = 0, i = 0, j = 0;
-
-	if (unlikely( count <= 1 ))
-		return 0;
-
-	for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
-           tail = i;
-
-	if ( tail > 0 )
-            for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
-	         evo_swap( &n[tail - 1], &n[j] );
-
-	for ( i = tail, j = count - 1; i<j; i++, j-- )
-		evo_swap( &n[i], &n[j] );
-
-	return ( tail != 0 );
-}
-
-void getAlgoString( char *str, uint32_t count )
-{
-	uint8_t algoList[HASH_FUNC_COUNT];
-	char *sptr;
-        int j;
-        int k;
-	initPerm( algoList, HASH_FUNC_COUNT );
-
-	for ( k = 0; k < count; k++ )
-		nextPerm( algoList, HASH_FUNC_COUNT );
-
-	sptr = str;
-	for ( j = 0; j < HASH_FUNC_COUNT; j++ )
-        {
-		if ( algoList[j] >= 10 )
-			sprintf( sptr, "%c", 'A' + (algoList[j] - 10) );
-		else
-			sprintf( sptr, "%u", algoList[j] );
-		sptr++;
-	}
-	*sptr = 0;
-
-	//applog(LOG_DEBUG, "nextPerm %s", str);
-}
-
-static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+static char hashOrder[X11EVO_FUNC_COUNT + 1] = { 0 };
 static __thread uint32_t s_ntime = UINT32_MAX;
-static int s_seq = -1;

-static void evo_twisted_code(uint32_t ntime, char *permstr)
-{
-        int seq = getCurrentAlgoSeq(ntime);
-        if (s_seq != seq)
-        {
-                getAlgoString(permstr, seq);
-                s_seq = seq;
-        }
-}
-
-static inline void x11evo_hash( void *state, const void *input )
+void x11evo_hash( void *state, const void *input )
 {
   uint32_t hash[16] __attribute__ ((aligned (64)));
   x11evo_ctx_holder ctx __attribute__ ((aligned (64)));
@@ -242,10 +155,10 @@ static inline void x11evo_hash( void *state, const void *input )
    memcpy( state, hash, 32 );
 }

-static const uint32_t diff1targ = 0x0000ffff;
+//static const uint32_t diff1targ = 0x0000ffff;

 int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
-                     unsigned long *hashes_done )
+                     uint64_t *hashes_done )
 {
        uint32_t endiandata[20] __attribute__((aligned(64)));
        uint32_t hash64[8] __attribute__((aligned(64)));
@@ -274,19 +187,20 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
         else if ( Htarg <= 0xFFF )
            hmask = 0xFFFF000;
         else if ( Htarg <= 0xFFFF )
-            hmask = 0xFFFF000;
+           hmask = 0xFFFF000;
        }

        do
        {
          pdata[19] = ++n;
          be32enc( &endiandata[19], n );
-          x11evo_hash( hash64, &endiandata );
+          x11evo_hash( hash64, endiandata );
          if ( ( hash64[7] & hmask ) == 0 )
          {
             if ( fulltest( hash64, ptarget ) )
             {
                 *hashes_done = n - first_nonce + 1;
+                 work_set_target_ratio( work, hash64 );
                 return true;
             }
           }
@@ -296,13 +210,3 @@ int scanhash_x11evo( int thr_id, struct work* work, uint32_t max_nonce,
 	pdata[19] = n;
 	return 0;
 }
-
-bool register_x11evo_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  gate->scanhash  = (void*)&scanhash_x11evo;
-  gate->hash      = (void*)&x11evo_hash;
-  init_x11evo_ctx();
-  return true;
-};
-
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -0,0 +1,259 @@
+#include "cpuminer-config.h"
+#include "x11gost-gate.h"
+
+#if defined (__AVX2__) && defined (__AES__)
+
+#include <string.h>
+#include <stdint.h>
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/gost/sph_gost.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/simd/sse2/nist.h"
+#include "algo/echo/aes_ni/hash_api.h"
+// One context per stage of the x11gost chain, listed in the order x11gost_4way_hash() runs them.
+typedef struct {
+    blake512_4way_context   blake;     // used through the *_4way calls on the interleaved buffer
+    bmw512_4way_context     bmw;
+    hashState_groestl       groestl;   // used once per lane
+    skein512_4way_context   skein;
+    jh512_4way_context      jh;    
+    keccak512_4way_context  keccak;    
+    sph_gost512_context     gost;      // gost through echo are used once per lane
+    hashState_luffa         luffa;
+    cubehashParam           cube;
+    sph_shavite512_context  shavite;
+    hashState_sd            simd;
+    hashState_echo          echo;
+} x11gost_4way_ctx_holder;
+
+x11gost_4way_ctx_holder x11gost_4way_ctx;   // template filled by init_x11gost_4way_ctx() and copied at the start of every x11gost_4way_hash() call
+
+// Prepare the global x11gost_4way_ctx template with one init call per stage
+// context.  x11gost_4way_hash() copies this template for every hash.
+void init_x11gost_4way_ctx()
+{
+     x11gost_4way_ctx_holder *const tmpl = &x11gost_4way_ctx;
+
+     blake512_4way_init( &tmpl->blake );
+     bmw512_4way_init( &tmpl->bmw );
+     init_groestl( &tmpl->groestl, 64 );
+     skein512_4way_init( &tmpl->skein );
+     jh512_4way_init( &tmpl->jh );
+     keccak512_4way_init( &tmpl->keccak );
+     sph_gost512_init( &tmpl->gost );
+     init_luffa( &tmpl->luffa, 512 );
+     cubehashInit( &tmpl->cube, 512, 16, 32 );
+     sph_shavite512_init( &tmpl->shavite );
+     init_sd( &tmpl->simd, 512 );
+     init_echo( &tmpl->echo, 512 );
+}
+// Hash four lanes through the x11gost chain. input is the interleaved buffer built by scanhash_x11gost_4way(); four 32-byte digests are written to state.
+void x11gost_4way_hash( void *state, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));   // one 512-bit buffer per lane for the serial stages
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));  // interleaved buffer shared by the 4-way stages
+     x11gost_4way_ctx_holder ctx;
+     memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) );   // start from the pre-initialized template
+     // 4way: blake512 over the 80-byte input, then bmw512 over the 64-byte result
+     blake512_4way( &ctx.blake, input, 80 );
+     blake512_4way_close( &ctx.blake, vhash );
+
+     bmw512_4way( &ctx.bmw, vhash, 64 );
+     bmw512_4way_close( &ctx.bmw, vhash );
+
+     // Serial: split the lanes apart, groestl runs once per lane
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl,
+             sizeof(hashState_groestl) );   // restore the initialized state before the next lane
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, 
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+     memcpy( &ctx.groestl, &x11gost_4way_ctx.groestl, 
+             sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+
+     // 4way: re-interleave the lanes for skein, jh and keccak
+     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     skein512_4way( &ctx.skein, vhash, 64 );
+     skein512_4way_close( &ctx.skein, vhash );
+
+     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_close( &ctx.jh, vhash );
+
+     keccak512_4way( &ctx.keccak, vhash, 64 );
+     keccak512_4way_close( &ctx.keccak, vhash );
+
+     // Serial: every remaining stage runs lane by lane, in place on hash0..hash3
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     // gost; the context is restored from the template between lanes
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, hash0 );
+     memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, hash1 );
+     memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, hash2 );
+     memcpy( &ctx.gost, &x11gost_4way_ctx.gost, sizeof(sph_gost512_context) );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, hash3 );
+     // luffa
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0,
+                             (const BitSequence*)hash0, 64 );
+     memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1,
+                             (const BitSequence*)hash1, 64 );
+     memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2,
+                             (const BitSequence*)hash2, 64 );
+     memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) );
+     update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3,
+                             (const BitSequence*)hash3, 64 );
+     // cubehash
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
+     memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+     // shavite
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash2, 64 );
+     sph_shavite512_close( &ctx.shavite, hash2 );
+     memcpy( &ctx.shavite, &x11gost_4way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash3, 64 );
+     sph_shavite512_close( &ctx.shavite, hash3 );
+     // simd
+     update_final_sd( &ctx.simd, (BitSequence *)hash0,
+                      (const BitSequence *)hash0, 512 );
+     memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash1,
+                      (const BitSequence *)hash1, 512 );
+     memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash2,
+                      (const BitSequence *)hash2, 512 );
+     memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) );
+     update_final_sd( &ctx.simd, (BitSequence *)hash3,
+                      (const BitSequence *)hash3, 512 );
+     // echo
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     memcpy( &ctx.echo, &x11gost_4way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     // Only the first 32 bytes of each lane's 64-byte result are returned, lane i at byte offset 32*i.
+     memcpy( state,    hash0, 32 );
+     memcpy( state+32, hash1, 32 );   // NOTE(review): arithmetic on void* relies on a compiler extension (GNU C)
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+// Scan nonces from work->data[19] four at a time until a lane passes fulltest, n reaches max_nonce, or a restart is flagged for thr_id.
+int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));     // four 8-word digests, lane i at hash + 8*i
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));   // interleaved header data for the four lanes
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     uint32_t *nonces = work->nonces;   // nonces[i] / found[i] report the result of lane i
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+     const uint32_t Htarg = ptarget[7];
+     uint64_t htmax[] = {          0,        0xF,       0xFF,
+                               0xFFF,     0xFFFF, 0x10000000  };   // htmax[m] is the largest Htarg handled with masks[m]
+     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                          0xFFFFF000, 0xFFFF0000,          0  };   // bits of hash word 7 that must be zero before fulltest is tried
+
+     // swab32_array is handed all 20 header words; the nonce word is rewritten per lane inside the loop
+     swab32_array( endiandata, pdata, 20 );
+
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );   // the same header is placed in all four lanes
+     // The scan loop runs once, with the mask of the first htmax entry that covers Htarg.
+     for (int m=0; m < 6; m++)   // NOTE(review): when Htarg exceeds every htmax entry no scan is performed at all
+       if (Htarg <= htmax[m])
+       {
+         uint32_t mask = masks[m];
+         do
+         {
+            found[0] = found[1] = found[2] = found[3] = false;
+            be32enc( noncep0, n   );   // lanes carry the consecutive nonces n .. n+3
+            be32enc( noncep1, n+1 );
+            be32enc( noncep2, n+2 );
+            be32enc( noncep3, n+3 );
+
+            x11gost_4way_hash( hash, vdata );
+            pdata[19] = n;   // first nonce of the batch just hashed
+
+            if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+            {
+               found[0] = true;
+               num_found++;
+               nonces[0] = n;
+               work_set_target_ratio( work, hash );
+            }
+            if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
+            {
+               found[1] = true;
+               num_found++;
+               nonces[1] = n+1;
+               work_set_target_ratio( work, hash+8 );
+            }
+            if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
+            {
+               found[2] = true;
+               num_found++;
+               nonces[2] = n+2;
+               work_set_target_ratio( work, hash+16 );
+            }
+            if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
+            {
+               found[3] = true;
+               num_found++;
+               nonces[3] = n+3;
+               work_set_target_ratio( work, hash+24 );
+            }
+            n += 4;
+         } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+         break;
+       }
+
+     *hashes_done = n - first_nonce + 1;   // n has already been advanced past the last batch
+     return num_found;   // number of lanes that passed fulltest in the final batch (0 if none)
+}
+
+#endif
--- a/algo/x11/x11gost-gate.c
+++ b/algo/x11/x11gost-gate.c
@@ -0,0 +1,18 @@
+#include "x11gost-gate.h"
+
+// Install the x11gost entry points into gate.  The 4-way context, scanhash
+// and hash are used when X11GOST_4WAY is defined, the scalar ones otherwise.
+// Always returns true.
+bool register_x11gost_algo( algo_gate_t* gate )
+{
+#ifdef X11GOST_4WAY
+  init_x11gost_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_x11gost_4way;
+  gate->hash      = (void*)&x11gost_4way_hash;
+#else
+  init_x11gost_ctx();
+  gate->scanhash  = (void*)&scanhash_x11gost;
+  gate->hash      = (void*)&x11gost_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->get_max64 = (void*)&get_max64_0x3ffff;
+  return true;
+}
+
--- a/algo/x11/x11gost-gate.h
+++ b/algo/x11/x11gost-gate.h
@@ -0,0 +1,32 @@
+#ifndef X11GOST_GATE_H__
+#define X11GOST_GATE_H__ 1
+// Shared declarations for x11gost: gate registration plus the scalar and 4-way entry points.
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define X11GOST_4WAY   // 4-way path is enabled only when both HASH_4WAY and __AES__ are defined
+#endif
+
+bool register_x11gost_algo( algo_gate_t* gate );
+
+#if defined(X11GOST_4WAY)
+// 4-way entry points, declared only when X11GOST_4WAY is enabled.
+void x11gost_4way_hash( void *state, const void *input );
+
+int scanhash_x11gost_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_x11gost_4way_ctx();
+
+#endif
+// Scalar entry points, always declared.
+void x11gost_hash( void *state, const void *input );
+
+int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_x11gost_ctx();
+
+#endif
+
--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "x11gost-gate.h"

 #include <stdlib.h>
 #include <stdint.h>
@@ -37,28 +37,28 @@ typedef struct {
     hashState_echo          echo;
     hashState_groestl       groestl;
 #endif
-} sib_ctx_holder;
+} x11gost_ctx_holder;

-sib_ctx_holder sib_ctx;
+x11gost_ctx_holder x11gost_ctx;

-void init_sib_ctx()
+void init_x11gost_ctx()
 {
-     sph_gost512_init(&sib_ctx.gost);
-     sph_shavite512_init(&sib_ctx.shavite);
-     init_luffa( &sib_ctx.luffa, 512 );
-     cubehashInit( &sib_ctx.cube, 512, 16, 32 );
-     init_sd( &sib_ctx.simd, 512 );
+     sph_gost512_init( &x11gost_ctx.gost );
+     sph_shavite512_init( &x11gost_ctx.shavite );
+     init_luffa( &x11gost_ctx.luffa, 512 );
+     cubehashInit( &x11gost_ctx.cube, 512, 16, 32 );
+     init_sd( &x11gost_ctx.simd, 512 );
 #ifdef NO_AES_NI
-     sph_groestl512_init( &sib_ctx.groestl );
-     sph_echo512_init( &sib_ctx.echo );
+     sph_groestl512_init( &x11gost_ctx.groestl );
+     sph_echo512_init( &x11gost_ctx.echo );
 #else
-     init_echo( &sib_ctx.echo, 512 );
-     init_groestl( &sib_ctx.groestl, 64 );
+     init_echo( &x11gost_ctx.echo, 512 );
+     init_groestl( &x11gost_ctx.groestl, 64 );
 #endif

 }

-void sibhash(void *output, const void *input)
+void x11gost_hash(void *output, const void *input)
 {
     unsigned char hash[128] __attribute__ ((aligned (64)));
     #define hashA hash
@@ -69,8 +69,8 @@ void sibhash(void *output, const void *input)
     sph_u64 hashctA;
     sph_u64 hashctB;

-     sib_ctx_holder ctx __attribute__ ((aligned (64)));
-     memcpy( &ctx, &sib_ctx, sizeof(sib_ctx) );
+     x11gost_ctx_holder ctx __attribute__ ((aligned (64)));
+     memcpy( &ctx, &x11gost_ctx, sizeof(x11gost_ctx) );

     DECL_BLK;
     BLK_I;
@@ -135,8 +135,8 @@ void sibhash(void *output, const void *input)
     memcpy(output, hashA, 32);
 }

-int scanhash_sib(int thr_id, struct work *work,
-	uint32_t max_nonce, uint64_t *hashes_done)
+int scanhash_x11gost( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done)
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;
@@ -156,11 +156,12 @@ int scanhash_sib(int thr_id, struct work *work,
 	do {
 		uint32_t hash[8];
 		be32enc(&endiandata[19], nonce);
-		sibhash(hash, endiandata);
+		x11gost_hash(hash, endiandata);

 		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			pdata[19] = nonce;
 			*hashes_done = pdata[19] - first_nonce;
+                        work_set_target_ratio( work, hash );
 			return 1;
 		}
 		nonce++;
@@ -172,12 +173,3 @@ int scanhash_sib(int thr_id, struct work *work,
 	return 0;
 }

-bool register_sib_algo( algo_gate_t* gate )
-{
-    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-    init_sib_ctx();
-    gate->scanhash = (void*)&scanhash_sib;
-    gate->hash     = (void*)&sibhash;
-    gate->get_max64 = (void*)&get_max64_0x3ffff;
-    return true;
-}
--- a/algo/x13/drop.c
+++ b/algo/x13/drop.c
--- a/algo/x13/phi1612-4way.c
+++ b/algo/x13/phi1612-4way.c
@@ -0,0 +1,186 @@
+#include "x13-gate.h"
+
+#if defined(__AVX2__) && defined(__AES__)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/gost/sph_gost.h"
+#include "algo/echo/aes_ni/hash_api.h"
+// One context per stage of the phi1612 chain, listed in the order phi1612_4way_hash() runs them.
+typedef struct {
+    skein512_4way_context   skein;   // skein and jh are used through the *_4way calls
+    jh512_4way_context      jh;
+    cubehashParam           cube;    // cube through echo are used once per lane
+    sph_fugue512_context    fugue;
+    sph_gost512_context     gost;
+    hashState_echo          echo;
+} phi1612_4way_ctx_holder;
+
+phi1612_4way_ctx_holder phi1612_4way_ctx __attribute__ ((aligned (64)));   // template filled by init_phi1612_4way_ctx() and copied at the start of every phi1612_4way_hash() call
+
+// Prepare the global phi1612_4way_ctx template with one init call per stage
+// context.  phi1612_4way_hash() copies this template for every hash.
+void init_phi1612_4way_ctx()
+{
+     phi1612_4way_ctx_holder *const tmpl = &phi1612_4way_ctx;
+
+     skein512_4way_init( &tmpl->skein );
+     jh512_4way_init( &tmpl->jh );
+     cubehashInit( &tmpl->cube, 512, 16, 32 );
+     sph_fugue512_init( &tmpl->fugue );
+     sph_gost512_init( &tmpl->gost );
+     init_echo( &tmpl->echo, 512 );
+}
+// Hash four lanes through the phi1612 chain. input is the interleaved buffer built by scanhash_phi1612_4way(); four 32-byte digests are written to state.
+void phi1612_4way_hash( void *state, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));   // one 512-bit buffer per lane for the serial stages
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));  // interleaved buffer shared by the 4-way stages
+     phi1612_4way_ctx_holder ctx;
+     memcpy( &ctx, &phi1612_4way_ctx, sizeof(phi1612_4way_ctx) );   // start from the pre-initialized template
+
+     // Skein, 4-way, over the 80-byte input
+     skein512_4way( &ctx.skein, input, 80 );
+     skein512_4way_close( &ctx.skein, vhash );
+
+     // JH, 4-way, over the 64-byte skein result
+     jh512_4way( &ctx.jh, vhash, 64 );
+     jh512_4way_close( &ctx.jh, vhash );
+
+     // Serial to the end: split the lanes apart, each later stage runs once per lane in place
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // Cubehash; the context is restored from the template between lanes
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*) hash2, 64 );
+     memcpy( &ctx.cube, &phi1612_4way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*) hash3, 64 );
+
+     // Fugue; unlike cubehash, the context is re-initialized by calling its init function between lanes
+     sph_fugue512( &ctx.fugue, hash0, 64 );
+     sph_fugue512_close( &ctx.fugue, hash0 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash1, 64 );
+     sph_fugue512_close( &ctx.fugue, hash1 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash2, 64 );
+     sph_fugue512_close( &ctx.fugue, hash2 );
+     sph_fugue512_init( &ctx.fugue );
+     sph_fugue512( &ctx.fugue, hash3, 64 );
+     sph_fugue512_close( &ctx.fugue, hash3 );
+
+     // Gost, re-initialized between lanes
+     sph_gost512( &ctx.gost, hash0, 64 );
+     sph_gost512_close( &ctx.gost, hash0 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash1, 64 );
+     sph_gost512_close( &ctx.gost, hash1 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash2, 64 );
+     sph_gost512_close( &ctx.gost, hash2 );
+     sph_gost512_init( &ctx.gost );
+     sph_gost512( &ctx.gost, hash3, 64 );
+     sph_gost512_close( &ctx.gost, hash3 );
+
+     // Echo, re-initialized between lanes
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash2,
+                       (const BitSequence *) hash2, 512 );
+     init_echo( &ctx.echo, 512 );
+     update_final_echo( &ctx.echo, (BitSequence *)hash3,
+                       (const BitSequence *) hash3, 512 );
+     // Only the first 32 bytes of each lane's 64-byte result are returned, lane i at byte offset 32*i.
+     memcpy( state,    hash0, 32 );
+     memcpy( state+32, hash1, 32 );   // NOTE(review): arithmetic on void* relies on a compiler extension (GNU C)
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+
+// Scan nonces for phi1612, four lanes per hash call.
+//
+// Starting at work->data[19], the nonces n .. n+3 are hashed together until
+// at least one lane passes fulltest, n reaches max_nonce, or a restart is
+// flagged for thr_id.  For every lane i, work->nfound[i] and work->nonces[i]
+// report the outcome of the last batch.  *hashes_done is set to
+// n - first_nonce + 1.  Returns the number of lanes that passed fulltest in
+// the last batch (0 if none).
+int scanhash_phi1612_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                  uint64_t *hashes_done )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     uint32_t _ALIGN(64) endiandata[20];
+     uint32_t n = first_nonce;
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+     // NOTE(review): Htarg is captured before the benchmark override below,
+     // so in benchmark mode the quick pre-filter still uses the old value.
+     const uint32_t Htarg = ptarget[7];
+
+     if ( opt_benchmark )
+          ( (uint32_t*)ptarget )[7] = 0x0cff;
+
+     // Convert all 20 header words.  Word 19 (the nonce) is replaced per lane
+     // inside the loop, but converting it as well keeps endiandata fully
+     // initialized before the 640-bit interleave reads it.
+     for ( int k = 0; k < 20; k++ )
+        be32enc( &endiandata[k], pdata[k] );
+
+     // The same header goes into all four lanes.
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+     do {
+        found[0] = found[1] = found[2] = found[3] = false;
+        be32enc( noncep0, n   );
+        be32enc( noncep1, n+1 );
+        be32enc( noncep2, n+2 );
+        be32enc( noncep3, n+3 );
+
+        phi1612_4way_hash( hash, vdata );
+        pdata[19] = n;
+
+        // Lane i's digest sits at hash + 8*i; word 7 is the cheap pre-filter
+        // before the full target comparison.
+        if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+        {
+            found[0] = true;
+            num_found++;
+            nonces[0] = n;
+            work_set_target_ratio( work, hash );
+        }
+        if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+        {
+            found[1] = true;
+            num_found++;
+            nonces[1] = n+1;
+            work_set_target_ratio( work, hash+8 );
+        }
+        if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+        {
+            found[2] = true;
+            num_found++;
+            nonces[2] = n+2;
+            work_set_target_ratio( work, hash+16 );
+        }
+        if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+        {
+            found[3] = true;
+            num_found++;
+            nonces[3] = n+3;
+            work_set_target_ratio( work, hash+24 );
+        }
+        n += 4;
+     } while ( ( num_found == 0 ) && ( n < max_nonce )
+               && !work_restart[thr_id].restart );
+
+     *hashes_done = n - first_nonce + 1;
+     return num_found;
+}
+
+#endif
--- a/algo/x13/phi1612-gate.c
+++ b/algo/x13/phi1612-gate.c
@@ -0,0 +1,18 @@
+#include "phi1612-gate.h"
+
+// Install the phi1612 entry points into gate.  The 4-way context, scanhash
+// and hash are used when PHI1612_4WAY is defined, the scalar ones otherwise.
+// Always returns true.
+bool register_phi1612_algo( algo_gate_t* gate )
+{
+#ifdef PHI1612_4WAY
+  init_phi1612_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_phi1612_4way;
+  gate->hash      = (void*)&phi1612_4way_hash;
+#else
+  init_phi1612_ctx();
+  gate->scanhash  = (void*)&scanhash_phi1612;
+  gate->hash      = (void*)&phi1612_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->get_max64 = (void*)&get_max64_0x3ffff;
+  return true;
+}
+
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	a90d75b8f5	v3.7.10	2018-01-16 15:11:44 -05:00
Jay D Dee	bee78eac76	v3.7.9	2018-01-08 22:04:43 -05:00
Jay D Dee	2d2e54f001	v3.7.8	2017-12-30 19:19:46 -05:00
Jay D Dee	79164c24b5	v3.7.7	2017-12-17 12:00:42 -05:00
Jay D Dee	7a1389998b	v3.7.6	2017-12-14 18:28:51 -05:00