v3.8.4.1

v3.8.4
v3.8.3.3
2025-09-17 23:44:27 +00:00 · 2018-03-22 14:28:03 -04:00 · 2018-03-18 12:51:03 -04:00 · 2018-02-25 14:15:07 -05:00 · 2018-02-24 14:36:19 -05:00 · 2018-02-23 15:45:32 -05:00
157 changed files with 11217 additions and 7298 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -45,7 +45,10 @@ cpuminer_SOURCES = \
  algo/blake/sph_blake2b.c \
  algo/blake/blake2b.c \
  algo/blake/sph-blake2s.c \
+  algo/blake/blake2s-hash-4way.c \
  algo/blake/blake2s.c \
+  algo/blake/blake2s-gate.c \
+  algo/blake/blake2s-4way.c \
  algo/blake/blakecoin-gate.c \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
@@ -65,11 +68,14 @@ cpuminer_SOURCES = \
  algo/cryptonight/cryptonight.c\
  algo/cubehash/sph_cubehash.c \
  algo/cubehash/sse2/cubehash_sse2.c\
+  algo/cubehash/cube-hash-2way.c \
  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
  algo/gost/sph_gost.c \
  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
+  algo/groestl/myrgr-gate.c \
+  algo/groestl/myrgr-4way.c \
  algo/groestl/myr-groestl.c \
  algo/groestl/aes_ni/hash-groestl.c \
  algo/groestl/aes_ni/hash-groestl256.c \
@@ -97,10 +103,10 @@ cpuminer_SOURCES = \
  algo/keccak/keccak-4way.c\
  algo/keccak/keccak-gate.c \
  algo/keccak/sse2/keccak.c \
-  algo/lbry.c \
  algo/luffa/sph_luffa.c \
  algo/luffa/luffa.c \
-  algo/luffa/sse2/luffa_for_sse2.c \
+  algo/luffa/luffa_for_sse2.c \
+  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
  algo/lyra2/lyra2rev2-gate.c \
@@ -114,6 +120,9 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2h-gate.c \
  algo/lyra2/lyra2h.c \
  algo/lyra2/lyra2h-4way.c \
+  algo/lyra2/allium-gate.c \
+  algo/lyra2/allium-4way.c \
+  algo/lyra2/allium.c \
  algo/m7m.c \
  algo/neoscrypt/neoscrypt.c \
  algo/nist5/nist5-gate.c \
@@ -127,9 +136,17 @@ cpuminer_SOURCES = \
  algo/quark/anime-gate.c \
  algo/quark/anime.c \
  algo/quark/anime-4way.c \
+  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
+  algo/qubit/qubit-2way.c \
+  algo/qubit/deep-gate.c \
+  algo/qubit/deep-2way.c \
  algo/qubit/deep.c \
  algo/ripemd/sph_ripemd.c \
+  algo/ripemd/ripemd-hash-4way.c \
+  algo/ripemd/lbry-gate.c \
+  algo/ripemd/lbry.c \
+  algo/ripemd/lbry-4way.c \
  algo/scrypt.c \
  algo/scryptjane/scrypt-jane.c \
  algo/sha/sph_sha2.c \
@@ -143,8 +160,9 @@ cpuminer_SOURCES = \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite.c \
  algo/simd/sph_simd.c \
-  algo/simd/sse2/nist.c \
-  algo/simd/sse2/vector.c \
+  algo/simd/nist.c \
+  algo/simd/vector.c \
+  algo/simd/simd-hash-2way.c \
  algo/skein/sph_skein.c \
  algo/skein/skein-hash-4way.c \
  algo/skein/skein.c \
@@ -184,6 +202,9 @@ cpuminer_SOURCES = \
  algo/x11/x11evo.c \
  algo/x11/x11evo-4way.c \
  algo/x11/x11evo-gate.c \
+  algo/x12/x12-gate.c \
+  algo/x12/x12.c \
+  algo/x12/x12-4way.c \
  algo/x13/x13-gate.c \
  algo/x13/x13.c \
  algo/x13/x13-4way.c \
@@ -222,7 +243,7 @@ cpuminer_SOURCES = \
  algo/x17/hmq1725.c \
  algo/yescrypt/yescrypt.c \
  algo/yescrypt/sha256_Y.c \
-  algo/yescrypt/yescrypt-simd.c
+  algo/yescrypt/yescrypt-best.c

 disable_flags =

--- a/README.md
+++ b/README.md
@@ -13,9 +13,35 @@ mailto://jayddee246@gmail.com

 See file RELEASE_NOTES for change log and compile instructions.

+Requirements
+------------
+
+1. An x86_64 architecture CPU with a minimum of SSE2 support. This includes
+Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
+optimizations a CPU with AES_NI is required. This includes Intel Westmere
+and newer and AMD equivalents. Further optimizations are available on some
+algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
+
+Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
+performance.
+
+ARM CPUs are not supported.
+
+2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
+Centos, are known to work and have all dependencies in their repositories.
+Others may work but may require more effort. Older versions such as Centos 6
+don't work due to missing features. 
+64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
+
+MacOS, OSx and Android are not supported.
+
+3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.
+
 Supported Algorithms
 --------------------

+                          allium       Garlicoin
+                          anime        Animecoin
                          argon2
                          axiom        Shabal-256 MemoHash
                          bastion
@@ -74,40 +100,20 @@ Supported Algorithms
                          x11          Dash
                          x11evo       Revolvercoin
                          x11gost      sib (SibCoin)
+                          x12          Galaxie Cash (GCH)
                          x13          X13
                          x13sm3       hsr (Hshare)
                          x14          X14
                          x15          X15
+                          x16r         Ravencoin
                          x17
                          xevan        Bitsend
                          yescrypt     Globalboost-Y (BSTY)
-                          yescryptr8   BitZeny (ZNY)\n\
+                          yescryptr8   BitZeny (ZNY)
                          yescryptr16  Yenten (YTN)
+                          yescryptr32  WAVI
                          zr5          Ziftr

-Requirements
------------
-
-1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
-Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
-optimizations a CPU with AES_NI is required. This includes Intel Westbridge
-and newer and AMD equivalents. Further optimizations are available on some
-algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
-
-Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
-performance.
-
-ARM CPUs are not supported.
-
-2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
-Centos are known to work and have all dependencies in their repositories.
-Others may work but may require more effort.
-64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
-
-MacOS, OSx is not supported.
-
-3. Stratum pool. Some algos may work wallet mining using getwork.
-
 Errata
 ------

@@ -136,10 +142,13 @@ output from the miner showing the startup and any errors.
 Donations
 ---------

-I do not do this for money but I have a donation address if users
-are so inclined.
+cpuminer-opt has no fees of any kind but donations are accepted.

-bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
+ BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
+ ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
+ LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
+ BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
+ BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

 Happy mining!

--- a/README.txt
+++ b/README.txt
@@ -25,3 +25,12 @@ cpuminer-aes-avx.exe   "-march=corei7-avx"         Sandybridge, Ivybridge
 cpuminer-avx2.exe      "-march=core-avx2"          Haswell...
 cpuminer-avx2-sha.exe  "-march=core-avx2 -msha"    Ryzen

+If you like this software feel free to donate:
+
+BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
+ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
+LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
+BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
+BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ
+
+
--- a/77
+++ b/77
@@ -1,4 +1,4 @@
-cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
+cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
 This feature requires recent SW including GCC version 5 or higher and
 openssl version 1.1 or higher. It may also require using "-march=znver1"
 compile flag.
@@ -90,7 +90,8 @@ Additional optional compile flags, add the following to CFLAGS to activate:

 SPH may give slightly better performance on algos that use sha256 when using
 openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
-better than SPH.
+better than SPH. This option is ignored when 4-way is used, even for CPUs
+with SHA.

 Start mining.

@@ -98,8 +99,8 @@ Start mining.

 Windows

-The following in how the Windows binary releases are built. It's old and
-not very good but it works, for me anyway.
+Precompiled Windows binaries are built on a Linux host using Mingw
+with a more recent compiler than the following Windows hosted procedure.

 Building on Windows prerequisites:

@@ -131,10 +132,10 @@ or similar Windows program.
 In msys shell cd to miner directory.
 cd /c/path/to/cpuminer-opt

-Run winbuild.sh to build on Windows or execute the following commands.
+Run build.sh to build on Windows or execute the following commands.

 ./autogen.sh
-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make

 Start mining
@@ -159,6 +160,70 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.8.4.1
+
+Fixed sha256t low difficulty rejects.
+Fixed compile error on CPUs with AVX512.
+
+v3.8.4
+
+Added yescryptr32 algo for WAVI coin.
+Added URL to API data.
+Improved detection of __int128 support (linux only)
+Compile support for CPUs without SSSE3 (no binary support)
+
+v3.8.3.3
+
+Integrated getblocktemplate with algo_gate.
+Added support for hodl gbt (untested).
+Reworked some recent quick fixes.
+
+v3.8.3.2
+
+Reverted gbt changes from v3.8.0 that broke getwork.
+Reverted scaled hash rate for API, added HS term in addition to KHS. 
+Added blocks solved to console display and API.
+
+v3.8.3.1
+
+Fixed regression in v3.8.3 that broke several algos.
+
+v3.8.3
+
+More restoration of lost lyra2 hash.
+8 way AVX2 and 4way AVX optimization for blakecoin, vanilla & blake2s.
+8 way AVX2 for lbry.
+Scaled hashrate for API output.
+A couple of GBT fixes.
+
+v3.8.2.1
+
+Fixed low difficulty rejects with allium.
+Fixed qubit AVX2.
+Restored lyra2z lost hash.
+Fixed build.sh
+
+v3.8.2
+
+Fixed and faster myr-gr.
+Added x12 algo (Galaxie Cash), allium algo (Garlicoin).
+Faster lyra2rev2, lbry, skein.
+Large reduction in compiler warnings.
+
+v3.8.1.1
+
+Fixed Windows AVX2 crash.
+
+v3.8.1
+
+Fixes x16r on CPUs with only SSE2.
+More Optimizations for X algos, qubit & deep.
+Corrected algo optimizations for scrypt and yescrypt, no new optimizations.
+
+v3.8.0.1
+
+Fixed x16r AVX2 low hash rate.
+
 v3.8.0

 4way no longer a seperate feature, included in AVX2.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -119,9 +119,11 @@ void init_algo_gate( algo_gate_t* gate )
   gate->gen_merkle_root         = (void*)&sha256d_gen_merkle_root;
   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
+   gate->malloc_txs_request      = (void*)&std_malloc_txs_request;
   gate->set_target              = (void*)&std_set_target;
   gate->work_decode             = (void*)&std_le_work_decode;
   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
+   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
   gate->calc_network_diff       = (void*)&std_calc_network_diff;
@@ -155,6 +157,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   switch (algo)
   {
+     case ALGO_ALLIUM:       register_allium_algo      ( gate ); break;
     case ALGO_ANIME:        register_anime_algo       ( gate ); break;
     case ALGO_ARGON2:       register_argon2_algo      ( gate ); break;
     case ALGO_AXIOM:        register_axiom_algo       ( gate ); break;
@@ -213,6 +216,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_X11:          register_x11_algo         ( gate ); break;
     case ALGO_X11EVO:       register_x11evo_algo      ( gate ); break;
     case ALGO_X11GOST:      register_x11gost_algo     ( gate ); break;
+     case ALGO_X12:          register_x12_algo         ( gate ); break;
     case ALGO_X13:          register_x13_algo         ( gate ); break;
     case ALGO_X13SM3:       register_x13sm3_algo      ( gate ); break;
     case ALGO_X14:          register_x14_algo         ( gate ); break;
@@ -223,6 +227,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
     case ALGO_YESCRYPTR8:   register_yescryptr8_algo  ( gate ); break;
     case ALGO_YESCRYPTR16:  register_yescryptr16_algo ( gate ); break;
+     case ALGO_YESCRYPTR32:  register_yescryptr32_algo ( gate ); break;
     case ALGO_ZR5:          register_zr5_algo         ( gate ); break;
    default:
        applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
@@ -298,6 +303,7 @@ const char* const algo_alias_map[][2] =
  { "lyra2",             "lyra2re"      },
  { "lyra2v2",           "lyra2rev2"    },
  { "lyra2zoin",         "lyra2z330"    },
+  { "myrgr",             "myr-gr"       },
  { "myriad",            "myr-gr"       },
  { "neo",               "neoscrypt"    },
  { "phi",               "phi1612"      },
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -127,7 +127,10 @@ void ( *set_target)              ( struct work*, double );
 bool ( *submit_getwork_result )  ( CURL*, struct work* );
 void ( *gen_merkle_root )        ( char*, struct stratum_ctx* );
 void ( *build_extraheader )      ( struct work*, struct stratum_ctx* );
+void ( *build_block_header )     ( struct work*, uint32_t, uint32_t*,
+                                   uint32_t*, uint32_t, uint32_t );
 void ( *build_stratum_request )  ( char*, struct work*, struct stratum_ctx* );
+char* ( *malloc_txs_request )    ( struct work* );
 void ( *set_work_data_endian )   ( struct work* );
 double ( *calc_network_diff )    ( struct work* );
 bool ( *ready_to_mine )          ( struct work*, struct stratum_ctx*, int );
@@ -228,11 +231,17 @@ void std_le_build_stratum_request( char *req, struct work *work );
 void std_be_build_stratum_request( char *req, struct work *work );
 void jr2_build_stratum_request   ( char *req, struct work *work );

+char* std_malloc_txs_request( struct work *work );
+
 // Default is do_nothing (assumed LE)
 void set_work_data_big_endian( struct work *work );

 double std_calc_network_diff( struct work *work );

+void std_build_block_header( struct work* g_work, uint32_t version,
+                             uint32_t *prevhash, uint32_t *merkle_root,
+                             uint32_t ntime, uint32_t nbits );
+
 void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );

 json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -1,19 +1,18 @@
 #include "blake-gate.h"
-
-#if defined (BLAKE_4WAY)
-
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>

-blake256r14_4way_context blake_ctx;
+#if defined (BLAKE_4WAY)
+
+blake256r14_4way_context blake_4w_ctx;

 void blakehash_4way(void *state, const void *input)
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
     blake256r14_4way_context ctx;
-     memcpy( &ctx, &blake_ctx, sizeof ctx );
+     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
     blake256r14_4way( &ctx, input + (64<<2), 16 );
     blake256r14_4way_close( &ctx, vhash );
     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
@@ -31,7 +30,6 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;

   if (opt_benchmark)
@@ -39,15 +37,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

   // we need big endian data...
   swab32_array( edata, pdata, 20 );
-
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
-   blake256r14_4way_init( &blake_ctx );
-   blake256r14_4way( &blake_ctx, vdata, 64 );
+   blake256r14_4way_init( &blake_4w_ctx );
+   blake256r14_4way( &blake_4w_ctx, vdata, 64 );

   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep,    n   );
      be32enc( noncep +1, n+1 );
      be32enc( noncep +2, n+2 );
@@ -55,34 +50,12 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

      blakehash_4way( hash, vdata );

-      if (  hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
-      {
-           found[2] = true;
-           num_found++;
-           nonces[2] = n+2;
-           work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
-      {
-           found[3] = true;
-           num_found++;
-           nonces[3] = n+3;
-           work_set_target_ratio( work, hash+24 );
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;

@@ -95,3 +68,77 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

 #endif

+#if defined(BLAKE_8WAY)
+
+blake256r14_8way_context blake_8w_ctx;
+
+void blakehash_8way( void *state, const void *input )
+{    // Finish an 8-lane Blake-256 (14 round) hash from the shared starting context.
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));   // lane-interleaved digest words
+     blake256r14_8way_context ctx;
+     memcpy( &ctx, &blake_8w_ctx, sizeof ctx );   // local copy; the global context stays untouched
+     blake256r14_8way( &ctx, input + (64<<3), 16 );   // start 64<<3 bytes in (presumably 64 bytes x 8 lanes - confirm)
+     blake256r14_8way_close( &ctx, vhash );
+     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
+                              state+128, state+160, state+192, state+224,
+                              vhash, 256 );   // lane k's digest is written at state + 32*k bytes
+}
+
+// Scan nonces eight at a time with 8-way Blake-256 (14 rounds), starting at
+// work->data[19]. Passing nonces are packed into work->nonces[0..count-1];
+// returns that count and stores the number of hashes tried in *hashes_done.
+int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+
+   if (opt_benchmark)
+      HTarget = 0x7f;
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+
+   // Hash the first 64 bytes once; blakehash_8way() copies this context.
+   blake256r14_8way_init( &blake_8w_ctx );
+   blake256r14_8way( &blake_8w_ctx, vdata, 64 );
+
+   uint32_t *noncep = vdata + 152;   // 19*8
+   do {
+      // One consecutive nonce per lane.
+      for ( int i = 0; i < 8; i++ )
+         be32enc( noncep + i, n+i );
+      pdata[19] = n;
+
+      blakehash_8way( hash, vdata );
+
+      // blakehash_8way() deinterleaves its output, so lane i's digest is the
+      // 8 words at hash + 8*i, not hash + i.
+      for ( int i = 0; i < 8; i++ )
+      {
+         uint32_t *lane_hash = hash + (i<<3);
+         if ( lane_hash[7] <= HTarget && fulltest( lane_hash, ptarget ) )
+         {
+            pdata[19] = n+i;
+            nonces[ num_found++ ] = n+i;   // packed, so the return value is the count
+            work_set_target_ratio( work, lane_hash );
+         }
+      }
+      n += 8;
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
@@ -58,6 +58,8 @@ extern "C"{
 #pragma warning (disable: 4146)
 #endif

+// Blake-256
+
 static const sph_u32 IV256[8] = {
 	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
 	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
@@ -67,6 +69,8 @@ static const sph_u32 IV256[8] = {

 #if defined (__AVX2__)

+// Blake-512
+
 static const sph_u64 IV512[8] = {
 	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
 	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
@@ -78,7 +82,7 @@ static const sph_u64 IV512[8] = {

 #if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64

-// Blake-256 4 & 8 way, Blake-512 4way
+// Blake-256 4 & 8 way, Blake-512 4 way

 static const unsigned sigma[16][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
@@ -371,6 +375,8 @@ do { \

 #if SPH_COMPACT_BLAKE_32

+// Blake-256 4 way
+
 #define ROUND_S_4WAY(r)   do { \
 	GS_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
 		CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \
@@ -407,7 +413,7 @@ do { \

 #if defined (__AVX2__)

-// BLAKE256 8 WAY
+// Blake-256 8 way

 #define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
 do { \
@@ -487,6 +493,8 @@ do { \

 #endif

+// Blake-256 4 way
+
 #define DECL_STATE32_4WAY \
 	__m128i H0, H1, H2, H3, H4, H5, H6, H7; \
 	__m128i S0, S1, S2, S3; \
@@ -527,6 +535,7 @@ do { \
 	} while (0)

 #if SPH_COMPACT_BLAKE_32
+// not used

 #define COMPRESS32_4WAY( rounds )   do { \
 	__m128i M[16]; \
@@ -553,22 +562,22 @@ do { \
                          , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M[0x0] = mm_byteswap_32( *(buf +  0) ); \
-	M[0x1] = mm_byteswap_32( *(buf +  1) ); \
-	M[0x2] = mm_byteswap_32( *(buf +  2) ); \
-	M[0x3] = mm_byteswap_32( *(buf +  3) ); \
-	M[0x4] = mm_byteswap_32( *(buf +  4) ); \
-	M[0x5] = mm_byteswap_32( *(buf +  5) ); \
-	M[0x6] = mm_byteswap_32( *(buf +  6) ); \
-	M[0x7] = mm_byteswap_32( *(buf +  7) ); \
-	M[0x8] = mm_byteswap_32( *(buf +  8) ); \
-	M[0x9] = mm_byteswap_32( *(buf +  9) ); \
-	M[0xA] = mm_byteswap_32( *(buf + 10) ); \
-	M[0xB] = mm_byteswap_32( *(buf + 11) ); \
-	M[0xC] = mm_byteswap_32( *(buf + 12) ); \
-	M[0xD] = mm_byteswap_32( *(buf + 13) ); \
-	M[0xE] = mm_byteswap_32( *(buf + 14) ); \
-	M[0xF] = mm_byteswap_32( *(buf + 15) ); \
+	M[0x0] = mm_bswap_32( *(buf +  0) ); \
+	M[0x1] = mm_bswap_32( *(buf +  1) ); \
+	M[0x2] = mm_bswap_32( *(buf +  2) ); \
+	M[0x3] = mm_bswap_32( *(buf +  3) ); \
+	M[0x4] = mm_bswap_32( *(buf +  4) ); \
+	M[0x5] = mm_bswap_32( *(buf +  5) ); \
+	M[0x6] = mm_bswap_32( *(buf +  6) ); \
+	M[0x7] = mm_bswap_32( *(buf +  7) ); \
+	M[0x8] = mm_bswap_32( *(buf +  8) ); \
+	M[0x9] = mm_bswap_32( *(buf +  9) ); \
+	M[0xA] = mm_bswap_32( *(buf + 10) ); \
+	M[0xB] = mm_bswap_32( *(buf + 11) ); \
+	M[0xC] = mm_bswap_32( *(buf + 12) ); \
+	M[0xD] = mm_bswap_32( *(buf + 13) ); \
+	M[0xE] = mm_bswap_32( *(buf + 14) ); \
+	M[0xF] = mm_bswap_32( *(buf + 15) ); \
 	for (r = 0; r < rounds; r ++) \
 		ROUND_S_4WAY(r); \
        H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -615,22 +624,22 @@ do { \
   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
-   M0 = mm_byteswap_32( * buf ); \
-   M1 = mm_byteswap_32( *(buf+1) ); \
-   M2 = mm_byteswap_32( *(buf+2) ); \
-   M3 = mm_byteswap_32( *(buf+3) ); \
-   M4 = mm_byteswap_32( *(buf+4) ); \
-   M5 = mm_byteswap_32( *(buf+5) ); \
-   M6 = mm_byteswap_32( *(buf+6) ); \
-   M7 = mm_byteswap_32( *(buf+7) ); \
-   M8 = mm_byteswap_32( *(buf+8) ); \
-   M9 = mm_byteswap_32( *(buf+9) ); \
-   MA = mm_byteswap_32( *(buf+10) ); \
-   MB = mm_byteswap_32( *(buf+11) ); \
-   MC = mm_byteswap_32( *(buf+12) ); \
-   MD = mm_byteswap_32( *(buf+13) ); \
-   ME = mm_byteswap_32( *(buf+14) ); \
-   MF = mm_byteswap_32( *(buf+15) ); \
+   M0 = mm_bswap_32( * buf ); \
+   M1 = mm_bswap_32( *(buf+1) ); \
+   M2 = mm_bswap_32( *(buf+2) ); \
+   M3 = mm_bswap_32( *(buf+3) ); \
+   M4 = mm_bswap_32( *(buf+4) ); \
+   M5 = mm_bswap_32( *(buf+5) ); \
+   M6 = mm_bswap_32( *(buf+6) ); \
+   M7 = mm_bswap_32( *(buf+7) ); \
+   M8 = mm_bswap_32( *(buf+8) ); \
+   M9 = mm_bswap_32( *(buf+9) ); \
+   MA = mm_bswap_32( *(buf+10) ); \
+   MB = mm_bswap_32( *(buf+11) ); \
+   MC = mm_bswap_32( *(buf+12) ); \
+   MD = mm_bswap_32( *(buf+13) ); \
+   ME = mm_bswap_32( *(buf+14) ); \
+   MF = mm_bswap_32( *(buf+15) ); \
   ROUND_S_4WAY(0); \
   ROUND_S_4WAY(1); \
   ROUND_S_4WAY(2); \
@@ -727,22 +736,22 @@ do { \
   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
-   M0 = mm256_byteswap_32( * buf ); \
-   M1 = mm256_byteswap_32( *(buf+1) ); \
-   M2 = mm256_byteswap_32( *(buf+2) ); \
-   M3 = mm256_byteswap_32( *(buf+3) ); \
-   M4 = mm256_byteswap_32( *(buf+4) ); \
-   M5 = mm256_byteswap_32( *(buf+5) ); \
-   M6 = mm256_byteswap_32( *(buf+6) ); \
-   M7 = mm256_byteswap_32( *(buf+7) ); \
-   M8 = mm256_byteswap_32( *(buf+8) ); \
-   M9 = mm256_byteswap_32( *(buf+9) ); \
-   MA = mm256_byteswap_32( *(buf+10) ); \
-   MB = mm256_byteswap_32( *(buf+11) ); \
-   MC = mm256_byteswap_32( *(buf+12) ); \
-   MD = mm256_byteswap_32( *(buf+13) ); \
-   ME = mm256_byteswap_32( *(buf+14) ); \
-   MF = mm256_byteswap_32( *(buf+15) ); \
+   M0 = mm256_bswap_32( * buf ); \
+   M1 = mm256_bswap_32( *(buf+1) ); \
+   M2 = mm256_bswap_32( *(buf+2) ); \
+   M3 = mm256_bswap_32( *(buf+3) ); \
+   M4 = mm256_bswap_32( *(buf+4) ); \
+   M5 = mm256_bswap_32( *(buf+5) ); \
+   M6 = mm256_bswap_32( *(buf+6) ); \
+   M7 = mm256_bswap_32( *(buf+7) ); \
+   M8 = mm256_bswap_32( *(buf+8) ); \
+   M9 = mm256_bswap_32( *(buf+9) ); \
+   MA = mm256_bswap_32( *(buf+10) ); \
+   MB = mm256_bswap_32( *(buf+11) ); \
+   MC = mm256_bswap_32( *(buf+12) ); \
+   MD = mm256_bswap_32( *(buf+13) ); \
+   ME = mm256_bswap_32( *(buf+14) ); \
+   MF = mm256_bswap_32( *(buf+15) ); \
   ROUND_S_8WAY(0); \
   ROUND_S_8WAY(1); \
   ROUND_S_8WAY(2); \
@@ -778,7 +787,6 @@ do { \
                                                              S3 ), H7 ); \
 } while (0)

-
 // Blake-512 4 way

 #define DECL_STATE64_4WAY \
@@ -848,22 +856,22 @@ do { \
                               _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
        VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
                               _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
-	M[0x0] = mm256_byteswap_64( *(buf+0) ); \
-	M[0x1] = mm256_byteswap_64( *(buf+1) ); \
-	M[0x2] = mm256_byteswap_64( *(buf+2) ); \
-	M[0x3] = mm256_byteswap_64( *(buf+3) ); \
-	M[0x4] = mm256_byteswap_64( *(buf+4) ); \
-	M[0x5] = mm256_byteswap_64( *(buf+5) ); \
-	M[0x6] = mm256_byteswap_64( *(buf+6) ); \
-	M[0x7] = mm256_byteswap_64( *(buf+7) ); \
-	M[0x8] = mm256_byteswap_64( *(buf+8) ); \
-	M[0x9] = mm256_byteswap_64( *(buf+9) ); \
-	M[0xA] = mm256_byteswap_64( *(buf+10) ); \
-	M[0xB] = mm256_byteswap_64( *(buf+11) ); \
-	M[0xC] = mm256_byteswap_64( *(buf+12) ); \
-	M[0xD] = mm256_byteswap_64( *(buf+13) ); \
-	M[0xE] = mm256_byteswap_64( *(buf+14) ); \
-	M[0xF] = mm256_byteswap_64( *(buf+15) ); \
+	M[0x0] = mm256_bswap_64( *(buf+0) ); \
+	M[0x1] = mm256_bswap_64( *(buf+1) ); \
+	M[0x2] = mm256_bswap_64( *(buf+2) ); \
+	M[0x3] = mm256_bswap_64( *(buf+3) ); \
+	M[0x4] = mm256_bswap_64( *(buf+4) ); \
+	M[0x5] = mm256_bswap_64( *(buf+5) ); \
+	M[0x6] = mm256_bswap_64( *(buf+6) ); \
+	M[0x7] = mm256_bswap_64( *(buf+7) ); \
+	M[0x8] = mm256_bswap_64( *(buf+8) ); \
+	M[0x9] = mm256_bswap_64( *(buf+9) ); \
+	M[0xA] = mm256_bswap_64( *(buf+10) ); \
+	M[0xB] = mm256_bswap_64( *(buf+11) ); \
+	M[0xC] = mm256_bswap_64( *(buf+12) ); \
+	M[0xD] = mm256_bswap_64( *(buf+13) ); \
+	M[0xE] = mm256_bswap_64( *(buf+14) ); \
+	M[0xF] = mm256_bswap_64( *(buf+15) ); \
 	for (r = 0; r < 16; r ++) \
 		ROUND_B_4WAY(r); \
        H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -913,22 +921,22 @@ do { \
                            _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) );  \
     VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
                            _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) );  \
-     M0 = mm256_byteswap_64( *(buf + 0) ); \
-     M1 = mm256_byteswap_64( *(buf + 1) ); \
-     M2 = mm256_byteswap_64( *(buf + 2) ); \
-     M3 = mm256_byteswap_64( *(buf + 3) ); \
-     M4 = mm256_byteswap_64( *(buf + 4) ); \
-     M5 = mm256_byteswap_64( *(buf + 5) ); \
-     M6 = mm256_byteswap_64( *(buf + 6) ); \
-     M7 = mm256_byteswap_64( *(buf + 7) ); \
-     M8 = mm256_byteswap_64( *(buf + 8) ); \
-     M9 = mm256_byteswap_64( *(buf + 9) ); \
-     MA = mm256_byteswap_64( *(buf + 10) ); \
-     MB = mm256_byteswap_64( *(buf + 11) ); \
-     MC = mm256_byteswap_64( *(buf + 12) ); \
-     MD = mm256_byteswap_64( *(buf + 13) ); \
-     ME = mm256_byteswap_64( *(buf + 14) ); \
-     MF = mm256_byteswap_64( *(buf + 15) ); \
+     M0 = mm256_bswap_64( *(buf + 0) ); \
+     M1 = mm256_bswap_64( *(buf + 1) ); \
+     M2 = mm256_bswap_64( *(buf + 2) ); \
+     M3 = mm256_bswap_64( *(buf + 3) ); \
+     M4 = mm256_bswap_64( *(buf + 4) ); \
+     M5 = mm256_bswap_64( *(buf + 5) ); \
+     M6 = mm256_bswap_64( *(buf + 6) ); \
+     M7 = mm256_bswap_64( *(buf + 7) ); \
+     M8 = mm256_bswap_64( *(buf + 8) ); \
+     M9 = mm256_bswap_64( *(buf + 9) ); \
+     MA = mm256_bswap_64( *(buf + 10) ); \
+     MB = mm256_bswap_64( *(buf + 11) ); \
+     MC = mm256_bswap_64( *(buf + 12) ); \
+     MD = mm256_bswap_64( *(buf + 13) ); \
+     ME = mm256_bswap_64( *(buf + 14) ); \
+     MF = mm256_bswap_64( *(buf + 15) ); \
     ROUND_B_4WAY(0); \
     ROUND_B_4WAY(1); \
     ROUND_B_4WAY(2); \
@@ -967,6 +975,8 @@ do { \

 #endif

+// Blake-256 4 way
+
 static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };

 static void
@@ -988,52 +998,51 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
 {
   __m128i *vdata = (__m128i*)data;
   __m128i *buf;
-	size_t ptr;
-        const int buf_size = 64;   // number of elements, sizeof/4
-	DECL_STATE32_4WAY
+   size_t ptr;
+   const int buf_size = 64;   // number of elements, sizeof/4
+   DECL_STATE32_4WAY
+   buf = sc->buf;
+   ptr = sc->ptr;
+   if ( len < buf_size - ptr )
+   {
+      memcpy_128( buf + (ptr>>2), vdata, len>>2 );
+      ptr += len;
+      sc->ptr = ptr;
+      return;
+   }

-	buf = sc->buf;
-	ptr = sc->ptr;
-	if ( len < buf_size - ptr )
-        {
-		memcpy_128( buf + (ptr>>2), vdata, len>>2 );
-		ptr += len;
-		sc->ptr = ptr;
-		return;
-	}
+   READ_STATE32_4WAY(sc);
+   while ( len > 0 )
+   {
+      size_t clen;

-	READ_STATE32_4WAY(sc);
-	while ( len > 0 )
-        {
-           size_t clen;
-
-	   clen = buf_size - ptr;
-	   if (clen > len)
-		clen = len;
-	   memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
-	   ptr += clen;
-           vdata += (clen>>2);
-	   len -= clen;
-	   if ( ptr == buf_size )
-           {
-		if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
-			T1 = SPH_T32(T1 + 1);
-                COMPRESS32_4WAY( sc->rounds );
-		ptr = 0;
-	   }
-	}
-	WRITE_STATE32_4WAY(sc);
-	sc->ptr = ptr;
+      clen = buf_size - ptr;
+      if ( clen > len )
+         clen = len;
+      memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
+      ptr += clen;
+      vdata += (clen>>2);
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
+            T1 = SPH_T32(T1 + 1);
+         COMPRESS32_4WAY( sc->rounds );
+         ptr = 0;
+      }
+   }
+   WRITE_STATE32_4WAY(sc);
+   sc->ptr = ptr;
 }

 static void
 blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
               void *dst, size_t out_size_w32 )
 {
-   union {
+//   union {
 	__m128i buf[16];
-	sph_u32 dummy;
-   } u;
+//	sph_u32 dummy;
+//   } u;
   size_t ptr, k;
   unsigned bit_len;
   sph_u32 th, tl;
@@ -1041,7 +1050,7 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
+   buf[ptr>>2] = _mm_set1_epi32( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -1060,30 +1069,30 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

   if ( ptr <= 52 )
   {
-       memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       memset_zero_128( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
+           buf[52>>2] = _mm_or_si128( buf[52>>2],
                                        _mm_set1_epi32( 0x01000000UL ) );
-       *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
-       *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
-       blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
+       *(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
+       *(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
+       blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
   }
   else
   {
-	memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
-	blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
+	memset_zero_128( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+	blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
 	sc->T0 = SPH_C32(0xFFFFFE00UL);
 	sc->T1 = SPH_C32(0xFFFFFFFFUL);
-	memset_zero_128( u.buf, 56>>2 );
+	memset_zero_128( buf, 56>>2 );
       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
-        *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) );
-        *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) );
-	blake32_4way( sc, u.buf, 64 );
+           buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
+        *(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
+        *(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
+	blake32_4way( sc, buf, 64 );
   }
   out = (__m128i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
-        out[k] = mm_byteswap_32( sc->H[k] );
+        out[k] = mm_bswap_32( sc->H[k] );
 }

 #if defined (__AVX2__)
@@ -1114,7 +1123,6 @@ blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
   size_t ptr;
   const int buf_size = 64;   // number of elements, sizeof/4
   DECL_STATE32_8WAY
-
   buf = sc->buf;
   ptr = sc->ptr;
   if ( len < buf_size - ptr )
@@ -1153,10 +1161,10 @@ static void
 blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
                    void *dst, size_t out_size_w32 )
 {
-   union {
+//   union {
        __m256i buf[16];
-        sph_u32 dummy;
-   } u;
+//        sph_u32 dummy;
+//   } u;
   size_t ptr, k;
   unsigned bit_len;
   sph_u32 th, tl;
@@ -1164,7 +1172,7 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-   u.buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
+   buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

@@ -1183,30 +1191,30 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,

   if ( ptr <= 52 )
   {
-       memset_zero_256( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
-       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
+       memset_zero_256( buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = _mm256_or_si256( buf[52>>2],
                                           _mm256_set1_epi32( 0x01000000UL ) );
-       *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
-       *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
-       blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
+       *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
+       *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
+       blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
   }
   else
   {
-        memset_zero_256( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
-        blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
+        memset_zero_256( buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+        blake32_8way( sc, buf + (ptr>>2), 64 - ptr );
        sc->T0 = SPH_C32(0xFFFFFE00UL);
        sc->T1 = SPH_C32(0xFFFFFFFFUL);
-        memset_zero_256( u.buf, 56>>2 );
-       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
-        *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) );
-        *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) );
-        blake32_8way( sc, u.buf, 64 );
+        memset_zero_256( buf, 56>>2 );
+       if ( out_size_w32 == 8 )
+           buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
+        *(buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
+        *(buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
+        blake32_8way( sc, buf, 64 );
   }
   out = (__m256i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
-        out[k] = mm256_byteswap_32( sc->H[k] );
+        out[k] = mm256_bswap_32( sc->H[k] );
 }

 // Blake-512 4 way
@@ -1274,10 +1282,10 @@ static void
 blake64_4way_close( blake_4way_big_context *sc,
 	unsigned ub, unsigned n, void *dst, size_t out_size_w64)
 {
-   union {
+//   union {
      __m256i buf[16];
-      sph_u64 dummy;
-   } u;
+//      sph_u64 dummy;
+//   } u;
   size_t ptr, k;
   unsigned bit_len;
   uint64_t z, zz;
@@ -1288,7 +1296,7 @@ blake64_4way_close( blake_4way_big_context *sc,
   bit_len = ((unsigned)ptr << 3);
   z = 0x80 >> n;
   zz = ((ub & -z) | z) & 0xFF;
-   u.buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
+   buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
   tl = sc->T0 + bit_len;
   th = sc->T1;
   if (ptr == 0 )
@@ -1307,42 +1315,42 @@ blake64_4way_close( blake_4way_big_context *sc,
   }
   if ( ptr <= 104 )
   {
-       memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
+       memset_zero_256( buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
-          u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
+          buf[(104>>3)] = _mm256_or_si256( buf[(104>>3)],
                                 _mm256_set1_epi64x( 0x0100000000000000ULL ) );
-       *(u.buf+(112>>3)) = mm256_byteswap_64(
+       *(buf+(112>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(u.buf+(120>>3)) = mm256_byteswap_64(
+       *(buf+(120>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );

-       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
+       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
   }
   else
  {
-       memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
+       memset_zero_256( buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

-       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
+       blake64_4way( sc, buf + (ptr>>3), 128 - ptr );
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
-       memset_zero_256( u.buf, 112>>3 ); 
+       memset_zero_256( buf, 112>>3 ); 
       if ( out_size_w64 == 8 )
-           u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
-       *(u.buf+(112>>3)) = mm256_byteswap_64(
+           buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
+       *(buf+(112>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(u.buf+(120>>3)) = mm256_byteswap_64(
+       *(buf+(120>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );

-       blake64_4way( sc, u.buf, 128 );
+       blake64_4way( sc, buf, 128 );
   }
   out = (__m256i*)dst;
   for ( k = 0; k < out_size_w64; k++ )
-       out[k] = mm256_byteswap_64( sc->H[k] );
+       out[k] = mm256_bswap_64( sc->H[k] );
 }

 #endif

-// Blake-256 4 way & 8 way
+// Blake-256 4 way

 // default 14 rounds, backward copatibility
 void
@@ -1364,6 +1372,9 @@ blake256_4way_close(void *cc, void *dst)
 }

 #if defined(__AVX2__)
+
+// Blake-256 8way
+
 void
 blake256_8way_init(void *cc)
 {
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -35,7 +35,7 @@
 */

 #ifndef __BLAKE_HASH_4WAY__
-#define __BLAKE_HASH_4WAY__
+#define __BLAKE_HASH_4WAY__ 1

 #ifdef __AVX__

@@ -117,11 +117,11 @@ void blake256r8_8way_close(void *cc, void *dst);
 // Blake-512 4 way

 typedef struct {
-        __m256i buf[16] __attribute__ ((aligned (64)));
-        __m256i H[8];
-        __m256i S[4];   
-        size_t ptr;
-	sph_u64 T0, T1;
+   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i H[8];
+   __m256i S[4];   
+   size_t ptr;
+   sph_u64 T0, T1;
 } blake_4way_big_context;

 typedef blake_4way_big_context blake512_4way_context;
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -0,0 +1,136 @@
+#include "blake2s-gate.h"
+#include "blake2s-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+
+#if defined(BLAKE2S_8WAY)
+
+static __thread blake2s_8way_state blake2s_8w_ctx;
+// Completes 8 interleaved BLAKE2s hashes from the saved midstate and writes the results to output.
+void blake2s_8way_hash( void *output, const void *input )
+{
+   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+   blake2s_8way_state ctx;
+   memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );   // work on a copy of the state prepared by scanhash_blake2s_8way
+   // Resume at byte offset 64<<3 of the interleaved input and feed 16 more bytes per lane.
+   blake2s_8way_update( &ctx, input + (64<<3), 16 );
+   blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
+   // Presumably splits vhash into one 256-bit digest per lane; the destinations are 32 bytes apart.
+   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
+                            output+128, output+160, output+192, output+224,
+                            vhash, 256 );
+}
+// Scans nonces 8 at a time from work->data[19]; returns how many nonces were stored in work->nonces.
+int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t _ALIGN(64) edata[20];
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 152;   // 19*8
+   // Byte-swap the 20 header words, copy them into all 8 lanes, then hash the first 64 bytes once.
+   swab32_array( edata, pdata, 20 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );
+   // Each pass gives lane i the nonce n+i and tests the 8 resulting hashes.
+   do {
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;
+
+      blake2s_8way_hash( hash, vdata );
+
+      // Lane i's digest is hash[8*i .. 8*i+7]; word 7 is compared first as a cheap filter before fulltest.
+      for ( int i = 0; i < 8; i++ )
+      if (  (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+   // n already points past the last group tested, so the count needs no +1.
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+
+#elif defined(BLAKE2S_4WAY)
+
+static __thread blake2s_4way_state blake2s_4w_ctx;
+// Completes 4 interleaved BLAKE2s hashes from the saved midstate and writes the results to output.
+void blake2s_4way_hash( void *output, const void *input )
+{
+   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+   blake2s_4way_state ctx;
+   memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );   // work on a copy of the state prepared by scanhash_blake2s_4way
+   // Resume at byte offset 64<<2 of the interleaved input and feed 16 more bytes per lane.
+   blake2s_4way_update( &ctx, input + (64<<2), 16 );
+   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
+   // Presumably splits vhash into one 256-bit digest per lane; the destinations are 32 bytes apart.
+   mm_deinterleave_4x32( output, output+32, output+64, output+96, vhash, 256 );
+}
+// Scans nonces 4 at a time from work->data[19]; returns how many nonces were stored in work->nonces.
+int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t _ALIGN(64) edata[20];
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76;   // 19*4
+   // Byte-swap the 20 header words, copy them into all 4 lanes, then hash the first 64 bytes once.
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );
+   // Each pass gives lane i the nonce n+i and tests the 4 resulting hashes.
+   do {
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      pdata[19] = n;
+
+      blake2s_4way_hash( hash, vdata );
+      // Lane i's digest is hash[8*i .. 8*i+7]; word 7 is compared first as a cheap filter before fulltest.
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+   // n already points past the last group tested, so the count needs no +1.
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+
+#endif
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -0,0 +1,27 @@
+#include "blake2s-gate.h"
+
+// Returns the constant 0x7ffff; installed as the gate's get_max64 callback by register_blake2s_algo.
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blake2s_get_max64 ()
+{
+   return 0x7ffffLL;
+}
+// Installs the blake2s scanhash/hash pair selected at compile time (8-way, else 4-way, else scalar); returns true.
+bool register_blake2s_algo( algo_gate_t* gate )
+{
+#if defined(BLAKE2S_8WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_8way;
+  gate->hash      = (void*)&blake2s_8way_hash;
+#elif defined(BLAKE2S_4WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_4way;
+  gate->hash      = (void*)&blake2s_4way_hash;
+#else
+  gate->scanhash  = (void*)&scanhash_blake2s;
+  gate->hash      = (void*)&blake2s_hash;
+#endif
+  gate->get_max64 = (void*)&blake2s_get_max64;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
+  return true;
+}
+
+
--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -0,0 +1,35 @@
+#ifndef __BLAKE2S_GATE_H__
+#define __BLAKE2S_GATE_H__ 1
+
+#include <stdint.h>
+#include "algo-gate-api.h"
+
+#if defined(__AVX__)
+  #define BLAKE2S_4WAY
+#endif
+#if defined(__AVX2__)
+  #define BLAKE2S_8WAY
+#endif
+
+bool register_blake2s_algo( algo_gate_t* gate );
+
+#if defined(BLAKE2S_8WAY)
+
+void blake2s_8way_hash( void *state, const void *input );
+int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#elif defined (BLAKE2S_4WAY)
+
+void blake2s_4way_hash( void *state, const void *input );
+int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#else
+
+void blake2s_hash( void *state, const void *input );
+int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
+
+#endif
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -0,0 +1,362 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include "blake2s-hash-4way.h"
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(__AVX__)
+// Initial chaining words; also reloaded into v[8..15] by the compress functions on every block.
+static const uint32_t blake2s_IV[8] =
+{
+	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+// Message schedule: row r lists, in order, the indices into m[] consumed by the G steps of round r.
+static const uint8_t blake2s_sigma[10][16] =
+{
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+};
+
+// Resets S for an unkeyed, sequential hash of outlen bytes in each of the 4 lanes; always returns 0.
+// TODO (from the original note): the parameter block is fixed for a given outlen and could be a constant.
+int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
+{
+   blake2s_nway_param P[1];
+
+   P->digest_length = outlen;
+   P->key_length    = 0;
+   P->fanout        = 1;
+   P->depth         = 1;
+   P->leaf_length   = 0;
+   memset( P->node_offset, 0, sizeof( P->node_offset ) );  // was an 8-byte store through a uint64_t* cast into this 6-byte field
+   P->node_depth    = 0;
+   P->inner_length  = 0;
+   memset( P->salt,     0, sizeof( P->salt ) );
+   memset( P->personal, 0, sizeof( P->personal ) );
+
+   memset( S, 0, sizeof( blake2s_4way_state ) );
+   for( int i = 0; i < 8; ++i )
+      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );
+
+   uint32_t p[8];              // the packed 32-byte parameter block viewed as eight 32-bit words
+   memcpy( p, P, sizeof p );   // copy instead of casting P to uint32_t* (avoids aliasing/alignment UB)
+   /* IV XOR ParamBlock */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
+   return 0;
+}
+// Mixes one interleaved 16-word block into S->h for all 4 lanes using the current S->t and S->f; returns 0.
+int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
+{
+   __m128i m[16];
+   __m128i v[16];
+   // m holds the 16 message vectors; v[0..7] starts as the chaining value.
+   memcpy_128( m, block, 16 );
+   memcpy_128( v, S->h, 8 );
+   // v[8..15] starts as the IV, with the counter t and the flags f XORed into words 12..15.
+   v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
+   v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
+   v[10] = _mm_set1_epi32( blake2s_IV[2] );
+   v[11] = _mm_set1_epi32( blake2s_IV[3] );
+   v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
+                          _mm_set1_epi32( blake2s_IV[4] ) );
+   v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
+                          _mm_set1_epi32( blake2s_IV[5] ) );
+   v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
+                          _mm_set1_epi32( blake2s_IV[6] ) );
+   v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
+                          _mm_set1_epi32( blake2s_IV[7] ) );
+// G step i of round r: add/xor/rotate (16, 12, 8, 7) on a,b,c,d with message words sigma[r][2i] and sigma[r][2i+1].
+#define G4W(r,i,a,b,c,d) \
+do { \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
+   d = mm_rotr_32( _mm_xor_si128( d, a ), 16 ); \
+   c = _mm_add_epi32( c, d ); \
+   b = mm_rotr_32( _mm_xor_si128( b, c ), 12 ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
+   d = mm_rotr_32( _mm_xor_si128( d, a ),  8 ); \
+   c = _mm_add_epi32( c, d ); \
+   b = mm_rotr_32( _mm_xor_si128( b, c ),  7 ); \
+} while(0)
+// One round: G on the four columns of v (steps 0-3), then on the four diagonals (steps 4-7).
+#define ROUND4W(r)  \
+do { \
+   G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
+   G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
+   G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
+   G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
+   G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
+   G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
+   G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
+   G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+   // Ten rounds, one per row of blake2s_sigma.
+   ROUND4W( 0 );
+   ROUND4W( 1 );
+   ROUND4W( 2 );
+   ROUND4W( 3 );
+   ROUND4W( 4 );
+   ROUND4W( 5 );
+   ROUND4W( 6 );
+   ROUND4W( 7 );
+   ROUND4W( 8 );
+   ROUND4W( 9 );
+   // Feed-forward: fold both halves of v back into the chaining value.
+   for( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
+
+#undef G4W
+#undef ROUND4W
+   return 0;
+}
+// Absorbs inlen bytes per lane from the 4-way interleaved input `in`; lengths are converted to vectors with >>2. Returns 0.
+int blake2s_4way_update( blake2s_4way_state *S, const void *in,
+                         uint64_t inlen )
+{
+  __m128i *input = (__m128i*)in;
+  __m128i *buf = (__m128i*)S->buf;
+  const int bsize = BLAKE2S_BLOCKBYTES;
+  // NOTE(review): a block is compressed as soon as it fills, before blake2s_4way_final sets f[0]; confirm this is intended for inputs that are a multiple of 64 bytes.
+   while( inlen > 0 )
+   {
+      size_t left = S->buflen;
+      if( inlen >= bsize - left )
+      {
+         const size_t fill = bsize - left;   // lane bytes still needed to complete the buffered block
+         memcpy_128( buf + (left>>2), input, fill >> 2 );
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry into the high counter word
+         blake2s_4way_compress( S, buf );
+         S->buflen = 0;
+         input += ( fill >> 2 );   // advance only by what was consumed (was a whole block even when left != 0)
+         inlen -= fill;            // likewise; subtracting bsize could underflow inlen
+      }
+      else
+      {
+          memcpy_128( buf + ( left>>2 ), input, inlen>>2 );   // not enough for a block: just buffer it
+          S->buflen += (size_t) inlen;
+          input += ( inlen>>2 );
+          inlen -= inlen;
+      }
+   }
+   return 0;
+}
+// Pads and compresses the buffered data, then stores the 8 interleaved state vectors to out; outlen is not read. Returns 0.
+int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
+{
+   __m128i *buf = (__m128i*)S->buf;
+   // Count the buffered bytes, then raise the finalization flags read by the compress step.
+   S->t[0] += S->buflen;
+   S->t[1] += ( S->t[0] < S->buflen );
+   if ( S->last_node ) 
+      S->f[1] = ~0U;
+   S->f[0] = ~0U;
+   // Zero the unused tail of the block before the last compression.
+   memset_zero_128( buf + ( S->buflen>>2 ),
+                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );      
+   blake2s_4way_compress( S, buf );
+
+   for ( int i = 0; i < 8; ++i )
+      casti_m128i( out, i ) = S->h[ i ];
+   return 0;
+}
+
+#if defined(__AVX2__)
+// Mixes one interleaved 16-word block into S->h for all 8 lanes using the current S->t and S->f; returns 0.
+int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
+{
+   __m256i m[16];
+   __m256i v[16];
+   // m holds the 16 message vectors; v[0..7] starts as the chaining value.
+   memcpy_256( m, block, 16 );
+   memcpy_256( v, S->h, 8 );
+   // v[8..15] starts as the IV, with the counter t and the flags f XORed into words 12..15.
+   v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
+   v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
+   v[10] = _mm256_set1_epi32( blake2s_IV[2] );
+   v[11] = _mm256_set1_epi32( blake2s_IV[3] );
+   v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
+                             _mm256_set1_epi32( blake2s_IV[4] ) );
+   v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
+                             _mm256_set1_epi32( blake2s_IV[5] ) );
+   v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
+                             _mm256_set1_epi32( blake2s_IV[6] ) );
+   v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
+                             _mm256_set1_epi32( blake2s_IV[7] ) );
+// G step i of round r: add/xor/rotate (16, 12, 8, 7) on a,b,c,d with message words sigma[r][2i] and sigma[r][2i+1].
+#define G8W(r,i,a,b,c,d) \
+do { \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                          m[ blake2s_sigma[r][2*i+0] ] ); \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                         m[ blake2s_sigma[r][2*i+1] ] ); \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ),  8 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ),  7 ); \
+} while(0)
+// One round: G on the four columns of v (steps 0-3), then on the four diagonals (steps 4-7).
+#define ROUND8W(r)  \
+do { \
+   G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
+   G8W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
+   G8W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
+   G8W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
+   G8W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
+   G8W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
+   G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
+   G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+   // Ten rounds, one per row of blake2s_sigma.
+   ROUND8W( 0 );
+   ROUND8W( 1 );
+   ROUND8W( 2 );
+   ROUND8W( 3 );
+   ROUND8W( 4 );
+   ROUND8W( 5 );
+   ROUND8W( 6 );
+   ROUND8W( 7 );
+   ROUND8W( 8 );
+   ROUND8W( 9 );
+   // Feed-forward: fold both halves of v back into the chaining value.
+   for( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
+
+#undef G8W
+#undef ROUND8W
+   return 0;
+}
+// Resets S for an unkeyed, sequential hash of outlen bytes in each of the 8 lanes; always returns 0.
+int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
+{
+   blake2s_nway_param P[1];
+
+   P->digest_length = outlen;
+   P->key_length    = 0;
+   P->fanout        = 1;
+   P->depth         = 1;
+   P->leaf_length   = 0;
+   memset( P->node_offset, 0, sizeof( P->node_offset ) );  // was an 8-byte store through a uint64_t* cast into this 6-byte field
+   P->node_depth    = 0;
+   P->inner_length  = 0;
+   memset( P->salt,     0, sizeof( P->salt ) );
+   memset( P->personal, 0, sizeof( P->personal ) );
+
+   memset( S, 0, sizeof( blake2s_8way_state ) );
+   for( int i = 0; i < 8; ++i )
+      S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );
+
+   uint32_t p[8];              // the packed 32-byte parameter block viewed as eight 32-bit words
+   memcpy( p, P, sizeof p );   // copy instead of casting P to uint32_t* (avoids aliasing/alignment UB)
+   /* IV XOR ParamBlock */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
+   return 0;
+}
+// Absorbs inlen bytes per lane from the 8-way interleaved input `in`; lengths are converted to vectors with >>2. Returns 0.
+int blake2s_8way_update( blake2s_8way_state *S, const void *in,
+                         uint64_t inlen )
+{
+  __m256i *input = (__m256i*)in;
+  __m256i *buf = (__m256i*)S->buf;
+  const int bsize = BLAKE2S_BLOCKBYTES;
+  // NOTE(review): a block is compressed as soon as it fills, before blake2s_8way_final sets f[0]; confirm this is intended for inputs that are a multiple of 64 bytes.
+   while( inlen > 0 )
+   {
+      size_t left = S->buflen;
+      if( inlen >= bsize - left )
+      {
+         const size_t fill = bsize - left;   // lane bytes still needed to complete the buffered block
+         memcpy_256( buf + (left>>2), input, fill >> 2 );
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry into the high counter word
+         blake2s_8way_compress( S, buf );
+         S->buflen = 0;
+         input += ( fill >> 2 );   // advance only by what was consumed (was a whole block even when left != 0)
+         inlen -= fill;            // likewise; subtracting bsize could underflow inlen
+      }
+      else
+      {
+          memcpy_256( buf + ( left>>2 ), input, inlen>>2 );   // not enough for a block: just buffer it
+          S->buflen += (size_t) inlen;
+          input += ( inlen>>2 );
+          inlen -= inlen;
+      }
+   }
+   return 0;
+}
+// Pads and compresses the buffered data, then stores the 8 interleaved state vectors to out; outlen is not read. Returns 0.
+int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
+{
+   __m256i *buf = (__m256i*)S->buf;
+   // Count the buffered bytes, then raise the finalization flags read by the compress step.
+   S->t[0] += S->buflen;
+   S->t[1] += ( S->t[0] < S->buflen );
+   if ( S->last_node )
+      S->f[1] = ~0U;
+   S->f[0] = ~0U;
+   // Zero the unused tail of the block before the last compression.
+   memset_zero_256( buf + ( S->buflen>>2 ),
+                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
+   blake2s_8way_compress( S, buf );
+
+   for ( int i = 0; i < 8; ++i )
+      casti_m256i( out, i ) = S->h[ i ];
+   return 0;
+}
+
+
+#endif // __AVX2__
+// Compiled out by #if 0: one-shot scalar wrapper whose blake2s_state / blake2s_init* / blake2s_update / blake2s_final are not defined in this file.
+#if 0
+int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+{
+	blake2s_state S[1];
+
+	/* Verify parameters */
+	if ( NULL == in ) return -1;
+
+	if ( NULL == out ) return -1;
+
+	if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */
+
+	if( keylen > 0 )
+	{
+		if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+	}
+	else
+	{
+		if( blake2s_init( S, outlen ) < 0 ) return -1;
+	}
+
+	blake2s_update( S, ( uint8_t * )in, inlen );
+	blake2s_final( S, out, outlen );
+	return 0;
+}
+#endif
+
+#endif // __AVX__
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -0,0 +1,112 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+//#pragma once
+#ifndef __BLAKE2S_HASH_4WAY_H__
+#define __BLAKE2S_HASH_4WAY_H__ 1
+
+#if defined(__AVX__)
+
+#include "avxdefs.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+// BLAKE2s sizes in bytes, per lane.
+enum blake2s_constant
+{
+   BLAKE2S_BLOCKBYTES = 64,   // block size handled by one compress call
+   BLAKE2S_OUTBYTES   = 32,   // digest length passed to the init functions by the callers
+   BLAKE2S_KEYBYTES   = 32,
+   BLAKE2S_SALTBYTES  = 8,    // size of blake2s_nway_param.salt
+   BLAKE2S_PERSONALBYTES = 8  // size of blake2s_nway_param.personal
+};
+// Byte-packed 32-byte parameter block XORed into the IV by the init functions; trailing numbers are cumulative sizes.
+#pragma pack(push, 1)
+typedef struct __blake2s_nway_param
+{
+   uint8_t  digest_length; // 1
+   uint8_t  key_length;    // 2
+   uint8_t  fanout;        // 3
+   uint8_t  depth;         // 4
+   uint32_t leaf_length;   // 8
+   uint8_t  node_offset[6];// 14
+   uint8_t  node_depth;    // 15
+   uint8_t  inner_length;  // 16
+   // uint8_t  reserved[0];
+   uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
+   uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+} blake2s_nway_param;
+#pragma pack(pop)
+// Hash state for 4 lanes processed in parallel; the counters and flags are shared by all lanes.
+ALIGN( 64 ) typedef struct __blake2s_4way_state
+{
+   __m128i h[8];                            // chaining value, one vector per state word
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 4 ];  // one interleaved block awaiting compression
+   uint32_t t[2];                           // byte counter per lane: low word, high word
+   uint32_t f[2];                           // finalization flags, set by blake2s_4way_final
+   size_t   buflen;                         // bytes per lane currently held in buf
+   uint8_t  last_node;                      // when nonzero, final also sets f[1]
+} blake2s_4way_state ;
+
+int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
+int blake2s_4way_update( blake2s_4way_state *S, const void *in,
+                         uint64_t inlen );
+int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
+
+#if defined(__AVX2__)
+// Hash state for 8 lanes processed in parallel; the counters and flags are shared by all lanes.
+ALIGN( 64 ) typedef struct __blake2s_8way_state
+{
+   __m256i h[8];                            // chaining value, one vector per state word
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 8 ];  // one interleaved block awaiting compression
+   uint32_t t[2];                           // byte counter per lane: low word, high word
+   uint32_t f[2];                           // finalization flags, set by blake2s_8way_final
+   size_t   buflen;                         // bytes per lane currently held in buf
+   uint8_t  last_node;                      // when nonzero, final also sets f[1]
+} blake2s_8way_state ;
+
+int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
+int blake2s_8way_update( blake2s_8way_state *S, const void *in,
+                         uint64_t inlen );
+int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
+
+#endif
+
+#if 0
+	// Simple API
+//	int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+
+	// Direct Hash Mining Helpers
+	#define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */
+	#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // __AVX__
+
+#endif
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -1,26 +1,29 @@
-#include "algo-gate-api.h"
+#include "blake2s-gate.h"

 #include <string.h>
 #include <stdint.h>

 #include "sph-blake2s.h"

-static __thread blake2s_state s_midstate;
-static __thread blake2s_state s_ctx;
+static __thread blake2s_state blake2s_ctx;
+//static __thread blake2s_state s_ctx;
 #define MIDLEN 76

-void blake2s_hash(void *output, const void *input)
+void blake2s_hash( void *output, const void *input )
 {
-	unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
-	blake2s_state blake2_ctx __attribute__ ((aligned (64)));
-
-	blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
-	blake2s_update(&blake2_ctx, input, 80);
-	blake2s_final(&blake2_ctx, hash, BLAKE2S_OUTBYTES);
+   unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
+   blake2s_state ctx __attribute__ ((aligned (64)));
+  
+   memcpy( &ctx, &blake2s_ctx, sizeof ctx );
+   blake2s_update( &ctx, input+64, 16 );
+ 
+//	blake2s_init(&ctx, BLAKE2S_OUTBYTES);
+//	blake2s_update(&ctx, input, 80);
+	blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );

 	memcpy(output, hash, 32);
 }
-
+/*
 static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 {
 	s_ctx.buflen = MIDLEN;
@@ -28,7 +31,7 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
 	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
 }
-
+*/
 int scanhash_blake2s(int thr_id, struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done)
 {
@@ -46,13 +49,12 @@ int scanhash_blake2s(int thr_id, struct work *work,
        swab32_array( endiandata, pdata, 20 );

 	// midstate
-	blake2s_init(&s_midstate, BLAKE2S_OUTBYTES);
-	blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
-	memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
+	blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
+	blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );

 	do {
 		be32enc(&endiandata[19], n);
-		blake2s_hash_end(hash64, endiandata);
+		blake2s_hash( hash64, endiandata );
 		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
 			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
@@ -67,7 +69,7 @@ int scanhash_blake2s(int thr_id, struct work *work,

 	return 0;
 }
-
+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blake2s_get_max64 ()
 {
@@ -81,4 +83,4 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->get_max64 = (void*)&blake2s_get_max64;
  return true;
 };
-
+*/
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -1,21 +1,22 @@
 #include "blakecoin-gate.h"
-
-#if defined (BLAKECOIN_4WAY)
-
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>

-blake256r8_4way_context blakecoin_ctx;
+#if defined (BLAKECOIN_4WAY)
+
+blake256r8_4way_context blakecoin_4w_ctx;

 void blakecoin_4way_hash(void *state, const void *input)
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
     blake256r8_4way_context ctx;
-     memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
+
+     memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );
     blake256r8_4way( &ctx, input + (64<<2), 16 );
     blake256r8_4way_close( &ctx, vhash );
+
     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

@@ -31,58 +32,30 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-
-   if (opt_benchmark)
+   if ( opt_benchmark )
      HTarget = 0x7f;

-   // we need big endian data...
   swab32_array( edata, pdata, 20 );
-
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
-
-   blake256r8_4way_init( &blakecoin_ctx );
-   blake256r8_4way( &blakecoin_ctx, vdata, 64 );
+   blake256r8_4way_init( &blakecoin_4w_ctx );
+   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );

   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep,    n   );
      be32enc( noncep +1, n+1 );
      be32enc( noncep +2, n+2 );
      be32enc( noncep +3, n+3 );
-
-      blakecoin_4way_hash( hash, vdata );
      pdata[19] = n;
+      blakecoin_4way_hash( hash, vdata );

-      if (  hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) ) 
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
-      {
-           found[2] = true;
-           num_found++;
-           nonces[2] = n+2;
-           work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
-      {
-           found[3] = true;
-           num_found++;
-           nonces[3] = n+3;
-           work_set_target_ratio( work, hash+24 );
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;

@@ -90,15 +63,77 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
             && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
-
-   // workaround to prevent flood of hash reports when nonce range exhasuted
-   // and thread is spinning waiting for new work
-   if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
-   {
-      *hashes_done = 0;
-      sleep(1);
-   }
-
+   return num_found;
+}
+
+#endif
+
+#if defined(BLAKECOIN_8WAY)
+
+// File-scope 8-way blake256r8 context holding the midstate.
+// scanhash_blakecoin_8way() initialises it and absorbs the first 64 bytes of
+// the interleaved header into it; blakecoin_8way_hash() copies it before
+// hashing the remaining bytes.
+// NOTE(review): this is a plain global (not __thread), so all threads that
+// call scanhash_blakecoin_8way() write the same object - confirm that is safe.
+blake256r8_8way_context blakecoin_8w_ctx;
+
+// Finish eight hashes in parallel, starting from the cached midstate.
+//
+//   state  output buffer; the eight results are written at 32 byte strides
+//          (state, state+32, ... state+224).
+//   input  8-way interleaved block data; only the 16 bytes starting at byte
+//          offset 64<<3 (= 512) are fed to the hash here. The earlier part is
+//          expected to have been absorbed into blakecoin_8w_ctx already.
+//
+// Pointer arithmetic on void* is used throughout (GNU C extension, byte
+// granularity).
+void blakecoin_8way_hash( void *state, const void *input )
+{
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+     blake256r8_8way_context ctx;
+
+     // Work on a private copy so the shared midstate is left untouched.
+     memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );
+     blake256r8_8way( &ctx, input + (64<<3), 16 );
+     blake256r8_8way_close( &ctx, vhash );
+
+     // Split the interleaved result into eight separate outputs.
+     // The last argument (256) is presumably the bit length per hash - TODO
+     // confirm against the helper's definition.
+     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
+                              state+128, state+160, state+192, state+224,
+                              vhash, 256 );
+}
+
+// Scan nonces eight at a time, starting at work->data[19].
+//
+//   thr_id       index into work_restart[] used to detect a restart request
+//   work         supplies data[] and target[]; found nonces are stored in
+//                work->nonces[] and pdata[19] is updated while scanning
+//   max_nonce    scanning stops once n reaches this value
+//   hashes_done  receives n - first_nonce + 1 when the loop ends
+//
+// Returns the number of nonces stored in work->nonces[].
+// The loop ends after the first batch that yields at least one hit, when n
+// reaches max_nonce, or when work_restart[thr_id].restart is set.
+//
+// NOTE(review): n has already been advanced past the last batch when
+// *hashes_done is computed, so the "+ 1" looks like it reports one hash more
+// than was actually tested - confirm whether that is intended.
+// NOTE(review): up to 8 entries can be written to work->nonces[] in one batch;
+// its capacity is not visible here - confirm it holds at least 8.
+// NOTE(review): a batch starting just below max_nonce also tests nonces up to
+// n+7, which may lie beyond max_nonce - confirm callers tolerate that.
+int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   // Word 19 (the nonce) of the first lane in the 8-way interleaved buffer;
+   // the other lanes' nonces follow at noncep+1 .. noncep+7.
+   uint32_t *noncep = vdata + 152;   // 19*8
+   int num_found = 0;
+   // Benchmark mode replaces the real target word with a fixed value.
+   if ( opt_benchmark )
+      HTarget = 0x7f;
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+   // The same 20 word header is replicated into all eight lanes.
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+   // Midstate: absorb the first 64 bytes once; blakecoin_8way_hash() resumes
+   // from a copy of this context for every batch.
+   blake256r8_8way_init( &blakecoin_8w_ctx );
+   blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );
+
+   do {
+      // Give each lane its own consecutive nonce.
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;
+      blakecoin_8way_hash( hash, vdata );
+
+      // Lane i's 8 word hash starts at hash + 8*i. Word 7 is compared first
+      // as a cheap filter before the full target test.
+      for ( int i = 0; i < 8; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
   return num_found;
 }

--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -8,55 +8,21 @@ int64_t blakecoin_get_max64 ()
 //  return 0x3fffffLL;
 }

-// Blakecoin 4 way hashes so fast it runs out of nonces.
-// This is an attempt to solve this but the result may be
-// to rehash old nonces until new work is received.
-void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
-                     uint32_t *end_nonce_ptr, bool clean_job )
-{
-   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
- 
-//   if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
-//      algo_gate.stratum_gen_work( &stratum, g_work );
-
-   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) 
-   || ( *nonceptr >= *end_nonce_ptr )
-   || ( (  work->job_id != g_work->job_id ) && clean_job ) )
-/*
-   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
-      && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
-         || ( work->job_id != g_work->job_id ) ) )
-*/   
-   {
-     work_free( work );
-     work_copy( work, g_work );
-     *nonceptr = 0xffffffffU / opt_n_threads * thr_id;
-     if ( opt_randomize )
-       *nonceptr += ( (rand() *4 ) & UINT32_MAX ) / opt_n_threads;
-     *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20; 
-// try incrementing the xnonce to chsnge the data
-//     for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
-   }
-   else
-       ++(*nonceptr);
-}
-
-
 // vanilla uses default gen merkle root, otherwise identical to blakecoin
 bool register_vanilla_algo( algo_gate_t* gate )
 {
-#if defined(BLAKECOIN_4WAY)
-//  four_way_not_tested();
+#if defined(BLAKECOIN_8WAY)
+  gate->scanhash  = (void*)&scanhash_blakecoin_8way;
+  gate->hash      = (void*)&blakecoin_8way_hash;
+
+#elif defined(BLAKECOIN_4WAY)
  gate->scanhash  = (void*)&scanhash_blakecoin_4way;
  gate->hash      = (void*)&blakecoin_4way_hash;
-//  gate->get_new_work = (void*)&bc4w_get_new_work;
-//  blakecoin_4way_init( &blake_4way_init_ctx );
 #else
  gate->scanhash = (void*)&scanhash_blakecoin;
  gate->hash     = (void*)&blakecoinhash;
-//  blakecoin_init( &blake_init_ctx );
 #endif
-  gate->optimizations = AVX2_OPT;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->get_max64 = (void*)&blakecoin_get_max64;
  return true;
 }
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -1,12 +1,21 @@
 #ifndef __BLAKECOIN_GATE_H__
-#define __BLAKECOIN_GATE_H__
+#define __BLAKECOIN_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
+#if defined(__AVX__)
  #define BLAKECOIN_4WAY
 #endif
+#if defined(__AVX2__)
+  #define BLAKECOIN_8WAY
+#endif
+
+#if defined (BLAKECOIN_8WAY)
+void blakecoin_8way_hash(void *state, const void *input);
+int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif

 #if defined (BLAKECOIN_4WAY)
 void blakecoin_4way_hash(void *state, const void *input);
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -12,11 +12,11 @@ static __thread blake256_4way_context blake_mid;
 void decred_hash_4way( void *state, const void *input )
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-     uint32_t hash0[8] __attribute__ ((aligned (32)));
-     uint32_t hash1[8] __attribute__ ((aligned (32)));
-     uint32_t hash2[8] __attribute__ ((aligned (32)));
-     uint32_t hash3[8] __attribute__ ((aligned (32)));
-     void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
+//     uint32_t hash0[8] __attribute__ ((aligned (32)));
+//     uint32_t hash1[8] __attribute__ ((aligned (32)));
+//     uint32_t hash2[8] __attribute__ ((aligned (32)));
+//     uint32_t hash3[8] __attribute__ ((aligned (32)));
+     const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
     blake256_4way_context ctx __attribute__ ((aligned (64)));

@@ -38,7 +38,6 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;

   // copy to buffer guaranteed to be aligned.
@@ -52,7 +51,6 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,

   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
      * noncep    = n;
      *(noncep+1) = n+1;
      *(noncep+2) = n+2;
@@ -60,35 +58,12 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,

      decred_hash_4way( hash, vdata );

-      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
      {
-          work_set_target_ratio( work, hash );
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          pdata[DECRED_NONCE_INDEX] = n;
-      }
-      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
-      {
-          work_set_target_ratio( work, hash+8 );
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-      }
-      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
-      {
-          work_set_target_ratio( work, hash+16 );
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-      }
-
-      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
-      {
-          work_set_target_ratio( work, hash+24 );
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
+          pdata[DECRED_NONCE_INDEX] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
  } while ( (num_found == 0) && (n < max_nonce) 
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -111,12 +111,8 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
    const uint32_t first_nonce = pdata[19];
    const uint32_t Htarg = ptarget[7];
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;
-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1

 //    uint32_t _ALIGN(32) hash64[8];
 //    uint32_t _ALIGN(32) endiandata[32];
@@ -150,47 +146,19 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
        {
           uint32_t mask = masks[m];
           do {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+              be32enc( noncep,   n   );
+              be32enc( noncep+2, n+1 );
+              be32enc( noncep+4, n+2 );
+              be32enc( noncep+6, n+3 );

              pentablakehash_4way( hash, vdata );

-              // return immediately on nonce found, only one submit
-              if ( ( !(hash[7] & mask) ) && fulltest( hash, ptarget ) )
+              for ( int i = 0; i < 4; i++ )
+              if ( !( (hash+(i<<3))[7] & mask )
+                  && fulltest( hash+(i<<3), ptarget ) )
              {
-                  found[0] = true;
-                  num_found++;
-                  nonces[0] = n;
-                  pdata[19] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( (! ((hash+8)[7] & mask) ) && fulltest( hash+8, ptarget ) )
-              {
-                  found[1] = true;
-                  num_found++;
-                  nonces[1] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( ( !((hash+16)[7] & mask) ) && fulltest( hash+16, ptarget ) )
-              {
-                  found[2] = true;
-                  num_found++;
-                  nonces[2] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( ( !((hash+24)[7] & mask) ) && fulltest( hash+24, ptarget ) )
-              {
-                  found[3] = true;
-                  num_found++;
-                  nonces[3] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, hash+(i<<3) );
              }
              n += 4;

--- a/algo/bmw/bmw-hash-4way.c
+++ b/algo/bmw/bmw-hash-4way.c
@@ -49,11 +49,6 @@ extern "C"{

 // BMW256

-// BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash
-// while lanes 1 & 3 produce invalid hash. The cause is not known.
-
-
-
 static const sph_u32 IV256[] = {
 	SPH_C32(0x40414243), SPH_C32(0x44454647),
 	SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
@@ -121,16 +116,14 @@ static const sph_u64 IV512[] = {
   mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \
                ( ( (j) + (off) ) & 0xF ) + 1 )

-// The multiplication in this macro is a possible cause of the lane
-// corruption but a vectorized mullo did not help.
 #define add_elt_s( M, H, j ) \
   _mm_xor_si128( \
-      _mm_add_epi32( \
-            _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
-                                          rol_off_32( M, j, 3 ) ), \
-                           rol_off_32( M, j, 10 ) ), \
-            _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) \
-                   ), H[ ( (j)+7 ) & 0xF ] )
+       _mm_add_epi32( \
+             _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \
+                                           rol_off_32( M, j, 3 ) ), \
+                            rol_off_32( M, j, 10 ) ), \
+       _mm_set1_epi32( ( (j)+16 ) * SPH_C32(0x05555555UL) ) ), \
+   H[ ( (j)+7 ) & 0xF ] )


 #define expand1s( qt, M, H, i ) \
@@ -447,22 +440,22 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
 {
   __m128i qt[32], xl, xh; \

-   qt[ 0] = ss0( Ws0 ) + H[ 1];
-   qt[ 1] = ss1( Ws1 ) + H[ 2];
-   qt[ 2] = ss2( Ws2 ) + H[ 3];
-   qt[ 3] = ss3( Ws3 ) + H[ 4];
-   qt[ 4] = ss4( Ws4 ) + H[ 5];
-   qt[ 5] = ss0( Ws5 ) + H[ 6];
-   qt[ 6] = ss1( Ws6 ) + H[ 7];
-   qt[ 7] = ss2( Ws7 ) + H[ 8];
-   qt[ 8] = ss3( Ws8 ) + H[ 9];
-   qt[ 9] = ss4( Ws9 ) + H[10];
-   qt[10] = ss0( Ws10) + H[11];
-   qt[11] = ss1( Ws11) + H[12];
-   qt[12] = ss2( Ws12) + H[13];
-   qt[13] = ss3( Ws13) + H[14];
-   qt[14] = ss4( Ws14) + H[15];
-   qt[15] = ss0( Ws15) + H[ 0];
+   qt[ 0] = _mm_add_epi32( ss0( Ws0 ), H[ 1] );
+   qt[ 1] = _mm_add_epi32( ss1( Ws1 ), H[ 2] );
+   qt[ 2] = _mm_add_epi32( ss2( Ws2 ), H[ 3] );
+   qt[ 3] = _mm_add_epi32( ss3( Ws3 ), H[ 4] );
+   qt[ 4] = _mm_add_epi32( ss4( Ws4 ), H[ 5] );
+   qt[ 5] = _mm_add_epi32( ss0( Ws5 ), H[ 6] );
+   qt[ 6] = _mm_add_epi32( ss1( Ws6 ), H[ 7] );
+   qt[ 7] = _mm_add_epi32( ss2( Ws7 ), H[ 8] );
+   qt[ 8] = _mm_add_epi32( ss3( Ws8 ), H[ 9] );
+   qt[ 9] = _mm_add_epi32( ss4( Ws9 ), H[10] );
+   qt[10] = _mm_add_epi32( ss0( Ws10), H[11] );
+   qt[11] = _mm_add_epi32( ss1( Ws11), H[12] );
+   qt[12] = _mm_add_epi32( ss2( Ws12), H[13] );
+   qt[13] = _mm_add_epi32( ss3( Ws13), H[14] );
+   qt[14] = _mm_add_epi32( ss4( Ws14), H[15] );
+   qt[15] = _mm_add_epi32( ss0( Ws15), H[ 0] );
   qt[16] = expand1s( qt, M, H, 16 );
   qt[17] = expand1s( qt, M, H, 17 );
   qt[18] = expand2s( qt, M, H, 18 );
@@ -738,24 +731,24 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )

 void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
 {
-   __m256i qt[32], xl, xh; \
+   __m256i qt[32], xl, xh;

-   qt[ 0] = sb0( Wb0 ) + H[ 1]; 
-   qt[ 1] = sb1( Wb1 ) + H[ 2]; 
-   qt[ 2] = sb2( Wb2 ) + H[ 3]; 
-   qt[ 3] = sb3( Wb3 ) + H[ 4]; 
-   qt[ 4] = sb4( Wb4 ) + H[ 5]; 
-   qt[ 5] = sb0( Wb5 ) + H[ 6]; 
-   qt[ 6] = sb1( Wb6 ) + H[ 7]; 
-   qt[ 7] = sb2( Wb7 ) + H[ 8]; 
-   qt[ 8] = sb3( Wb8 ) + H[ 9]; 
-   qt[ 9] = sb4( Wb9 ) + H[10]; 
-   qt[10] = sb0( Wb10) + H[11]; 
-   qt[11] = sb1( Wb11) + H[12]; 
-   qt[12] = sb2( Wb12) + H[13]; 
-   qt[13] = sb3( Wb13) + H[14];
-   qt[14] = sb4( Wb14) + H[15]; 
-   qt[15] = sb0( Wb15) + H[ 0]; 
+   qt[ 0] = _mm256_add_epi64( sb0( Wb0 ), H[ 1] ); 
+   qt[ 1] = _mm256_add_epi64( sb1( Wb1 ), H[ 2] ); 
+   qt[ 2] = _mm256_add_epi64( sb2( Wb2 ), H[ 3] ); 
+   qt[ 3] = _mm256_add_epi64( sb3( Wb3 ), H[ 4] ); 
+   qt[ 4] = _mm256_add_epi64( sb4( Wb4 ), H[ 5] ); 
+   qt[ 5] = _mm256_add_epi64( sb0( Wb5 ), H[ 6] ); 
+   qt[ 6] = _mm256_add_epi64( sb1( Wb6 ), H[ 7] ); 
+   qt[ 7] = _mm256_add_epi64( sb2( Wb7 ), H[ 8] ); 
+   qt[ 8] = _mm256_add_epi64( sb3( Wb8 ), H[ 9] ); 
+   qt[ 9] = _mm256_add_epi64( sb4( Wb9 ), H[10] ); 
+   qt[10] = _mm256_add_epi64( sb0( Wb10), H[11] ); 
+   qt[11] = _mm256_add_epi64( sb1( Wb11), H[12] ); 
+   qt[12] = _mm256_add_epi64( sb2( Wb12), H[13] ); 
+   qt[13] = _mm256_add_epi64( sb3( Wb13), H[14] );
+   qt[14] = _mm256_add_epi64( sb4( Wb14), H[15] ); 
+   qt[15] = _mm256_add_epi64( sb0( Wb15), H[ 0] ); 
   qt[16] = expand1b( qt, M, H, 16 ); 
   qt[17] = expand1b( qt, M, H, 17 ); 
   qt[18] = expand2b( qt, M, H, 18 ); 
@@ -868,7 +861,7 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
 } 

 // BMW256
-/*
+
 static const uint32_t final_s[16][4] =
 {
   { 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 },
@@ -888,7 +881,7 @@ static const uint32_t final_s[16][4] =
   { 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae },
   { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf }
 };
-*/
+/*
 static const __m128i final_s[16] =
 {
   { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 },
@@ -908,7 +901,7 @@ static const __m128i final_s[16] =
   { 0xaaaaaaaeaaaaaaae, 0xaaaaaaaeaaaaaaae },
   { 0xaaaaaaafaaaaaaaf, 0xaaaaaaafaaaaaaaf }
 };
-
+*/
 static void
 bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
 {
@@ -984,7 +977,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
   }
   memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 );
   buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n );
-   buf[ (buf_size - 4) >> 2 ] = mm_zero;
+   buf[ (buf_size - 4) >> 2 ] = m128_zero;
   compress_small( buf, h, h2 );

   for ( u = 0; u < 16; u ++ )
--- a/algo/bmw/bmw.test
+++ b/algo/bmw/bmw.test
--- a/algo/bmw/sse2/bmw.c
+++ b/algo/bmw/sse2/bmw.c
@@ -477,7 +477,7 @@ do { \
        for (u = 0; u < 16; u ++) \
        sph_enc64le_aligned(data + 8 * u, h2[u]); \
        dh = h1; \
-        h = final_b; \
+        h = (sph_u64*)final_b; \
    } \
    /* end wrapped for break loop */ \
    out = dst; \
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -0,0 +1,205 @@
+#if defined(__AVX2__)
+
+#include <stdbool.h>
+#include <unistd.h>
+#include <memory.h>
+#include "cube-hash-2way.h"
+
+// 2x128
+
+// Run sp->rounds rounds of the state permutation over sp->h.
+//
+// The eight __m256i state words are loaded into locals, permuted in
+// registers and stored back once after the last round. Aligned loads and
+// stores are used, so sp->h must be at least 32 byte aligned (the struct
+// declares it aligned to 64).
+static void transform_2way( cube_2way_context *sp )
+{
+    int r;
+    const int rounds = sp->rounds;
+
+    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+
+    // The cast binds before the addition: these are vector elements 0..7.
+    x0 = _mm256_load_si256( (__m256i*)sp->h     );
+    x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
+    x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
+    x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
+    x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
+    x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
+    x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
+    x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
+
+    for ( r = 0; r < rounds; ++r )
+    {
+        // x4..x7 += x0..x3 (per 32 bit element).
+        x4 = _mm256_add_epi32( x0, x4 );
+        x5 = _mm256_add_epi32( x1, x5 );
+        x6 = _mm256_add_epi32( x2, x6 );
+        x7 = _mm256_add_epi32( x3, x7 );
+        // Exchange x0<->x2 and x1<->x3 while rotating every 32 bit element
+        // left by 7 (shift left 7 combined with shift right 25).
+        y0 = x2;
+        y1 = x3;
+        y2 = x0;
+        y3 = x1;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0,  7 ),
+                               _mm256_srli_epi32( y0, 25 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1,  7 ),
+                               _mm256_srli_epi32( y1, 25 ) );
+        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2,  7 ),
+                               _mm256_srli_epi32( y2, 25 ) );
+        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3,  7 ),
+                               _mm256_srli_epi32( y3, 25 ) );
+        // x0..x3 ^= x4..x7.
+        x0 = _mm256_xor_si256( x0, x4 );
+        x1 = _mm256_xor_si256( x1, x5 );
+        x2 = _mm256_xor_si256( x2, x6 );
+        x3 = _mm256_xor_si256( x3, x7 );
+        // Project helper; presumably swaps the two 64 bit halves of each
+        // 128 bit lane - TODO confirm against its definition in avxdefs.h.
+        x4 = mm256_swap128_64( x4 );
+        x5 = mm256_swap128_64( x5 );
+        x6 = mm256_swap128_64( x6 );
+        x7 = mm256_swap128_64( x7 );
+        // Second half of the round: add again...
+        x4 = _mm256_add_epi32( x0, x4 );
+        x5 = _mm256_add_epi32( x1, x5 );
+        x6 = _mm256_add_epi32( x2, x6 );
+        x7 = _mm256_add_epi32( x3, x7 );
+        // ...exchange x0<->x1 and x2<->x3 while rotating left by 11
+        // (shift left 11 combined with shift right 21)...
+        y0 = x1;
+        y1 = x0;
+        y2 = x3;
+        y3 = x2;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
+                               _mm256_srli_epi32( y0, 21 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
+                               _mm256_srli_epi32( y1, 21 ) );
+        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
+                               _mm256_srli_epi32( y2, 21 ) );
+        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
+                               _mm256_srli_epi32( y3, 21 ) );
+        // ...and xor again.
+        x0 = _mm256_xor_si256( x0, x4 );
+        x1 = _mm256_xor_si256( x1, x5 );
+        x2 = _mm256_xor_si256( x2, x6 );
+        x3 = _mm256_xor_si256( x3, x7 );
+        // Project helper; presumably swaps the two 32 bit halves of each
+        // 64 bit element - TODO confirm against its definition in avxdefs.h.
+        x4 = mm256_swap64_32( x4 );
+        x5 = mm256_swap64_32( x5 );
+        x6 = mm256_swap64_32( x6 );
+        x7 = mm256_swap64_32( x7 );
+    }
+
+    // Write the permuted state back to the context.
+    _mm256_store_si256( (__m256i*)sp->h,     x0 );
+    _mm256_store_si256( (__m256i*)sp->h + 1, x1 );
+    _mm256_store_si256( (__m256i*)sp->h + 2, x2 );
+    _mm256_store_si256( (__m256i*)sp->h + 3, x3 );
+    _mm256_store_si256( (__m256i*)sp->h + 4, x4 );
+    _mm256_store_si256( (__m256i*)sp->h + 5, x5 );
+    _mm256_store_si256( (__m256i*)sp->h + 6, x6 );
+    _mm256_store_si256( (__m256i*)sp->h + 7, x7 );
+
+}
+
+// Initialised context kept at file scope. cube_2way_init() builds the
+// initial state here and cube_2way_reinit() copies it back out, which avoids
+// repeating the initialisation transforms.
+// It is a plain global (not __thread), shared by every caller.
+cube_2way_context cube_2way_ctx_cache __attribute__ ((aligned (64)));
+
+// Reset *sp to the initialised state saved in cube_2way_ctx_cache by the
+// most recent cube_2way_init() call. Always returns 0.
+int cube_2way_reinit( cube_2way_context *sp )
+{
+   memcpy( sp, &cube_2way_ctx_cache, sizeof *sp );
+   return 0;
+}
+
+// Build the initial state for both lanes and hand a copy to the caller.
+//
+//   sp          receives a copy of the initialised context
+//   hashbitlen  digest size in bits; stored as hashbitlen/128
+//   rounds      number of rounds run by every transform_2way() call
+//   blockbytes  message block size in bytes; stored as blockbytes/16
+//
+// The state is built in the file-scope cube_2way_ctx_cache, not in *sp, so
+// each call overwrites the cache that cube_2way_reinit() copies from.
+// Always returns 0.
+int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
+                       int blockbytes )
+{
+    int i;
+
+    // all sizes of __m128i
+    cube_2way_ctx_cache.hashlen   = hashbitlen/128;
+    cube_2way_ctx_cache.blocksize = blockbytes/16;
+    cube_2way_ctx_cache.rounds    = rounds;
+    cube_2way_ctx_cache.pos       = 0;
+
+    for ( i = 0; i < 8; ++i )
+       cube_2way_ctx_cache.h[i] = m256_zero;
+
+    // _mm256_set_epi32 takes its arguments from the highest element down, so
+    // each 128 bit half of h[0] holds, from the lowest 32 bit word up:
+    // hashbitlen/8, blockbytes, rounds, 0.
+    cube_2way_ctx_cache.h[0] = _mm256_set_epi32(
+                                   0, rounds, blockbytes, hashbitlen / 8,
+                                   0, rounds, blockbytes, hashbitlen / 8 );
+
+    // Ten transform calls, each running `rounds` rounds.
+    for ( i = 0; i < 10; ++i )
+       transform_2way( &cube_2way_ctx_cache );
+
+    memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
+    return 0;
+}
+
+
+// Absorb message data into the state.
+//
+// One 256 bit vector is read from data for every 16 bytes of size, so size
+// counts bytes per 128 bit lane. data is assumed to be aligned to 256 bits
+// and size to be a multiple of 128 bits; current usage is 64 or 80 bytes.
+// Always returns 0.
+int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
+{
+    const __m256i *src = (__m256i*)data;
+    const int nvec = size / 16;
+
+    for ( int k = 0; k < nvec; k++ )
+    {
+        __m256i *slot = &sp->h[ sp->pos ];
+
+        *slot = _mm256_xor_si256( *slot, src[k] );
+        if ( ++sp->pos == sp->blocksize )
+        {
+           // Block complete: permute the state and start the next block.
+           transform_2way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    return 0;
+}
+
+// Finalise the hash held in *sp and write the digest to output.
+//
+//   sp      context that has absorbed the message; modified by this call
+//   output  receives sp->hashlen __m256i vectors copied from the start of
+//           the state. It is written through a __m256i pointer, so it should
+//           be 32 byte aligned.
+//
+// Always returns 0.
+int cube_2way_close( cube_2way_context *sp, void *output )
+{
+    __m256i *hash = (__m256i*)output;
+    int i;
+
+    // Padding: xor 0x80 into the lowest byte of each 128 bit half of the
+    // current block word, then permute.
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
+                    _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
+                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+    transform_2way( sp );
+
+    // Finalisation: flip a bit in the top 32 bit word of each half of h[7]...
+    sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
+                                                             1,0,0,0 ) );
+    // ...then run ten more transforms on the caller's state. These must act
+    // on sp: running them on cube_2way_ctx_cache would leave the digest
+    // without its final rounds and would also scramble the cached initial
+    // state that cube_2way_reinit() hands out.
+    for ( i = 0; i < 10; ++i )
+       transform_2way( sp );
+
+    for ( i = 0; i < sp->hashlen; i++ )
+       hash[i] = sp->h[i];
+
+    return 0;
+}
+
+// Absorb the final message data and finalise in one call.
+//
+//   sp      context to update; modified by this call
+//   output  receives sp->hashlen __m256i vectors copied from the start of
+//           the state. It is written through a __m256i pointer, so it should
+//           be 32 byte aligned.
+//   data    message data, read as 256 bit vectors
+//   size    length in bytes per 128 bit lane; one vector is read for every
+//           16 bytes of size
+//
+// Always returns 0.
+int cube_2way_update_close( cube_2way_context *sp, void *output,
+                               const void *data, size_t size )
+{
+    const int len = size / 16;
+    const __m256i *in = (__m256i*)data;
+    __m256i *hash = (__m256i*)output;
+    int i;
+
+    // Absorb: xor each vector into the current block word and permute the
+    // state whenever a block is complete.
+    for ( i = 0; i < len; i++ )
+    {
+        sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
+        {
+           transform_2way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // Padding: xor 0x80 into the lowest byte of each 128 bit half of the
+    // current block word, then permute.
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
+                    _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
+                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+    transform_2way( sp );
+
+    // Finalisation: flip a bit in the top 32 bit word of each half of h[7]...
+    sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
+                                                             1,0,0,0 ) );
+    // ...then run ten more transforms on the caller's state. These must act
+    // on sp: running them on cube_2way_ctx_cache would leave the digest
+    // without its final rounds and would also scramble the cached initial
+    // state that cube_2way_reinit() hands out.
+    for ( i = 0; i < 10; ++i )
+       transform_2way( sp );
+
+    for ( i = 0; i < sp->hashlen; i++ )
+       hash[i] = sp->h[i];
+
+    return 0;
+}
+
+#endif
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -0,0 +1,36 @@
+#ifndef CUBE_HASH_2WAY_H__
+#define CUBE_HASH_2WAY_H__
+
+#if defined(__AVX2__)
+
+#include <stdint.h>
+#include "avxdefs.h"
+
+// 2x128, 2 way parallel using 256 bit AVX2 vectors: every __m256i holds one
+// 128 bit word for each of the two lanes.
+
+struct _cube_2way_context
+{
+    int hashlen;           // digest length in __m128i units (hashbitlen/128)
+    int rounds;            // rounds run by each transform_2way() call
+    int blocksize;         // block size in __m128i units (blockbytes/16)
+    int pos;               // number of __m128i read into x from current block
+    // State words; accessed with aligned 256 bit loads and stores.
+    __m256i h[8] __attribute__ ((aligned (64)));
+};
+
+typedef struct _cube_2way_context cube_2way_context;
+
+int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
+                       int blockbytes );
+// reinitialize context with same parameters, much faster.
+int cube_2way_reinit( cube_2way_context *sp );
+
+int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
+
+int cube_2way_close( cube_2way_context *sp, void *output );
+
+int cube_2way_update_close( cube_2way_context *sp, void *output,
+                            const void *data, size_t size );
+
+
+#endif
+#endif
--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -129,7 +129,7 @@ static void transform( cubehashParam *sp )
 #endif
 }  // transform

-// Ccubehash context initializing is very expensive.
+// Cubehash context initializing is very expensive.
 // Cache the intial value for faster reinitializing.
 cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));

--- a/algo/echo/aes_ni/architectures
+++ b/algo/echo/aes_ni/architectures
@@ -1,2 +0,0 @@
-amd64
-x86
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -14,18 +14,20 @@
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */
+#if defined(__AES__)

 #include <memory.h>
 #include "miner.h"
 #include "hash_api.h"
-#include "vperm.h"
-
+//#include "vperm.h"
+#include <immintrin.h>
+/*
 #ifndef NO_AES_NI
 #include <wmmintrin.h>
 #else
 #include <tmmintrin.h>
 #endif
-
+*/

 MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
 MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -246,7 +248,8 @@ void DumpState(__m128i *ps)
 void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
 {
 	unsigned int r, b, i, j;
-	__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
+//      __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
+	__m128i t1, t2, s2, k1;
 	__m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 


@@ -396,7 +399,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
 {
 	int i, j;

-	ctx->k = _mm_xor_si128(ctx->k, ctx->k);
+        ctx->k = _mm_setzero_si128(); 
 	ctx->processed_bits = 0;
 	ctx->uBufferBytes = 0;

@@ -742,4 +745,4 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
 	return SUCCESS;
 }

-
+#endif
--- a/algo/echo/aes_ni/implementors
+++ b/algo/echo/aes_ni/implementors
@@ -1 +0,0 @@
-Çağdaş Çalık
--- a/algo/echo/aes_ni/vperm.h
+++ b/algo/echo/aes_ni/vperm.h
@@ -1,120 +0,0 @@
-/*
- * file        : vperm.h
- * version     : 1.0.208
- * date        : 14.12.2010
- * 
- * vperm implementation of AES s-box 
- *
- * Credits: Adapted from Mike Hamburg's AES implementation, http://crypto.stanford.edu/vpaes/
- *
- * Cagdas Calik
- * ccalik@metu.edu.tr
- * Institute of Applied Mathematics, Middle East Technical University, Turkey.
- *
- */
-
-#ifndef VPERM_H
-#define VPERM_H
-
-#include "algo/sha/sha3_common.h"
-#include <tmmintrin.h>
-
-/*
-extern const unsigned int _k_s0F[];
-extern const unsigned int _k_ipt[];
-extern const unsigned int _k_opt[];
-extern const unsigned int _k_inv[];
-extern const unsigned int _k_sb1[];
-extern const unsigned int _k_sb2[];
-extern const unsigned int _k_sb3[];
-extern const unsigned int _k_sb4[];
-extern const unsigned int _k_sb5[];
-extern const unsigned int _k_sb7[];
-extern const unsigned int _k_sbo[];
-extern const unsigned int _k_h63[];
-extern const unsigned int _k_hc6[];
-extern const unsigned int _k_h5b[];
-extern const unsigned int _k_h4e[];
-extern const unsigned int _k_h0e[];
-extern const unsigned int _k_h15[];
-extern const unsigned int _k_aesmix1[];
-extern const unsigned int _k_aesmix2[];
-extern const unsigned int _k_aesmix3[];
-extern const unsigned int _k_aesmix4[];
-*/
-
-// input: x, table
-// output: x
-#define TRANSFORM(x, table, t1, t2)\
-	t1 = _mm_andnot_si128(M128(_k_s0F), x);\
-	t1 = _mm_srli_epi32(t1, 4);\
-	x  = _mm_and_si128(x, M128(_k_s0F));\
-	t1 = _mm_shuffle_epi8(*((__m128i*)table + 1), t1);\
-	x  = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
-	x  = _mm_xor_si128(x, t1)
-
-#if 0
-// compiled erroneously with 32-bit msc compiler
-	t2 = _mm_shuffle_epi8(table[0], x);\
-	x  = _mm_shuffle_epi8(table[1], t1);\
-	x  = _mm_xor_si128(x, t2)
-#endif
-
-// input: x
-// output: t2, t3
-#define SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4)\
-	t1 = _mm_andnot_si128(M128(_k_s0F), x);\
-	t1 = _mm_srli_epi32(t1, 4);\
-	x  = _mm_and_si128(x, M128(_k_s0F));\
-	t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 1), x);\
-	x  = _mm_xor_si128(x, t1);\
-	t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t1);\
-	t3 = _mm_xor_si128(t3, t2);\
-	t4 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), x);\
-	t4 = _mm_xor_si128(t4, t2);\
-	t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t3);\
-	t2 = _mm_xor_si128(t2, x);\
-	t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t4);\
-	t3 = _mm_xor_si128(t3, t1);\
-
-
-// input: x1, x2, table
-// output: y
-#define VPERM_LOOKUP(x1, x2, table, y, t)\
-	t = _mm_shuffle_epi8(*((__m128i*)table + 0), x1);\
-	y = _mm_shuffle_epi8(*((__m128i*)table + 1), x2);\
-	y = _mm_xor_si128(y, t)
-
-
-// input: x
-// output: x
-#define SUBSTITUTE_VPERM(x, t1, t2, t3, t4)  \
-	TRANSFORM(x, _k_ipt, t1, t2);\
-	SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
-	VPERM_LOOKUP(t2, t3, _k_sbo, x, t1);\
-	x = _mm_xor_si128(x, M128(_k_h63))
-
-
-// input: x
-// output: x
-#define AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3) \
-	SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
-	VPERM_LOOKUP(t2, t3, _k_sb1, s1, t1);\
-	VPERM_LOOKUP(t2, t3, _k_sb2, s2, t1);\
-	s3 = _mm_xor_si128(s1, s2);\
-	x = _mm_shuffle_epi8(s2, M128(_k_aesmix1));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s3, M128(_k_aesmix2)));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix3)));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix4)));\
-	x = _mm_xor_si128(x, M128(_k_h5b))
-
-
-// input: x
-// output: x
-#define AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3) \
-	TRANSFORM(x, _k_ipt, t1, t2);\
-	AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3);\
-	TRANSFORM(x, _k_opt, t1, t2)
-
-#endif // VPERM_H
-
--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "myrgr-gate.h"

 #include <stdio.h>
 #include <stdlib.h>
@@ -10,8 +10,6 @@
 #else
  #include "aes_ni/hash-groestl.h"
 #endif
-
-#include <openssl/sha.h>
 #include "algo/sha/sph_sha2.h"

 typedef struct {
@@ -20,11 +18,7 @@ typedef struct {
 #else
    hashState_groestl       groestl;
 #endif
-#ifndef USE_SPH_SHA
-   SHA256_CTX         sha;
-#else
-   sph_sha256_context sha;
-#endif
+    sph_sha256_context sha;
 } myrgr_ctx_holder;

 myrgr_ctx_holder myrgr_ctx;
@@ -36,44 +30,37 @@ void init_myrgr_ctx()
 #else
     init_groestl (&myrgr_ctx.groestl, 64 );
 #endif
-#ifndef USE_SPH_SHA
-   SHA256_Init( &myrgr_ctx.sha );
-#else
-   sph_sha256_init( &myrgr_ctx.sha );
-#endif
+     sph_sha256_init(&myrgr_ctx.sha);
 }

-void myriadhash( void *output, const void *input )
+void myriad_hash(void *output, const void *input)
 {
-     myrgr_ctx_holder ctx __attribute__ ((aligned (64)));
-     memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
-     uint32_t hash[16] __attribute__ ((aligned (64))); 
+        myrgr_ctx_holder ctx;
+        memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
+
+ 	uint32_t _ALIGN(32) hash[16];

 #ifdef NO_AES_NI
-     sph_groestl512(&ctx.groestl, input, 80);
-     sph_groestl512_close(&ctx.groestl, hash);
+	sph_groestl512(&ctx.groestl, input, 80);
+	sph_groestl512_close(&ctx.groestl, hash);
 #else
-     update_and_final_groestl( &ctx.groestl, (char*)input,
-                               (const char*)input, 640 );
+        update_groestl( &ctx.groestl, (char*)input, 640 );
+        final_groestl( &ctx.groestl, (char*)hash);
 #endif

-#ifndef USE_SPH_SHA
-     SHA256_Update( &ctx.sha, hash, 64 );
-     SHA256_Final( (unsigned char*) hash, &ctx.sha );
-#else
-     sph_sha256(&ctx.sha, hash, 64);
-     sph_sha256_close(&ctx.sha, hash);
-#endif
-     memcpy(output, hash, 32);
+	sph_sha256(&ctx.sha, hash, 64);
+	sph_sha256_close(&ctx.sha, hash);
+
+	memcpy(output, hash, 32);
 }

-int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done)
+int scanhash_myriad(int thr_id, struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done)
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

-	uint32_t endiandata[20] __attribute__ ((aligned (64)));
+	uint32_t _ALIGN(64) endiandata[20];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;

@@ -84,9 +71,9 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,

 	do {
 		const uint32_t Htarg = ptarget[7];
-		uint32_t hash[8] __attribute__ ((aligned (64)));
+		uint32_t hash[8];
 		be32enc(&endiandata[19], nonce);
-		myriadhash(hash, endiandata);
+		myriad_hash(hash, endiandata);

 		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			pdata[19] = nonce;
@@ -101,14 +88,15 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
-
+/*
 bool register_myriad_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
+    gate->optimizations = SSE2_OPT | AES_OPT;
    init_myrgr_ctx();
    gate->scanhash = (void*)&scanhash_myriad;
    gate->hash     = (void*)&myriadhash;
+//    gate->hash_alt = (void*)&myriadhash;
    gate->get_max64 = (void*)&get_max64_0x3ffff;
    return true;
 };
-
+*/
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -0,0 +1,108 @@
+#include "myrgr-gate.h"
+
+#if defined(MYRGR_4WAY)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aes_ni/hash-groestl.h"
+#include "algo/sha/sha2-hash-4way.h"
+
+typedef struct {   // working state for one myriad_4way_hash() call
+    hashState_groestl       groestl;   // Groestl state
+    sha256_4way_context     sha;       // 4-way SHA-256 state
+} myrgr_4way_ctx_holder;
+// Template instance: set up by init_myrgr_4way_ctx(), copied by myriad_4way_hash().
+myrgr_4way_ctx_holder myrgr_4way_ctx;
+
+void init_myrgr_4way_ctx( void )   // builds the template that myriad_4way_hash() copies
+{
+     init_groestl( &myrgr_4way_ctx.groestl, 64 );
+     sha256_4way_init( &myrgr_4way_ctx.sha );
+}
+
+// 4-way myriad hash: Groestl on each lane in turn, then one 4-way SHA-256 pass.
+// input: four 80-byte headers interleaved 4x32; output: four 32-byte hashes, back to back.
+void myriad_4way_hash( void *output, const void *input )
+{
+     uint32_t hash0[20] __attribute__ ((aligned (64)));
+     uint32_t hash1[20] __attribute__ ((aligned (64)));
+     uint32_t hash2[20] __attribute__ ((aligned (64)));
+     uint32_t hash3[20] __attribute__ ((aligned (64)));
+     uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+     myrgr_4way_ctx_holder ctx;
+     uint8_t *out = output;   // byte view: avoids non-standard arithmetic on void*
+     memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
+
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
+     // Groestl runs one lane at a time, in place; reload the template state between lanes.
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
+
+     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );  // back to 4x32 layout
+     sha256_4way( &ctx.sha, vhash, 64 );
+     sha256_4way_close( &ctx.sha, vhash );
+     mm_deinterleave_4x32( out, out+32, out+64, out+96, vhash, 256 );
+}
+
+// Scan nonces four at a time from work->data[19]; winners go to work->nonces.
+// Returns the number found; *hashes_done receives the count of nonces hashed.
+int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76; // 19*4
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+
+   // Read after the benchmark override so the hash[7] pre-check is not stale.
+   const uint32_t Htarg = ptarget[7];
+
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );  // same header in all lanes
+
+   do {
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );
+
+      myriad_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 4; i++ )
+      {
+         uint32_t *lane_hash = hash + (i<<3);   // 8 words per lane
+         if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+         {
+            pdata[19] = n+i;
+            nonces[ num_found++ ] = n+i;
+            work_set_target_ratio( work, lane_hash );
+         }
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   // n is already one past the last nonce hashed, so no +1 is needed here.
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+
+#endif
--- a/algo/groestl/myrgr-gate.c
+++ b/algo/groestl/myrgr-gate.c
@@ -0,0 +1,18 @@
+#include "myrgr-gate.h"
+
+bool register_myriad_algo( algo_gate_t* gate )   // wires the myriad entry points into gate
+{
+#if defined (MYRGR_4WAY)   // set by myrgr-gate.h when both AVX2 and AES are enabled
+  init_myrgr_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_myriad_4way;
+  gate->hash      = (void*)&myriad_4way_hash;
+#else                      // scalar fallback
+  init_myrgr_ctx();
+  gate->scanhash  = (void*)&scanhash_myriad;
+  gate->hash      = (void*)&myriad_hash;
+#endif
+  gate->optimizations = AES_OPT | AVX2_OPT;
+  gate->get_max64 = (void*)&get_max64_0x3ffff;
+  return true;
+}
+
--- a/algo/groestl/myrgr-gate.h
+++ b/algo/groestl/myrgr-gate.h
@@ -0,0 +1,30 @@
+#ifndef MYRGR_GATE_H__
+#define MYRGR_GATE_H__
+// Declarations shared by the myriad algorithm sources (gate, scalar and 4-way).
+#include "algo-gate-api.h"
+#include <stdint.h>
+// The 4-way implementation is built only when both AVX2 and AES are enabled.
+#if defined(__AVX2__) && defined(__AES__)
+  #define MYRGR_4WAY
+#endif
+
+#if defined(MYRGR_4WAY)
+// 4-way entry points (myrgr-4way.c).
+void myriad_4way_hash( void *state, const void *input );
+
+int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_myrgr_4way_ctx( void );
+
+#endif // MYRGR_4WAY
+// Scalar entry points (myr-groestl.c).
+void myriad_hash( void *state, const void *input );
+
+int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_myrgr_ctx( void );
+
+#endif // MYRGR_GATE_H__
+
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -48,20 +48,20 @@ extern "C"{

 #define SPH_SIZE_hamsi512   512

+// Partial is only scalar but needs pointer ref for hamsi-helper
+// deprecate partial_len
 typedef struct {
-   __m128i h[16];
-   __m128i partial[2];
+   __m256i h[8];
+   __m256i buf[1];
   size_t partial_len;
   sph_u32 count_high, count_low;
 } hamsi_4way_big_context;

 typedef hamsi_4way_big_context hamsi512_4way_context;

-void hamsi512_4way_init(void *cc);
-
-void hamsi512_4way(void *cc, const void *data, size_t len);
-
-void hamsi512_4way_close(void *cc, void *dst);
+void hamsi512_4way_init( hamsi512_4way_context *sc );
+void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
+void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );

 #ifdef __cplusplus
 }
--- a/algo/hamsi/hamsi-helper-4way.c
+++ b/algo/hamsi/hamsi-helper-4way.c
@@ -1,482 +0,0 @@
-/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */
-/*
- * Helper code for Hamsi (input block expansion). This code is
- * automatically generated and includes precomputed tables for
- * expansion code which handles 2 to 8 bits at a time.
- *
- * This file is included from hamsi.c, and is not meant to be compiled
- * independently.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-
-/* Note: this table lists bits within each byte from least
-   siginificant to most significant. */
-static const sph_u32 T512[64][16] = {
-	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
-	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
-	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
-	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
-	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
-	  SPH_C32(0x9e69af68) },
-	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
-	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
-	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
-	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
-	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
-	  SPH_C32(0x0c26f262) },
-	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
-	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
-	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
-	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
-	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
-	  SPH_C32(0xdc24e61f) },
-	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
-	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
-	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
-	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
-	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
-	  SPH_C32(0x3daac2da) },
-	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
-	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
-	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
-	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
-	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
-	  SPH_C32(0x78cace29) },
-	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
-	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
-	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
-	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
-	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
-	  SPH_C32(0x2dd1f9ab) },
-	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
-	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
-	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
-	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
-	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
-	  SPH_C32(0xbf2c0be2) },
-	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
-	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
-	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
-	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
-	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
-	  SPH_C32(0x32219526) },
-	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
-	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
-	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
-	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
-	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
-	  SPH_C32(0xac8e6c88) },
-	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
-	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
-	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
-	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
-	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
-	  SPH_C32(0x7b1bd6b9) },
-	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
-	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
-	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
-	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
-	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
-	  SPH_C32(0xf746c320) },
-	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
-	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
-	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
-	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
-	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
-	  SPH_C32(0x69505b3a) },
-	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
-	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
-	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
-	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
-	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
-	  SPH_C32(0x8a341574) },
-	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
-	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
-	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
-	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
-	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
-	  SPH_C32(0x450360bf) },
-	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
-	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
-	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
-	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
-	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
-	  SPH_C32(0xf3d45758) },
-	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
-	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
-	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
-	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
-	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
-	  SPH_C32(0x925c44e9) },
-	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
-	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
-	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
-	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
-	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
-	  SPH_C32(0xa123ff9f) },
-	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
-	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
-	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
-	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
-	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
-	  SPH_C32(0x1568ff0f) },
-	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
-	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
-	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
-	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
-	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
-	  SPH_C32(0xc5c1eb3e) },
-	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
-	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
-	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
-	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
-	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
-	  SPH_C32(0x1af21fe1) },
-	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
-	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
-	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
-	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
-	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
-	  SPH_C32(0x857f3c2b) },
-	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
-	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
-	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
-	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
-	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
-	  SPH_C32(0x2ba05a55) },
-	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
-	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
-	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
-	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
-	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
-	  SPH_C32(0xfeabf254) },
-	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
-	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
-	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
-	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
-	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
-	  SPH_C32(0xfe1cdc7f) },
-	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
-	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
-	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
-	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
-	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
-	  SPH_C32(0xb0a51834) },
-	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
-	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
-	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
-	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
-	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
-	  SPH_C32(0xa6b8c28d) },
-	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
-	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
-	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
-	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
-	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
-	  SPH_C32(0x3a4e99d7) },
-	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
-	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
-	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
-	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
-	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
-	  SPH_C32(0xe1844257) },
-	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
-	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
-	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
-	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
-	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
-	  SPH_C32(0x2c3b504e) },
-	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
-	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
-	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
-	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
-	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
-	  SPH_C32(0x524a0d59) },
-	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
-	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
-	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
-	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
-	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
-	  SPH_C32(0x378dd173) },
-	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
-	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
-	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
-	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
-	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
-	  SPH_C32(0x8b6c72bd) },
-	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
-	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
-	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
-	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
-	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
-	  SPH_C32(0x8e67b7fa) },
-	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
-	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
-	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
-	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
-	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
-	  SPH_C32(0x443d3004) },
-	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
-	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
-	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
-	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
-	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
-	  SPH_C32(0xf4f6ea7b) },
-	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
-	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
-	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
-	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
-	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
-	  SPH_C32(0x979961d0) },
-	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
-	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
-	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
-	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
-	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
-	  SPH_C32(0x98aa496e) },
-	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
-	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
-	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
-	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
-	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
-	  SPH_C32(0x094e3198) },
-	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
-	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
-	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
-	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
-	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
-	  SPH_C32(0xe86cba2e) },
-	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
-	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
-	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
-	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
-	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
-	  SPH_C32(0x4b7eec55) },
-	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
-	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
-	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
-	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
-	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
-	  SPH_C32(0x1e7536a6) },
-	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
-	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
-	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
-	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
-	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
-	  SPH_C32(0x24314f17) },
-	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
-	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
-	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
-	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
-	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
-	  SPH_C32(0x9075b1ce) },
-	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
-	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
-	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
-	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
-	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
-	  SPH_C32(0x9b6ef888) },
-	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
-	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
-	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
-	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
-	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
-	  SPH_C32(0xd8b61463) },
-	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
-	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
-	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
-	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
-	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
-	  SPH_C32(0x3ea660f7) },
-	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
-	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
-	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
-	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
-	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
-	  SPH_C32(0x7f975691) },
-	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
-	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
-	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
-	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
-	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
-	  SPH_C32(0x2c94459e) },
-	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
-	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
-	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
-	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
-	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
-	  SPH_C32(0x56a7b19f) },
-	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
-	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
-	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
-	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
-	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
-	  SPH_C32(0x81fdf908) },
-	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
-	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
-	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
-	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
-	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
-	  SPH_C32(0x5bd61539) },
-	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
-	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
-	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
-	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
-	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
-	  SPH_C32(0x15b961e7) },
-	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
-	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
-	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
-	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
-	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
-	  SPH_C32(0x2a2c18f0) },
-	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
-	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
-	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
-	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
-	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
-	  SPH_C32(0x551e3d6e) },
-	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
-	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
-	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
-	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
-	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
-	  SPH_C32(0x33c5244f) },
-	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
-	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
-	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
-	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
-	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
-	  SPH_C32(0x8a58e6a4) },
-	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
-	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
-	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
-	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
-	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
-	  SPH_C32(0xda878000) },
-	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
-	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
-	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
-	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
-	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
-	  SPH_C32(0x3c5dfffe) },
-	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
-	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
-	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
-	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
-	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
-	  SPH_C32(0x7b1675d7) },
-	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
-	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
-	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
-	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
-	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
-	  SPH_C32(0x2879ebac) },
-	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
-	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
-	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
-	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
-	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
-	  SPH_C32(0xbe0a679e) },
-	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
-	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
-	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
-	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
-	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
-	  SPH_C32(0x30aebcf7) },
-	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
-	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
-	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
-	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
-	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
-	  SPH_C32(0xc7ff60f0) },
-	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
-	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
-	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
-	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
-	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
-	  SPH_C32(0xe7e00a94) }
-};
-
-#define U_BIG( n ) \
-do { \
-  __m128i db = buf[n]; \
-  for ( int u = 0; u < 32; u++ ) \
-  { \
-     __m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \
-     m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \
-     db = _mm_srli_epi32( db, 1 ); \
-  } \
-} while (0);
-
-#define INPUT_BIG \
-do { \
-  const sph_u32 *tp = &T512[0][0]; \
-  m0 = mm_zero; \
-  m1 = mm_zero; \
-  m2 = mm_zero; \
-  m3 = mm_zero; \
-  m4 = mm_zero; \
-  m5 = mm_zero; \
-  m6 = mm_zero; \
-  m7 = mm_zero; \
-  m8 = mm_zero; \
-  m9 = mm_zero; \
-  mA = mm_zero; \
-  mB = mm_zero; \
-  mC = mm_zero; \
-  mD = mm_zero; \
-  mE = mm_zero; \
-  mF = mm_zero; \
-  U_BIG( 0 ); \
-  U_BIG( 1 ); \
-} while (0)
-
-#ifdef __cplusplus
-}
-#endif
--- a/algo/haval/haval-4way-helper.c
+++ b/algo/haval/haval-4way-helper.c
@@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,

   current = (unsigned)sc->count_low & 127UL;

-   sc->buf[ current>>2 ] = mm_one_32;
+   sc->buf[ current>>2 ] = m128_one_32;
   current += 4;   
   RSTATE;
   if ( current > 116UL )
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -15,7 +15,7 @@
 #include "algo/shabal/sph_shabal.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/skein/sse2/skein.c"

 #ifndef NO_AES_NI
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -42,15 +42,82 @@ void hodl_le_build_stratum_request( char* req, struct work* work,
   free( xnonce2str );
 }

-void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
+char* hodl_malloc_txs_request( struct work *work )   // builds and returns a malloc'd "submitblock" request string
+{
+  char* req;
+  json_t *val;
+  char data_str[2 * sizeof(work->data) + 1];
+  int i;
+
+  for ( i = 0; i < ARRAY_SIZE(work->data); i++ )   // note: rewrites work->data in place
+    be32enc( work->data + i, work->data[i] );
+
+  bin2hex( data_str, (unsigned char *)work->data, 88 );   // only the first 88 bytes are hex-encoded
+  if ( work->workid )   // with a workid: append a JSON object carrying it as a second param
+  {
+    char *params;
+    val = json_object();
+    json_object_set_new( val, "workid", json_string( work->workid ) );
+    params = json_dumps( val, 0 );
+    json_decref( val );
+    req = malloc( 128 + 2*88 + strlen( work->txs ) + strlen( params ) );   // NOTE(review): malloc result is not checked
+    sprintf( req,
+     "{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":1}\r\n",
+      data_str, work->txs, params);
+    free( params );
+  }
+  else   // no workid: the params array holds only the hex header followed by work->txs
+  {
+    req = malloc( 128 + 2*88 + strlen(work->txs));   // NOTE(review): malloc result is not checked
+    sprintf( req,
+       "{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":1}\r\n",
+        data_str, work->txs);
+  }
+  return req;   // presumably freed by the caller - confirm at the call site
+}
+
+void hodl_build_block_header( struct work* g_work, uint32_t version,
+                              uint32_t *prevhash, uint32_t *merkle_tree,
+                              uint32_t ntime, uint32_t nbits )
 {
-   uchar merkle_root[64] = { 0 };
-   size_t t;
   int i;

-   algo_gate.gen_merkle_root( merkle_root, sctx );
+   memset( g_work->data, 0, sizeof(g_work->data) );   // start from an all-zero header buffer
+   g_work->data[0] = version;
+
+   if ( have_stratum )   // stratum: prevhash words land in data[1..8] in the same order
+      for ( i = 0; i < 8; i++ )
+         g_work->data[ 1+i ] = le32dec( prevhash + i );
+   else   // otherwise: prevhash words land in data[8] down to data[1], i.e. reversed
+      for (i = 0; i < 8; i++)
+         g_work->data[ 8-i ] = le32dec( prevhash + i );
+
+   for ( i = 0; i < 8; i++ )   // merkle words are decoded big-endian into data[9..16]
+      g_work->data[ 9+i ] = be32dec( merkle_tree + i );
+
+   g_work->data[ algo_gate.ntime_index ] = ntime;   // slot indexes are supplied by the algo gate
+   g_work->data[ algo_gate.nbits_index ] = nbits;
+   g_work->data[22] = 0x80000000;   // NOTE(review): presumably hash padding words - confirm
+   g_work->data[31] = 0x00000280;
+}
+
+// hodl build_extra_header is redundant, hodl can use std_build_extra_header
+// and call hodl_build_block_header.
+#if 0
+void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
+{
+   uchar merkle_tree[64] = { 0 };
+   size_t t;
+//   int i;
+
+   algo_gate.gen_merkle_root( merkle_tree, sctx );
   // Increment extranonce2
   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
+
+   algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
+          (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
+          le32dec( sctx->job.ntime ), le32dec( sctx->job.nbits ) );
+/*
   // Assemble block header
   memset( g_work->data, 0, sizeof(g_work->data) );
   g_work->data[0] = le32dec( sctx->job.version );
@@ -63,7 +130,9 @@ void hodl_build_extraheader( struct work* g_work, struct stratum_ctx *sctx )
   g_work->data[ algo_gate.nbits_index ] = le32dec( sctx->job.nbits );
   g_work->data[22] = 0x80000000;
   g_work->data[31] = 0x00000280;
+*/
 }
+#endif

 // called only by thread 0, saves a backup of g_work
 void hodl_get_new_work( struct work* work, struct work* g_work)
@@ -73,6 +142,22 @@ void hodl_get_new_work( struct work* work, struct work* g_work)
     hodl_work.data[ algo_gate.nonce_index ] = ( clock() + rand() ) % 9999;
 }

+json_t *hodl_longpoll_rpc_call( CURL *curl, int *err, char* lp_url )   // one long-poll JSON-RPC call on lp_url
+{
+   json_t *val;
+   char *req = NULL;
+
+   if ( have_gbt )   // GBT mode: format the request from gbt_lp_req and lp_id
+   {
+      req = malloc( strlen( gbt_lp_req ) + strlen( lp_id ) + 1 );   // NOTE(review): malloc result is not checked
+      sprintf( req, gbt_lp_req, lp_id );
+   }
+   val = json_rpc_call( curl, lp_url, rpc_userpass,
+                        req ? req : getwork_req, err, JSON_RPC_LONGPOLL );   // getwork_req when no GBT request was built
+   free( req );   // req is still NULL when have_gbt is false; free(NULL) is a no-op
+   return val;   // result of json_rpc_call, passed through unchanged
+}
+
 // called by every thread, copies the backup to each thread's work.
 void hodl_resync_threads( struct work* work )
 {
@@ -95,10 +180,11 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done )
 {
 #ifndef NO_AES_NI
-  GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
+  GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
  pthread_barrier_wait( &hodl_barrier );
  return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
 #endif
+  return false;
 }

 bool register_hodl_algo( algo_gate_t* gate )
@@ -107,17 +193,26 @@ bool register_hodl_algo( algo_gate_t* gate )
  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
  return false;
 #endif
+//  if ( TOTAL_CHUNKS % opt_n_threads )
+//  {
+//     applog(LOG_ERR,"Thread count must be power of 2.");
+//     return false;
+//  }
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
-  gate->optimizations         = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations         = AES_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
  gate->get_new_work          = (void*)&hodl_get_new_work;
+  gate->longpoll_rpc_call     = (void*)&hodl_longpoll_rpc_call;
  gate->set_target            = (void*)&hodl_set_target;
  gate->build_stratum_request = (void*)&hodl_le_build_stratum_request;
-  gate->build_extraheader     = (void*)&hodl_build_extraheader;
+  gate->malloc_txs_request    = (void*)&hodl_malloc_txs_request;
+  gate->build_block_header    = (void*)&hodl_build_block_header;
+//  gate->build_extraheader     = (void*)&hodl_build_extraheader;
  gate->resync_threads        = (void*)&hodl_resync_threads;
  gate->do_this_thread        = (void*)&hodl_do_this_thread;
  gate->work_cmp_size         = 76;
  hodl_scratchbuf = (unsigned char*)malloc( 1 << 30 );
+  allow_getwork = false;
  return ( hodl_scratchbuf != NULL );
 }

--- a/algo/hodl/hodl-wolf.c
+++ b/algo/hodl/hodl-wolf.c
@@ -10,23 +10,26 @@

 #ifndef NO_AES_NI               

-void GenerateGarbageCore(CacheEntry *Garbage, int ThreadID, int ThreadCount, void *MidHash)
+void GenerateGarbageCore( CacheEntry *Garbage, int ThreadID, int ThreadCount,
+     void *MidHash )
 {
-#ifdef __AVX__
-    uint64_t* TempBufs[SHA512_PARALLEL_N] ;
-    uint64_t* desination[SHA512_PARALLEL_N];
+    const int Chunk = TOTAL_CHUNKS / ThreadCount;
+    const uint32_t StartChunk = ThreadID * Chunk;
+    const uint32_t EndChunk   = StartChunk + Chunk;

-    for ( int i=0; i<SHA512_PARALLEL_N; ++i )
+#ifdef __AVX__
+    uint64_t* TempBufs[ SHA512_PARALLEL_N ] ;
+    uint64_t* desination[ SHA512_PARALLEL_N ];
+
+    for ( int i=0; i < SHA512_PARALLEL_N; ++i )
    {
-        TempBufs[i] = (uint64_t*)malloc(32);
-        memcpy(TempBufs[i], MidHash, 32);
+        TempBufs[i] = (uint64_t*)malloc( 32 );
+        memcpy( TempBufs[i], MidHash, 32 );
    }

-    uint32_t StartChunk = ThreadID * (TOTAL_CHUNKS / ThreadCount);
-    for ( uint32_t i = StartChunk;
-          i < StartChunk + (TOTAL_CHUNKS / ThreadCount); i+= SHA512_PARALLEL_N )
+    for ( uint32_t i = StartChunk; i < EndChunk; i += SHA512_PARALLEL_N )
    {
-        for ( int j=0; j<SHA512_PARALLEL_N; ++j )
+        for ( int j = 0; j < SHA512_PARALLEL_N; ++j )
        {
            ( (uint32_t*)TempBufs[j] )[0] = i + j;
            desination[j] = (uint64_t*)( (uint8_t *)Garbage + ( (i+j)
@@ -35,15 +38,13 @@ void GenerateGarbageCore(CacheEntry *Garbage, int ThreadID, int ThreadCount, voi
        sha512Compute32b_parallel( TempBufs, desination );
    }

-    for ( int i=0; i<SHA512_PARALLEL_N; ++i )
+    for ( int i = 0; i < SHA512_PARALLEL_N; ++i )
        free( TempBufs[i] );
 #else
    uint32_t TempBuf[8];
    memcpy( TempBuf, MidHash, 32 );

-    uint32_t StartChunk = ThreadID * (TOTAL_CHUNKS / ThreadCount);
-    for ( uint32_t i = StartChunk;
-          i < StartChunk + (TOTAL_CHUNKS / ThreadCount); ++i )
+    for ( uint32_t i = StartChunk; i < EndChunk; ++i )
    {
        TempBuf[0] = i;
        SHA512( ( uint8_t *)TempBuf, 32,
@@ -150,6 +151,9 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
        int searchNumber = COMPARE_SIZE / opt_n_threads;
        int startLoc = threadNumber * searchNumber;

+        if ( opt_debug )   // cast below makes the argument match the %lx conversion
+           applog( LOG_DEBUG,"Hash target= %08lx", (unsigned long)ptarget[7] );
+
        for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++)
        {
           // copy data to first l2 cache
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -44,7 +44,7 @@ void jha_hash_4way( void *out, const void *input )
    for ( int round = 0; round < 3; round++ )
    {
       vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
-               vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero );
+               vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
       init_groestl( &ctx_groestl, 64 );
@@ -95,12 +95,8 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t Htarg = ptarget[7];
   uint32_t n = pdata[19];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   uint64_t htmax[] = {
 		0,
@@ -131,46 +127,21 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
      {
         uint32_t mask = masks[m];
         do {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+              be32enc( noncep,   n   );
+              be32enc( noncep+2, n+1 );
+              be32enc( noncep+4, n+2 );
+              be32enc( noncep+6, n+3 );

              jha_hash_4way( hash, vdata );
              pdata[19] = n;

-              if ( ( !(hash[7] & mask) )
-                   && fulltest( hash, ptarget ) )
+              for ( int i = 0; i < 4; i++ )   // test each of the 4 hashes, 8 words apart
+              if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )   // masked bits must all be clear
+                  && fulltest( hash+(i<<3), ptarget ) )
              {
-                 found[0] = true;
-                 num_found++;
-                 nonces[0] = n;
-                 work_set_target_ratio( work, hash );
-              }
-              if ( ( !((hash+8)[7] & mask) )
-                   && fulltest( hash+8, ptarget ) )
-              {
-                 found[1] = true;
-                 num_found++;
-                 nonces[1] = n+1;
-                 work_set_target_ratio( work, hash+8 );
-              }
-              if ( ( !((hash+16)[7] & mask) )
-                 && fulltest( hash+16, ptarget ) )
-              {
-                 found[2] = true;
-                 num_found++;
-                 nonces[2] = n+2;
-                 work_set_target_ratio( work, hash+16 );
-              }
-              if ( ( !((hash+24)[7] & mask) )
-                   && fulltest( hash+24, ptarget ) )
-              {
-                 found[3] = true;
-                 num_found++;
-                 nonces[3] = n+3;
-                 work_set_target_ratio( work, hash+24 );
+                 pdata[19] = n;
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, hash+(i<<3) );
              }
              n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/jh/sse2/jh_sse2_opt64.h
+++ b/algo/jh/sse2/jh_sse2_opt64.h
@@ -339,13 +339,13 @@ do { \
        jhSbuffer[53] = 0x00, \
        jhSbuffer[54] = 0x00, \
        jhSbuffer[55] = 0x00; \
-        jhSbuffer[56] = ((64*8) >> 56) & 0xff, \
-        jhSbuffer[57] = ((64*8) >> 48) & 0xff, \
-        jhSbuffer[58] = ((64*8) >> 40) & 0xff, \
-        jhSbuffer[59] = ((64*8) >> 32) & 0xff, \
-        jhSbuffer[60] = ((64*8) >> 24) & 0xff, \
-        jhSbuffer[61] = ((64*8) >> 16) & 0xff, \
-        jhSbuffer[62] = ((64*8) >> 8) & 0xff, \
+        jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \
+        jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \
+        jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \
+        jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \
+        jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \
+        jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \
+        jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \
        jhSbuffer[63] = (64*8) & 0xff; \
        b = true; \
    } \
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -32,12 +32,8 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
 //   const uint32_t Htarg = ptarget[7];
   uint32_t endiandata[20];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-   uint32_t *noncep1 = vdata + 75;
-   uint32_t *noncep2 = vdata + 77;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1

   for ( int i=0; i < 19; i++ ) 
      be32enc( &endiandata[i], pdata[i] );
@@ -46,42 +42,20 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );
 	
      keccakhash_4way( hash, vdata );

-      if ( ( ( hash[7] & 0xFFFFFF00 ) == 0 )
-         && fulltest( hash, ptarget) )
+      for ( int i = 0; i < 4; i++ )
+      if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
+           && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          pdata[19] = n;
-      }
-      if ( ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 )
-         && fulltest( hash+8, ptarget) ) 
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-      }
-      if ( ( ( (hash+16) [7] & 0xFFFFFF00 ) == 0 )
-         && fulltest( hash+16, ptarget) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-      }
-      if ( ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 )
-         && fulltest( hash+24, ptarget) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
+         pdata[19] = n+i;
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;

--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -59,7 +59,7 @@ static const sph_u64 RC[] = {
 #define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
 #define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
 #define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
-#define NOT64(d, s)      (d = _mm256_xor_si256(s,mm256_neg1))
+#define NOT64(d, s)      (d = _mm256_xor_si256(s,m256_neg1))
 #define ROL64(d, v, n)   (d = mm256_rotl_64(v, n))
 #define XOR64_IOTA       XOR64

@@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
          kc->w[i] = _mm256_setzero_si256();

   // Initialization for the "lane complement".
-   kc->w[ 1] = mm256_neg1;
-   kc->w[ 2] = mm256_neg1;
-   kc->w[ 8] = mm256_neg1;
-   kc->w[12] = mm256_neg1;
-   kc->w[17] = mm256_neg1;
-   kc->w[20] = mm256_neg1;
+   kc->w[ 1] = m256_neg1;
+   kc->w[ 2] = m256_neg1;
+   kc->w[ 8] = m256_neg1;
+   kc->w[12] = m256_neg1;
+   kc->w[17] = m256_neg1;
+   kc->w[20] = m256_neg1;
   kc->ptr = 0;
   kc->lim = 200 - (out_size >> 2);
 }
--- a/algo/keccak/sse2/keccak.c
+++ b/algo/keccak/sse2/keccak.c
@@ -775,10 +775,8 @@ static const sph_u64 RC[] = {
 			KF_ELT( 5,  6, RC[j + 5]); \
 			KF_ELT( 6,  7, RC[j + 6]); \
 			KF_ELT( 7,  8, RC[j + 7]); \
-*/
-
-	//kekDECL_STATE \
-        
+	kekDECL_STATE \
+*/        
 #define DECL_KEC  


--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -0,0 +1,583 @@
+/*
+ * luffa_for_sse2.c
+ * Version 2.0 (Sep 15th 2009)
+ *
+ * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
+ *
+ * Hitachi, Ltd. is the owner of this software and hereby grant
+ * the U.S. Government and any interested party the right to use
+ * this software for the purposes of the SHA-3 evaluation process,
+ * notwithstanding that this software is copyrighted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <string.h>
+#include <immintrin.h>
+#include "luffa-hash-2way.h"
+
+#if defined(__AVX2__)
+
+#include "avxdefs.h"
+
+#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
+                               0UL, 0UL, 0UL, 0xffffffffUL )
+
+#define ADD_CONSTANT(a,b,c0,c1)\
+    a = _mm256_xor_si256(a,c0);\
+    b = _mm256_xor_si256(b,c1);\
+
+#define MULT2(a0,a1) \
+do { \
+  register __m256i b = _mm256_xor_si256( a0, \
+                   _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
+  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
+  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) );  \
+} while(0)
+
+// STEP_PART: SUBCRUMB x[0..3] and x[5],x[6],x[7],x[4]; MIXWORD each x[i],x[i+4];
+// XOR c[0],c[1] into x[0],x[4]. t[0],t[1] are scratch. TODO: use array indexes.
+#define STEP_PART(x,c,t)\
+    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
+    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
+    MIXWORD(*x,*(x+4),*t,*(t+1));\
+    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
+    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
+    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
+    ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
+
+#define SUBCRUMB(a0,a1,a2,a3,t)\
+    t  = _mm256_load_si256(&a0);\
+    a0 = _mm256_or_si256(a0,a1);\
+    a2 = _mm256_xor_si256(a2,a3);\
+    a1 = _mm256_andnot_si256(a1, m256_neg1 );\
+    a0 = _mm256_xor_si256(a0,a3);\
+    a3 = _mm256_and_si256(a3,t);\
+    a1 = _mm256_xor_si256(a1,a3);\
+    a3 = _mm256_xor_si256(a3,a2);\
+    a2 = _mm256_and_si256(a2,a0);\
+    a0 = _mm256_andnot_si256(a0, m256_neg1 );\
+    a2 = _mm256_xor_si256(a2,a1);\
+    a1 = _mm256_or_si256(a1,a3);\
+    t  = _mm256_xor_si256(t,a1);\
+    a3 = _mm256_xor_si256(a3,a2);\
+    a2 = _mm256_and_si256(a2,a1);\
+    a1 = _mm256_xor_si256(a1,a0);\
+    a0 = _mm256_load_si256(&t);\
+
+#define MIXWORD(a,b,t1,t2)\
+    b  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(a,2);\
+    t2 = _mm256_srli_epi32(a,30);\
+     a = _mm256_or_si256(t1,t2);\
+    a  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(b,14);\
+    t2 = _mm256_srli_epi32(b,18);\
+    b  = _mm256_or_si256(t1,t2);\
+    b  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(a,10);\
+    t2 = _mm256_srli_epi32(a,22);\
+    a  = _mm256_or_si256(t1,t2);\
+    a  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(b,1);\
+    t2 = _mm256_srli_epi32(b,31);\
+    b  = _mm256_or_si256(t1,t2);
+
+#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
+    a1 = _mm256_shuffle_epi32(a1,147);\
+    t0 = _mm256_load_si256(&a1);\
+    a1 = _mm256_unpacklo_epi32(a1,a0);\
+    t0 = _mm256_unpackhi_epi32(t0,a0);\
+    t1 = _mm256_shuffle_epi32(t0,78);\
+    a0 = _mm256_shuffle_epi32(a1,78);\
+    SUBCRUMB(t1,t0,a0,a1,tmp0);\
+    t0 = _mm256_unpacklo_epi32(t0,t1);\
+    a1 = _mm256_unpacklo_epi32(a1,a0);\
+    a0 = _mm256_load_si256(&a1);\
+    a0 = _mm256_unpackhi_epi64(a0,t0);\
+    a1 = _mm256_unpacklo_epi64(a1,t0);\
+    a1 = _mm256_shuffle_epi32(a1,57);\
+    MIXWORD(a0,a1,tmp0,tmp1);\
+    ADD_CONSTANT(a0,a1,c0,c1);
+
+#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
+    s2 = _mm256_load_si256(&r1);\
+    q2 = _mm256_load_si256(&p1);\
+    r2 = _mm256_shuffle_epi32(r2,216);\
+    p2 = _mm256_shuffle_epi32(p2,216);\
+    r1 = _mm256_unpacklo_epi32(r1,r0);\
+    p1 = _mm256_unpacklo_epi32(p1,p0);\
+    s2 = _mm256_unpackhi_epi32(s2,r0);\
+    q2 = _mm256_unpackhi_epi32(q2,p0);\
+    s0 = _mm256_load_si256(&r2);\
+    q0 = _mm256_load_si256(&p2);\
+    r2 = _mm256_unpacklo_epi64(r2,r1);\
+    p2 = _mm256_unpacklo_epi64(p2,p1);\
+    s1 = _mm256_load_si256(&s0);\
+    q1 = _mm256_load_si256(&q0);\
+    s0 = _mm256_unpackhi_epi64(s0,r1);\
+    q0 = _mm256_unpackhi_epi64(q0,p1);\
+    r2 = _mm256_shuffle_epi32(r2,225);\
+    p2 = _mm256_shuffle_epi32(p2,225);\
+    r0 = _mm256_load_si256(&s1);\
+    p0 = _mm256_load_si256(&q1);\
+    s0 = _mm256_shuffle_epi32(s0,225);\
+    q0 = _mm256_shuffle_epi32(q0,225);\
+    s1 = _mm256_unpacklo_epi64(s1,s2);\
+    q1 = _mm256_unpacklo_epi64(q1,q2);\
+    r0 = _mm256_unpackhi_epi64(r0,s2);\
+    p0 = _mm256_unpackhi_epi64(p0,q2);\
+    s2 = _mm256_load_si256(&r0);\
+    q2 = _mm256_load_si256(&p0);\
+    s3 = _mm256_load_si256(&r2);\
+    q3 = _mm256_load_si256(&p2);\
+
+#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
+    s0 = _mm256_load_si256(&r0);\
+    q0 = _mm256_load_si256(&p0);\
+    s1 = _mm256_load_si256(&r2);\
+    q1 = _mm256_load_si256(&p2);\
+    r0 = _mm256_unpackhi_epi32(r0,r1);\
+    p0 = _mm256_unpackhi_epi32(p0,p1);\
+    r2 = _mm256_unpackhi_epi32(r2,r3);\
+    p2 = _mm256_unpackhi_epi32(p2,p3);\
+    s0 = _mm256_unpacklo_epi32(s0,r1);\
+    q0 = _mm256_unpacklo_epi32(q0,p1);\
+    s1 = _mm256_unpacklo_epi32(s1,r3);\
+    q1 = _mm256_unpacklo_epi32(q1,p3);\
+    r1 = _mm256_load_si256(&r0);\
+    p1 = _mm256_load_si256(&p0);\
+    r0 = _mm256_unpackhi_epi64(r0,r2);\
+    p0 = _mm256_unpackhi_epi64(p0,p2);\
+    s0 = _mm256_unpackhi_epi64(s0,s1);\
+    q0 = _mm256_unpackhi_epi64(q0,q1);\
+    r1 = _mm256_unpacklo_epi64(r1,r2);\
+    p1 = _mm256_unpacklo_epi64(p1,p2);\
+    s2 = _mm256_load_si256(&r0);\
+    q2 = _mm256_load_si256(&p0);\
+    s1 = _mm256_load_si256(&r1);\
+    q1 = _mm256_load_si256(&p1);\
+
+#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
+    s1 = _mm256_load_si256(&r3);\
+    q1 = _mm256_load_si256(&p3);\
+    s3 = _mm256_load_si256(&r3);\
+    q3 = _mm256_load_si256(&p3);\
+    s1 = _mm256_unpackhi_epi32(s1,r2);\
+    q1 = _mm256_unpackhi_epi32(q1,p2);\
+    s3 = _mm256_unpacklo_epi32(s3,r2);\
+    q3 = _mm256_unpacklo_epi32(q3,p2);\
+    s0 = _mm256_load_si256(&s1);\
+    q0 = _mm256_load_si256(&q1);\
+    s2 = _mm256_load_si256(&s3);\
+    q2 = _mm256_load_si256(&q3);\
+    r3 = _mm256_load_si256(&r1);\
+    p3 = _mm256_load_si256(&p1);\
+    r1 = _mm256_unpacklo_epi32(r1,r0);\
+    p1 = _mm256_unpacklo_epi32(p1,p0);\
+    r3 = _mm256_unpackhi_epi32(r3,r0);\
+    p3 = _mm256_unpackhi_epi32(p3,p0);\
+    s0 = _mm256_unpackhi_epi64(s0,r3);\
+    q0 = _mm256_unpackhi_epi64(q0,p3);\
+    s1 = _mm256_unpacklo_epi64(s1,r3);\
+    q1 = _mm256_unpacklo_epi64(q1,p3);\
+    s2 = _mm256_unpackhi_epi64(s2,r1);\
+    q2 = _mm256_unpackhi_epi64(q2,p1);\
+    s3 = _mm256_unpacklo_epi64(s3,r1);\
+    q3 = _mm256_unpacklo_epi64(q3,p1);
+
+#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
+    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
+
+/* initial values of chaining variables */
+static const uint32 IV[40] __attribute((aligned(32))) = {
+    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
+    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
+    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
+    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
+    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
+    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
+    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
+    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
+    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
+    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
+};
+
+/* Round Constants */
+static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
+    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
+    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
+    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
+    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
+    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
+    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
+    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
+    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
+    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
+    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
+    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
+    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
+    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
+    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
+    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
+    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
+    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
+    0x00000000,0x00000000,0x00000000,0x5090d577,
+    0x00000000,0x00000000,0x00000000,0xac11d7fa,
+    0x00000000,0x00000000,0x00000000,0x2d1925ab,
+    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
+    0x00000000,0x00000000,0x00000000,0xb46496ac,
+    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
+    0x00000000,0x00000000,0x00000000,0xd1925ab0,
+    0x00000000,0x00000000,0x00000000,0x78602649,
+    0x00000000,0x00000000,0x00000000,0x29131ab6,
+    0x00000000,0x00000000,0x00000000,0x8edae952,
+    0x00000000,0x00000000,0x00000000,0x0fc053c3,
+    0x00000000,0x00000000,0x00000000,0x3b6ba548,
+    0x00000000,0x00000000,0x00000000,0x3f014f0c,
+    0x00000000,0x00000000,0x00000000,0xedae9520,
+    0x00000000,0x00000000,0x00000000,0xfc053c31
+};
+
+__m256i CNS[32];
+
+/***************************************************/
+/* Round function: mixes one message block into the chaining values */
+/* state: hash context;  msg: two 256-bit message vectors           */
+
+void rnd512_2way( luffa_2way_context *state, __m256i *msg )
+{
+    __m256i t0, t1;
+    __m256i *chainv = state->chainv;
+    __m256i msg0, msg1;
+    __m256i tmp[2];
+    __m256i x[8];
+    // XOR-fold the five vector pairs of chainv into a single pair (t0, t1)
+    t0 = chainv[0];
+    t1 = chainv[1];
+
+    t0 = _mm256_xor_si256( t0, chainv[2] );
+    t1 = _mm256_xor_si256( t1, chainv[3] );
+    t0 = _mm256_xor_si256( t0, chainv[4] );
+    t1 = _mm256_xor_si256( t1, chainv[5] );
+    t0 = _mm256_xor_si256( t0, chainv[6] );
+    t1 = _mm256_xor_si256( t1, chainv[7] );
+    t0 = _mm256_xor_si256( t0, chainv[8] );
+    t1 = _mm256_xor_si256( t1, chainv[9] );
+
+    MULT2( t0, t1 );
+    // imm8 27 (0b00011011) reverses the four 32-bit words of each 128-bit half
+    msg0 = _mm256_shuffle_epi32( msg[0], 27 );
+    msg1 = _mm256_shuffle_epi32( msg[1], 27 );
+    // XOR the folded pair back into every chaining pair
+    chainv[0] = _mm256_xor_si256( chainv[0], t0 );
+    chainv[1] = _mm256_xor_si256( chainv[1], t1 );
+    chainv[2] = _mm256_xor_si256( chainv[2], t0 );
+    chainv[3] = _mm256_xor_si256( chainv[3], t1 );
+    chainv[4] = _mm256_xor_si256( chainv[4], t0 );
+    chainv[5] = _mm256_xor_si256( chainv[5], t1 );
+    chainv[6] = _mm256_xor_si256( chainv[6], t0 );
+    chainv[7] = _mm256_xor_si256( chainv[7], t1 );
+    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
+    chainv[9] = _mm256_xor_si256( chainv[9], t1 );
+
+    t0 = chainv[0];
+    t1 = chainv[1];
+    // forward pass: MULT2 each pair, then XOR in the next pair (last pair uses the saved first pair)
+    MULT2( chainv[0], chainv[1]);
+    chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
+    chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
+
+    MULT2( chainv[2], chainv[3]);
+    chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
+    chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
+
+    MULT2( chainv[4], chainv[5]);
+    chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
+    chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
+
+    MULT2( chainv[6], chainv[7]);
+    chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
+    chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
+
+    MULT2( chainv[8], chainv[9]);
+    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
+    chainv[9] = _mm256_xor_si256( chainv[9], t1 );
+    // backward pass: same pattern in the opposite direction, starting from the saved last pair
+    t0 = chainv[8];
+    t1 = chainv[9];
+
+    MULT2( chainv[8], chainv[9]);
+    chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
+    chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
+
+    MULT2( chainv[6], chainv[7]);
+    chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
+    chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
+
+    MULT2( chainv[4], chainv[5]);
+    chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
+    chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
+
+    MULT2( chainv[2], chainv[3] );
+    chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
+    chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
+
+    MULT2( chainv[0], chainv[1] );
+    chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
+    chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
+    // message injection: each following pair gets the message after one more MULT2
+    MULT2( msg0, msg1);
+    chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
+    chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
+    chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
+    chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
+    chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
+
+    MULT2( msg0, msg1);
+    // rotate every 32-bit word of chainv[3], [5], [7], [9] left by 1, 2, 3 and 4 bits
+    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3],  1 ),
+                                 _mm256_srli_epi32( chainv[3], 31 ) );
+    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5],  2 ),
+                                 _mm256_srli_epi32( chainv[5], 30 ) );
+    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7],  3 ),
+                                 _mm256_srli_epi32( chainv[7], 29 ) );
+    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9],  4 ),
+                                 _mm256_srli_epi32( chainv[9], 28 ) );
+
+    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
+                x[0], x[1], x[2], x[3],
+                chainv[1],chainv[3],chainv[5],chainv[7],
+                x[4], x[5], x[6], x[7] );
+    // eight steps on x[], called with &CNS[0], &CNS[2], ... &CNS[14]
+    STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
+    STEP_PART( &x[0], &CNS[10], &tmp[0] );
+    STEP_PART( &x[0], &CNS[12], &tmp[0] );
+    STEP_PART( &x[0], &CNS[14], &tmp[0] );
+
+    MIXTON1024( x[0], x[1], x[2], x[3],
+                chainv[0], chainv[2], chainv[4],chainv[6],
+                x[4], x[5], x[6], x[7],
+                chainv[1],chainv[3],chainv[5],chainv[7]);
+
+    /* Process last 256-bit block: eight steps on chainv[8], chainv[9] with CNS[16..31] */
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], CNS[31],
+                tmp[0], tmp[1] );
+}
+
+
+/***************************************************/
+/* Finalization function  */
+/* state: hash context    */
+/* b: output, receives four 256-bit vectors (32 uint32 words) */
+
+void finalization512_2way( luffa_2way_context *state, uint32 *b )
+{
+    uint32 hash[8*2] __attribute((aligned(64)));  // room for the two vectors stored below
+    __m256i* chainv = state->chainv;
+    __m256i t[2];
+    __m256i zero[2];
+    zero[0] = zero[1] = _mm256_setzero_si256();
+
+    /*---- blank round with m=0 ----*/
+    rnd512_2way( state, zero );
+    // first half of the output: XOR of all five chaining pairs
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+
+    t[0] = _mm256_xor_si256( t[0], chainv[2] );
+    t[1] = _mm256_xor_si256( t[1], chainv[3] );
+    t[0] = _mm256_xor_si256( t[0], chainv[4] );
+    t[1] = _mm256_xor_si256( t[1], chainv[5] );
+    t[0] = _mm256_xor_si256( t[0], chainv[6] );
+    t[1] = _mm256_xor_si256( t[1], chainv[7] );
+    t[0] = _mm256_xor_si256( t[0], chainv[8] );
+    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+
+    t[0] = _mm256_shuffle_epi32( t[0], 27 );
+    t[1] = _mm256_shuffle_epi32( t[1], 27 );
+    // hash[0..7] takes t[0], hash[8..15] takes t[1]
+    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
+    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
+
+    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
+    // a second blank round yields the remaining two output vectors
+    rnd512_2way( state, zero );
+
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+    t[0] = _mm256_xor_si256( t[0], chainv[2] );
+    t[1] = _mm256_xor_si256( t[1], chainv[3] );
+    t[0] = _mm256_xor_si256( t[0], chainv[4] );
+    t[1] = _mm256_xor_si256( t[1], chainv[5] );
+    t[0] = _mm256_xor_si256( t[0], chainv[6] );
+    t[1] = _mm256_xor_si256( t[1], chainv[7] );
+    t[0] = _mm256_xor_si256( t[0], chainv[8] );
+    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+
+    t[0] = _mm256_shuffle_epi32( t[0], 27 );
+    t[1] = _mm256_shuffle_epi32( t[1], 27 );
+
+    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
+    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
+
+    casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
+}
+// Reset the context: load round constants and IV, clear the buffer. Always returns 0.
+int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
+{
+    int i;
+    state->hashbitlen = hashbitlen;
+    // each 128-bit constant is duplicated into both halves of a 256-bit vector
+    for ( i=0; i<32; i++ ) CNS[i] =
+          _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
+                            CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2)    ],
+                            CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
+                            CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2)    ] );
+    // same duplication for the chaining values: both halves start from IV
+    for ( i=0; i<10; i++ ) state->chainv[i] =
+          _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
+                            IV[ (i<<2) +1 ], IV[ (i<<2)    ],
+                            IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
+                            IV[ (i<<2) +1 ], IV[ (i<<2)    ] );
+    state->rembytes = 0;   // luffa_2way_close() reads this even if update was never called
+    ((__m256i*)state->buffer)[0] = m256_zero;
+    ((__m256i*)state->buffer)[1] = m256_zero;
+
+    return 0;
+}
+// len is counted in 32-byte blocks; each block reads two 256-bit vectors (64 bytes) from data.
+// Do not call luffa_2way_update_close after having called luffa_2way_update.
+// Once luffa_2way_update has been called only call it again or luffa_2way_close.
+int luffa_2way_update( luffa_2way_context *state, const void *data,
+                       size_t len )
+{
+    const __m256i *vdata  = (const __m256i*)data;   // keep the caller's const
+    __m256i *buffer = (__m256i*)state->buffer;
+    __m256i msg[2];
+    int i;
+    const int blocks = (int)( len >> 5 );   // shift before narrowing, as in update_close
+    state->rembytes = (int)( len & 0x1F );
+
+    // full blocks
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
+       msg[1] = mm256_bswap_32( vdata[ 1 ] );
+       rnd512_2way( state, msg );
+    }
+
+    // 16 byte partial block exists for 80 byte len
+    // store in buffer for transform in final for midstate to work
+    if ( state->rembytes  )
+    {
+      // remaining data bytes: one whole vector is read, then the pad marker follows
+      buffer[0] = mm256_bswap_32( vdata[0] );
+      buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                   0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+    }
+    return 0;
+}
+// Transform the pad block (buffered partial block or constant pad) and write the digest to hashval.
+int luffa_2way_close( luffa_2way_context *state, void *hashval )
+{
+    __m256i *buffer = (__m256i*)state->buffer;
+    __m256i msg[2];
+
+    // transform pad block
+    if ( state->rembytes )
+      // not empty, data is in buffer
+      rnd512_2way( state, buffer );
+    else
+    {     // empty pad block, constant data
+      msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+      msg[1] = m256_zero;
+      rnd512_2way( state, msg );
+    }
+    finalization512_2way( state, (uint32*)hashval );
+    // NOTE(review): void* arithmetic (GNU extension); finalization writes 128 bytes - confirm the +32
+    if ( state->hashbitlen > 512 )
+        finalization512_2way( state, (uint32*)( hashval+32 ) );
+    return 0;
+}
+// One-shot update + close: absorb data, transform the pad block, write the digest to output.
+int luffa_2way_update_close( luffa_2way_context *state,
+                 void *output, const void *data, size_t inlen )
+{
+// Optimized for integrals of 16 bytes, good for 64 and 80 byte len
+    const __m256i *vdata  = (const __m256i*)data;   // cast no longer discards const
+    __m256i msg[2];
+    int i;
+    const int blocks = (int)( inlen >> 5 );
+    state->rembytes = inlen & 0x1F;
+
+    // full blocks: each 32-byte block reads two 256-bit vectors from data
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
+       msg[1] = mm256_bswap_32( vdata[ 1 ] );
+       rnd512_2way( state, msg );
+    }
+
+    // 16 byte partial block exists for 80 byte len
+    if ( state->rembytes  )
+    {
+       // padding of partial block
+       msg[0] = mm256_bswap_32( vdata[0] );
+       msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+       rnd512_2way( state, msg );
+    }
+    else
+    {
+       // empty pad block
+       msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+       msg[1] = m256_zero;
+       rnd512_2way( state, msg );
+    }
+    // NOTE(review): the >512 branch offsets by 32 but finalization writes 128 bytes - confirm
+    finalization512_2way( state, (uint32*)output );
+    if ( state->hashbitlen > 512 )
+        finalization512_2way( state, (uint32*)( output+32 ) );
+
+    return 0;
+}
+
+#endif
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -0,0 +1,69 @@
+#if !defined(LUFFA_HASH_2WAY_H__)
+#define LUFFA_HASH_2WAY_H__ 1
+/*
+ * luffa_for_sse2.h
+ * Version 2.0 (Sep 15th 2009)
+ *
+ * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
+ *
+ * Hitachi, Ltd. is the owner of this software and hereby grant
+ * the U.S. Government and any interested party the right to use
+ * this software for the purposes of the SHA-3 evaluation process,
+ * notwithstanding that this software is copyrighted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#include "algo/sha/sha3-defs.h"
+#include "avxdefs.h"
+
+/* The length of digests*/
+#define DIGEST_BIT_LEN_224 224
+#define DIGEST_BIT_LEN_256 256
+#define DIGEST_BIT_LEN_384 384
+#define DIGEST_BIT_LEN_512 512
+
+/*********************************/
+/* The parameters of Luffa       */
+#define MSG_BLOCK_BIT_LEN 256  /*The bit length of a message block*/
+#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
+                                                     * of a message block*/
+
+/* The number of blocks in Luffa */
+#define WIDTH_224 3
+#define WIDTH_256 3
+#define WIDTH_384 4
+#define WIDTH_512 5
+
+/* The limit of the length of message */
+#define LIMIT_224 64
+#define LIMIT_256 64
+#define LIMIT_384 128
+#define LIMIT_512 128
+/*********************************/
+
+typedef struct {
+    uint32 buffer[8*2] __attribute((aligned(64)));   /* partial block + pad, set by luffa_2way_update */
+    __m256i chainv[10] __attribute((aligned(32)));   /* Chaining values */
+    int hashbitlen;   /* hashbitlen argument of luffa_2way_init */
+    int rembytes;     /* len & 0x1F of the most recent update call */
+} luffa_2way_context;
+
+int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
+int luffa_2way_update( luffa_2way_context *state, const void *data,
+                       size_t len );
+int luffa_2way_close( luffa_2way_context *state, void *hashval );
+int luffa_2way_update_close( luffa_2way_context *state, void *output,
+                                   const void *data, size_t inlen );
+
+#endif
+#endif
--- a/algo/luffa/sse2/luffa_for_sse2.c
+++ b/algo/luffa/sse2/luffa_for_sse2.c
@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
+                      mm_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    if ( state->rembytes  )
    {
      // remaining data bytes
-      casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
+      casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
      // padding of partial block
      casti_m128i( state->buffer, 1 ) =
            _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
+                      mm_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    {
      // padding of partial block
      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
-                      mm_byteswap_32( cast_m128i( data ) ) );
+                      mm_bswap_32( cast_m128i( data ) ) );
    }
    else
    {
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );

    rnd512( state, zero, zero );

@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
 }

 #else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );

    rnd512( state, zero, zero );

@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
 }
 #endif

--- a/algo/luffa/sse2/luffa_for_sse2.h
+++ b/algo/luffa/sse2/luffa_for_sse2.h
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -0,0 +1,139 @@
+#include "allium-gate.h"
+#include <memory.h>
+#include <mm_malloc.h>
+
+#if defined (ALLIUM_4WAY)	
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/groestl/aes_ni/hash-groestl256.h"
+
+typedef struct {
+   blake256_4way_context     blake;    // midstate, set up in scanhash_allium_4way
+   keccak256_4way_context    keccak;
+   cubehashParam             cube;
+   skein256_4way_context     skein;
+   hashState_groestl256      groestl;
+   // keccak, cube, skein and groestl are initialized by init_allium_4way_ctx()
+} allium_4way_ctx_holder;
+
+static __thread allium_4way_ctx_holder allium_4way_ctx;   // per-thread template, copied by allium_4way_hash
+// Initialize every member of the per-thread template except blake. Always returns true.
+bool init_allium_4way_ctx()
+{
+   keccak256_4way_init( &allium_4way_ctx.keccak );
+   cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
+   skein256_4way_init( &allium_4way_ctx.skein );
+   init_groestl256( &allium_4way_ctx.groestl, 32 );
+   return true;
+}
+// Hash four lanes: blake256, keccak256, Lyra2, cubehash, Lyra2, skein256, groestl256; 4 x 32 bytes to state.
+void allium_4way_hash( void *state, const void *input )
+{
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t vhash32[8*4] __attribute__ ((aligned (64)));
+   uint32_t vhash64[8*4] __attribute__ ((aligned (64)));
+   allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+   // work on a copy of the thread template; blake continues after the first 64<<2 bytes of input
+   memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
+   blake256_4way( &ctx.blake, input + (64<<2), 16 );
+   blake256_4way_close( &ctx.blake, vhash32 );
+   // presumably converts the 4x32 interleaved blake output to the 4x64 layout (per helper name)
+   mm256_reinterleave_4x64( vhash64, vhash32, 256 );
+   keccak256_4way( &ctx.keccak, vhash64, 32 );
+   keccak256_4way_close( &ctx.keccak, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+   // Lyra2 runs one lane at a time, in place: the hash is output, password and salt
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   // the cube context is reset between lanes; nothing follows the last lane
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
+
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+
+   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   skein256_4way( &ctx.skein, vhash64, 32 );
+   skein256_4way_close( &ctx.skein, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+   // the groestl context is restored from the thread template between lanes
+   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
+   // results are written lane after lane, 32 bytes each
+   memcpy( state,    hash0, 32 );
+   memcpy( state+32, hash1, 32 );
+   memcpy( state+64, hash2, 32 );
+   memcpy( state+96, hash3, 32 );
+}
+// Scan nonces four at a time from work->data[19]; returns the number of nonces stored in work->nonces.
+int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t Htarg = ptarget[7];
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76; // 19*4
+   // NOTE(review): Htarg above was read before this benchmark override - confirm intended
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   // the same edata is passed for all four lanes of vdata
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   blake256_4way_init( &allium_4way_ctx.blake );
+   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
+   // blake has absorbed the first 64 bytes once; allium_4way_hash copies that context per call
+   do {
+     be32enc( noncep,   n   );
+     be32enc( noncep+1, n+1 );
+     be32enc( noncep+2, n+2 );
+     be32enc( noncep+3, n+3 );
+
+     allium_4way_hash( hash, vdata );
+     pdata[19] = n;
+
+     for ( int i = 0; i < 4; i++ )
+     if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+     {
+         pdata[19] = n+i;
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
+     }
+     n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/lyra2/allium-gate.c
+++ b/algo/lyra2/allium-gate.c
@@ -0,0 +1,22 @@
+#include "allium-gate.h"
+
+int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }   // installed as gate->get_max64 by register_allium_algo
+// Fill the algo gate for allium: 4-way functions if ALLIUM_4WAY is defined, scalar otherwise.
+bool register_allium_algo( algo_gate_t* gate )
+{
+#if defined (ALLIUM_4WAY)
+  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
+  gate->scanhash  = (void*)&scanhash_allium_4way;
+  gate->hash      = (void*)&allium_4way_hash;
+#else
+  gate->miner_thread_init = (void*)&init_allium_ctx;
+  gate->scanhash  = (void*)&scanhash_allium;
+  gate->hash      = (void*)&allium_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->set_target        = (void*)&alt_set_target;
+  gate->get_max64         = (void*)&get_max64_0xFFFFLL;
+  return true;
+}
+
+
--- a/algo/lyra2/allium-gate.h
+++ b/algo/lyra2/allium-gate.h
@@ -0,0 +1,29 @@
+#ifndef ALLIUM_GATE_H__
+#define ALLIUM_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+#include "lyra2.h"
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define ALLIUM_4WAY
+#endif
+
+bool register_allium_algo( algo_gate_t* gate );
+
+#if defined(ALLIUM_4WAY)
+
+void allium_4way_hash( void *state, const void *input );
+int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+bool init_allium_4way_ctx();
+
+#endif
+
+void allium_hash( void *state, const void *input );
+int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+bool init_allium_ctx();
+
+#endif
+
--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -0,0 +1,112 @@
+#include "allium-gate.h"
+#include <memory.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#if defined(__AES__)
+#include "algo/groestl/aes_ni/hash-groestl256.h"
+#else
+#include "algo/groestl/sph_groestl.h"
+#endif
+#include "lyra2.h"
+
+typedef struct {
+        sph_blake256_context     blake;    // midstate, set up in scanhash_allium
+        sph_keccak256_context    keccak;
+        cubehashParam            cube;
+        sph_skein256_context     skein;
+#if defined (__AES__)
+        hashState_groestl256     groestl;
+#else
+        sph_groestl256_context   groestl;
+#endif
+} allium_ctx_holder;
+// per-thread template; allium_hash works on a copy of it
+static __thread allium_ctx_holder allium_ctx;
+// Initialize every member of the per-thread template except blake. Always returns true.
+bool init_allium_ctx()
+{
+        sph_keccak256_init( &allium_ctx.keccak );
+        cubehashInit( &allium_ctx.cube, 256, 16, 32 );
+        sph_skein256_init( &allium_ctx.skein );
+#if defined (__AES__)
+        init_groestl256( &allium_ctx.groestl, 32 );
+#else
+        sph_groestl256_init( &allium_ctx.groestl );
+#endif
+        return true;
+}
+// Single-lane chain: blake256, keccak256, Lyra2, cubehash, Lyra2, skein256, groestl256; 32 bytes to state.
+void allium_hash(void *state, const void *input)
+{
+    uint32_t hash[8] __attribute__ ((aligned (64)));
+    allium_ctx_holder ctx __attribute__ ((aligned (32)));
+    // work on a copy of the thread template; blake continues after the first 64 bytes of input
+    memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
+    sph_blake256( &ctx.blake, input + 64, 16 );
+    sph_blake256_close( &ctx.blake, hash );
+
+    sph_keccak256( &ctx.keccak, hash, 32 );
+    sph_keccak256_close( &ctx.keccak, hash );
+    // Lyra2 in place: the hash is output, password and salt
+    LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
+
+    cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
+
+    LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
+
+    sph_skein256( &ctx.skein, hash, 32 );
+    sph_skein256_close( &ctx.skein, hash );
+
+#if defined (__AES__)
+   update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
+#else
+   sph_groestl256( &ctx.groestl, hash, 32 );
+   sph_groestl256_close( &ctx.groestl, hash );
+#endif
+
+    memcpy(state, hash, 32);
+}
+// Scan nonces one at a time from work->data[19]; returns 1 when a hash passes fulltest, else 0.
+int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done )
+{
+    uint32_t _ALIGN(128) hash[8];
+    uint32_t _ALIGN(128) endiandata[20];
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t nonce = first_nonce;
+    // NOTE(review): Htarg above was read before this benchmark override - confirm intended
+    if ( opt_benchmark )
+        ptarget[7] = 0x3ffff;
+    // words 0..18 are converted here; word 19 (the nonce) is written inside the loop
+    for ( int i = 0; i < 19; i++ )
+        be32enc( &endiandata[i], pdata[i] );
+    // blake absorbs the first 64 bytes once; allium_hash copies that context per call
+    sph_blake256_init( &allium_ctx.blake );
+    sph_blake256( &allium_ctx.blake, endiandata, 64 );
+
+    do {
+        be32enc( &endiandata[19], nonce );
+        allium_hash( hash, endiandata );
+
+        if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+        {
+            work_set_target_ratio( work, hash );
+            pdata[19] = nonce;
+            *hashes_done = pdata[19] - first_nonce;
+            return 1;
+        }
+        nonce++;
+
+    } while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+    pdata[19] = nonce;
+    *hashes_done = pdata[19] - first_nonce + 1;
+    return 0;
+}
+
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -47,8 +47,9 @@
 */

 int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
-               uint64_t pwdlen, const void *salt, uint64_t saltlen,
-               uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
+               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+               const uint64_t timeCost, const uint64_t nRows,
+               const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -73,6 +74,8 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
   uint64_t *ptrWord = wholeMatrix;

+//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
   //but this ensures that the password copied locally will be overwritten as soon as possible
@@ -209,8 +212,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 }

 int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
-            uint64_t pwdlen, const void *salt, uint64_t saltlen,
-            uint64_t timeCost, uint64_t nRows, uint64_t nCols )
+            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+            const uint64_t timeCost, const uint64_t nRows,
+            const uint64_t nCols )
 {
    //========================== Basic variables ============================//
    uint64_t _ALIGN(256) state[16];
@@ -230,6 +234,8 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
 //    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

+//    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
    //==== Getting the password + salt + basil padded with 10*1 ============//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
    //but this ensures that the password copied locally will be overwritten as soon as possible
@@ -347,9 +353,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 }

 // Lyra2RE doesn't like the new wholeMatrix implementation
-int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
-             uint64_t pwdlen, const void *salt, uint64_t saltlen,
-             uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
+int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
+             const void *salt, const uint64_t saltlen, const uint64_t timeCost,
+             const uint64_t nRows, const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -377,15 +383,15 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;
-/*
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
+
+#if defined(__AVX2__)
+   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
 #elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
+   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
 #else
-   memset(wholeMatrix, 0, i);
+   memset( wholeMatrix, 0, i );
 #endif
-*/
+
   uint64_t *ptrWord = wholeMatrix;

   //=== Getting the password + salt + basil padded with 10*1 ==========//
@@ -406,8 +412,8 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
   memcpy(ptrByte, salt, saltlen);
   ptrByte += saltlen;

-   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
-                       - (saltlen + pwdlen) );
+//   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+//                       - (saltlen + pwdlen) );

   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
   memcpy(ptrByte, &kLen, sizeof(int64_t));
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -54,4 +54,6 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
            uint64_t pwdlen, const void *salt, uint64_t saltlen,
            uint64_t timeCost, uint64_t nRows, uint64_t nCols );

+int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
+
 #endif /* LYRA2_H_ */
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -61,12 +61,8 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 76; // 19*4
-   uint32_t *noncep1 = vdata + 77;
-   uint32_t *noncep2 = vdata + 78;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep= vdata + 76; // 19*4

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;
@@ -79,42 +75,20 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
   lyra2h_4way_midstate( vdata );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );

      be32enc( &edata[19], n );
      lyra2h_4way_hash( hash, vdata );

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
+          pdata[19] = n+i;         
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -7,9 +7,6 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
-
-#include "algo/cubehash/sph_cubehash.h"
-//#include "algo/bmw/sph_bmw.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h" 

 typedef struct {
@@ -18,19 +15,17 @@ typedef struct {
   cubehashParam             cube;
   skein256_4way_context     skein;
   bmw256_4way_context          bmw;
-//        sph_bmw256_context       bmw;
 } lyra2v2_4way_ctx_holder;

 static lyra2v2_4way_ctx_holder l2v2_4way_ctx;

-void init_lyra2rev2_4way_ctx()
+bool init_lyra2rev2_4way_ctx()
 {
-//   blake256_4way_init( &l2v2_4way_ctx.blake );
   keccak256_4way_init( &l2v2_4way_ctx.keccak );
   cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
   skein256_4way_init( &l2v2_4way_ctx.skein );
   bmw256_4way_init( &l2v2_4way_ctx.bmw );
-//        sph_bmw256_init( &l2v2_4way_ctx.bmw );
+   return true;
 }

 void lyra2rev2_4way_hash( void *state, const void *input )
@@ -45,7 +40,6 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );

   blake256_4way( &ctx.blake, input + (64<<2), 16 );
-//   blake256_4way( &ctx.blake, input, 80 );
   blake256_4way_close( &ctx.blake, vhash );

   mm256_reinterleave_4x64( vhash64, vhash, 256 );
@@ -54,11 +48,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

   LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
@@ -71,36 +65,20 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   skein256_4way_close( &ctx.skein, vhash64 );
   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );

-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
-   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

-
-   // BMW256 4way has a lane corruption problem, only lanes 0 & 2 produce
-   // good hash. As a result this ugly workaround of running bmw256-4way
-   // twice with data shuffled to get all 4 lanes of good hash.
-   // The hash is then shuffled back into the appropriate lanes for output.
-   // Not as fast but still faster than using sph serially. 
-
-   // shift lane 1 data to lane 2.
-   mm_interleave_4x32( vhash, hash0, hash0, hash1, hash1, 256 );
+   mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, vhash );
-   uint32_t trash[8] __attribute__ ((aligned (32)));
-   // extract lane 0 as usual and lane2 containing lane 1 hash
-   mm_deinterleave_4x32( state, trash, state+32, trash, vhash, 256 );
-   // shift lane2 data to lane 0 and lane 3 data to lane 2
-   mm_interleave_4x32( vhash, hash2, hash2, hash3, hash3, 256 );
-   bmw256_4way_init( &ctx.bmw );
-   bmw256_4way( &ctx.bmw, vhash, 32 );
-   bmw256_4way_close( &ctx.bmw, vhash );
-   // extract lane 2 hash from lane 0 and lane 3 hash from lane 2.
-   mm_deinterleave_4x32( state+64, trash, state+96, trash, vhash, 256 );
+
+   mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -115,12 +93,8 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   const uint32_t Htarg = ptarget[7];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 76; // 19*4
-   uint32_t *noncep1 = vdata + 77;
-   uint32_t *noncep2 = vdata + 78;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 76; // 19*4

   if ( opt_benchmark )
      ( (uint32_t*)ptarget )[7] = 0x0000ff;
@@ -133,46 +107,20 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );

      lyra2rev2_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-//printf("found0\n");
-          found[0] = true;
-          num_found++;
-          nonces[0] = pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-//printf("found1\n");
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-//printf("found2\n");
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-//printf("found3\n");
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
+          pdata[19] = n+i;         
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
--- a/algo/lyra2/lyra2rev2-gate.c
+++ b/algo/lyra2/lyra2rev2-gate.c
@@ -14,18 +14,20 @@ bool lyra2rev2_thread_init()

   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v2_wholeMatrix = _mm_malloc( i, 64 );
-
+#if defined (LYRA2REV2_4WAY)
+   init_lyra2rev2_4way_ctx();   // context init moved here from register_lyra2rev2_algo
+#else
+   init_lyra2rev2_ctx();        // context init moved here from register_lyra2rev2_algo
+#endif
   return l2v2_wholeMatrix;
 }

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
 #if defined (LYRA2REV2_4WAY)
-  init_lyra2rev2_4way_ctx();
  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
  gate->hash      = (void*)&lyra2rev2_4way_hash;
 #else
-  init_lyra2rev2_ctx();
  gate->scanhash  = (void*)&scanhash_lyra2rev2;
  gate->hash      = (void*)&lyra2rev2_hash;
 #endif
--- a/algo/lyra2/lyra2rev2-gate.h
+++ b/algo/lyra2/lyra2rev2-gate.h
@@ -20,7 +20,7 @@ void lyra2rev2_4way_hash( void *state, const void *input );
 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );

-void init_lyra2rev2_4way_ctx();
+bool init_lyra2rev2_4way_ctx();

 #endif

@@ -29,7 +29,7 @@ void lyra2rev2_hash( void *state, const void *input );
 int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );

-void init_lyra2rev2_ctx();
+bool init_lyra2rev2_ctx();

 #endif

--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -21,7 +21,7 @@ typedef struct {
 static lyra2v2_ctx_holder lyra2v2_ctx;
 static __thread sph_blake256_context l2v2_blake_mid;

-void init_lyra2rev2_ctx()
+bool init_lyra2rev2_ctx()
 {
        cubehashInit( &lyra2v2_ctx.cube1, 256, 16, 32 );
        cubehashInit( &lyra2v2_ctx.cube2, 256, 16, 32 );
@@ -29,6 +29,7 @@ void init_lyra2rev2_ctx()
        sph_keccak256_init( &lyra2v2_ctx.keccak );
        sph_skein256_init( &lyra2v2_ctx.skein );
        sph_bmw256_init( &lyra2v2_ctx.bmw );
+        return true;
 }

 void l2v2_blake256_midstate( const void* input )
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -61,12 +61,8 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;
-   uint32_t *noncep0 = vdata + 76; // 19*4
-   uint32_t *noncep1 = vdata + 77;
-   uint32_t *noncep2 = vdata + 78;
-   uint32_t *noncep3 = vdata + 79;
+   uint32_t *noncep = vdata + 76; // 19*4

   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;
@@ -79,42 +75,20 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
   lyra2z_4way_midstate( vdata );

   do {
-      found[0] = found[1] = found[2] = found[3] = false;
-      be32enc( noncep0, n   );
-      be32enc( noncep1, n+1 );
-      be32enc( noncep2, n+2 );
-      be32enc( noncep3, n+3 );
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );

      lyra2z_4way_hash( hash, vdata );
      pdata[19] = n;

-      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
      {
-          found[0] = true;
-          num_found++;
-          nonces[0] = pdata[19] = n;
-          work_set_target_ratio( work, hash );
-      }
-      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
-      {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash+8 );
-      }
-      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
-      {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash+16 );
-      }
-      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
-      {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash+24 );
+          pdata[19] = n+i;         
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
      n += 4;
   } while ( (num_found == 0) && (n < max_nonce-4)
@@ -126,3 +100,115 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

 #endif

+#if defined(LYRA2Z_8WAY)
+
+__thread uint64_t* lyra2z_8way_matrix;
+
+bool lyra2z_8way_thread_init()   // allocates the thread-local Lyra2z matrix; result is false when _mm_malloc returns NULL
+{
+ return ( lyra2z_8way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );   // 64-byte aligned; the stored pointer is converted to bool
+}
+
+static __thread blake256_8way_context l2z_8way_blake_mid;
+
+void lyra2z_8way_midstate( const void* input )   // caches the blake256 8-way state after the leading part of the input
+{
+       blake256_8way_init( &l2z_8way_blake_mid );         // start from a fresh context in the thread-local midstate
+       blake256_8way( &l2z_8way_blake_mid, input, 64 );   // absorb with length 64; lyra2z_8way_hash copies this state and continues
+}
+
+void lyra2z_8way_hash( void *state, const void *input )   // writes eight 32-byte hashes, lane k at state + 32*k
+{
+     uint32_t hash0[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t hash4[8] __attribute__ ((aligned (64)));
+     uint32_t hash5[8] __attribute__ ((aligned (64)));
+     uint32_t hash6[8] __attribute__ ((aligned (64)));
+     uint32_t hash7[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+     blake256_8way_context ctx_blake __attribute__ ((aligned (64)));
+     // Resume from the state saved by lyra2z_8way_midstate, then hash the data that follows the first 64*8 bytes.
+     memcpy( &ctx_blake, &l2z_8way_blake_mid, sizeof l2z_8way_blake_mid );
+     blake256_8way( &ctx_blake, input + (64*8), 16 );
+     blake256_8way_close( &ctx_blake, vhash );
+     // Split the 8-way digest into one 256-bit buffer per lane.
+     mm256_deinterleave_8x32( hash0, hash1, hash2, hash3,
+                              hash4, hash5, hash6, hash7, vhash, 256 );
+     // LYRA2Z is run once per lane, in place, reusing the same thread-local matrix each time.
+     LYRA2Z( lyra2z_8way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash4, 32, hash4, 32, hash4, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash5, 32, hash5, 32, hash5, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash6, 32, hash6, 32, hash6, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_8way_matrix, hash7, 32, hash7, 32, hash7, 32, 8, 8, 8 );
+     // Each lane's result goes to its own 32-byte slot; slots 4..7 must come from hash4..hash7.
+     memcpy( state,     hash0, 32 );
+     memcpy( state+ 32, hash1, 32 );
+     memcpy( state+ 64, hash2, 32 );
+     memcpy( state+ 96, hash3, 32 );
+     memcpy( state+128, hash4, 32 );
+     memcpy( state+160, hash5, 32 );
+     memcpy( state+192, hash6, 32 );
+     memcpy( state+224, hash7, 32 );
+}
+
+int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )   // returns the number of nonces stored in work->nonces
+{
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 152; // 19*8
+   // Benchmark mode overrides the top target word.
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+   // Big-endian encode header words 0..18; word 19 (the nonce) is written per lane inside the loop.
+   for ( int i=0; i < 19; i++ )
+      be32enc( &edata[i], pdata[i] );
+   // All eight lanes start from the same header data.
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+   // Cache the blake256 state for the part of the input that does not change per nonce.
+   lyra2z_8way_midstate( vdata );
+
+   do {
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );
+      be32enc( noncep+4, n+4 );
+      be32enc( noncep+5, n+5 );
+      be32enc( noncep+6, n+6 );
+      be32enc( noncep+7, n+7 );
+
+      lyra2z_8way_hash( hash, vdata );
+      pdata[19] = n;
+      // Lane i's 8-word hash starts at hash + 8*i; the cheap top-word compare runs before fulltest.
+      for ( int i = 0; i < 8; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+   } while ( (num_found == 0) && (n < max_nonce-8)   // 8 nonces per pass, so leave room for a full batch
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+
+#endif
--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -8,7 +8,11 @@ void lyra2z_set_target( struct work* work, double job_diff )

 bool register_lyra2z_algo( algo_gate_t* gate )
 {
-#ifdef LYRA2Z_4WAY
+#if defined(LYRA2Z_8WAY)
+  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
+  gate->hash       = (void*)&lyra2z_8way_hash;
+#elif defined(LYRA2Z_4WAY)
  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
  gate->hash       = (void*)&lyra2z_4way_hash;
--- a/algo/lyra2/lyra2z-gate.h
+++ b/algo/lyra2/lyra2z-gate.h
@@ -1,17 +1,29 @@
 #ifndef LYRA2Z_GATE_H__
-#define LYRA2Z_GATE_H__
+#define LYRA2Z_GATE_H__ 1

 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
+#if defined(__AVX__)
  #define LYRA2Z_4WAY
 #endif
+#if defined(__AVX2__)
+//  #define LYRA2Z_8WAY
+#endif


 #define LYRA2Z_MATRIX_SIZE  BLOCK_LEN_INT64 * 8 * 8 * 8

-#if defined(LYRA2Z_4WAY)
+#if defined(LYRA2Z_8WAY)
+
+void lyra2z_8way_hash( void *state, const void *input );
+
+int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+bool lyra2z_8way_thread_init();
+
+#elif defined(LYRA2Z_4WAY)

 void lyra2z_4way_hash( void *state, const void *input );

@@ -20,7 +32,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

 bool lyra2z_4way_thread_init();

-#endif
+#else

 void lyra2z_hash( void *state, const void *input );

@@ -31,3 +43,4 @@ bool lyra2z_thread_init();

 #endif

+#endif
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -55,23 +55,23 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, updates all args
 #define G_4X64(a,b,c,d) \
   a = _mm256_add_epi64( a, b ); \
-   d = mm256_rotr_64( _mm256_xor_si256( d, a), 32 ); \
+   d = mm256_ror_64( _mm256_xor_si256( d, a), 32 ); \
   c = _mm256_add_epi64( c, d ); \
-   b = mm256_rotr_64( _mm256_xor_si256( b, c ), 24 ); \
+   b = mm256_ror_64( _mm256_xor_si256( b, c ), 24 ); \
   a = _mm256_add_epi64( a, b ); \
-   d = mm256_rotr_64( _mm256_xor_si256( d, a ), 16 ); \
+   d = mm256_ror_64( _mm256_xor_si256( d, a ), 16 ); \
   c = _mm256_add_epi64( c, d ); \
-   b = mm256_rotr_64( _mm256_xor_si256( b, c ), 63 );
+   b = mm256_ror_64( _mm256_xor_si256( b, c ), 63 );

 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotr256_1x64( s1); \
+   s1 = mm256_ror256_1x64( s1); \
   s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rotl256_1x64( s3 ); \
+   s3 = mm256_rol256_1x64( s3 ); \
   G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotl256_1x64( s1 ); \
+   s1 = mm256_rol256_1x64( s1 ); \
   s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rotr256_1x64( s3 );
+   s3 = mm256_ror256_1x64( s3 );

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -94,25 +94,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, all args updated
 #define G_2X64(a,b,c,d) \
   a = _mm_add_epi64( a, b ); \
-   d = mm_rotr_64( _mm_xor_si128( d, a), 32 ); \
+   d = mm_ror_64( _mm_xor_si128( d, a), 32 ); \
   c = _mm_add_epi64( c, d ); \
-   b = mm_rotr_64( _mm_xor_si128( b, c ), 24 ); \
+   b = mm_ror_64( _mm_xor_si128( b, c ), 24 ); \
   a = _mm_add_epi64( a, b ); \
-   d = mm_rotr_64( _mm_xor_si128( d, a ), 16 ); \
+   d = mm_ror_64( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi64( c, d ); \
-   b = mm_rotr_64( _mm_xor_si128( b, c ), 63 );
+   b = mm_ror_64( _mm_xor_si128( b, c ), 63 );

 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm_rotl256_1x64( s2, s3 ); \
+   mm_ror256_1x64( s2, s3 ); \
   mm_swap_128( s4, s5 ); \
-   mm_rotr256_1x64( s6, s7 ); \
+   mm_rol256_1x64( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm_rotr256_1x64( s2, s3 ); \
+   mm_rol256_1x64( s2, s3 ); \
   mm_swap_128( s4, s5 ); \
-   mm_rotl256_1x64( s6, s7 );
+   mm_ror256_1x64( s6, s7 );

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
--- a/algo/neoscrypt/neoscrypt.c
+++ b/algo/neoscrypt/neoscrypt.c
@@ -85,12 +85,12 @@ typedef unsigned int  uint;
    U32TO8_BE((p) + 4, (uint32_t)((v)      ));


-typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];
+typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE] __attribute__ ((aligned (16)));


 /* SHA-256 */

-static const uint32_t sha256_constants[64] = {
+static const uint32_t sha256_constants[64] __attribute__ ((aligned (16))) = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
@@ -123,10 +123,10 @@ static const uint32_t sha256_constants[64] = {


 typedef struct sha256_hash_state_t {
-    uint32_t H[8];
+    uint32_t H[8] __attribute__ ((aligned (16)));
    uint64_t T;
    uint32_t leftover;
-    uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
+    uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16)));
 } sha256_hash_state;


@@ -242,7 +242,7 @@ typedef struct sha256_hmac_state_t {
 } sha256_hmac_state;

 static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) {
-    uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
+    uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] __attribute__ ((aligned (16))) = {0};
    size_t i;

    neoscrypt_hash_init_sha256(&st->inner);
@@ -570,17 +570,17 @@ typedef struct blake2s_param_t {

 /* State block of 180 bytes */
 typedef struct blake2s_state_t {
-    uint  h[8];
+    uint  h[8] __attribute__ ((aligned (16)));
    uint  t[2];
    uint  f[2];
-    uchar buf[2 * BLAKE2S_BLOCK_SIZE];
+    uchar buf[2 * BLAKE2S_BLOCK_SIZE] __attribute__ ((aligned (16)));
    uint  buflen;
 } blake2s_state;

 static void blake2s_compress(blake2s_state *S, const void *buf) {
    uint i;
-    uint m[16];
-    uint v[16];
+    uint m[16] __attribute__ ((aligned (16)));
+    uint v[16] __attribute__ ((aligned (16)));

    neoscrypt_copy(m, buf, 64);
    neoscrypt_copy(v, S, 32);
@@ -1082,6 +1082,7 @@ void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )

 bool register_neoscrypt_algo( algo_gate_t* gate )
 {
+  gate->optimizations         = SSE2_OPT;
  gate->scanhash              = (void*)&scanhash_neoscrypt;
  gate->hash                  = (void*)&neoscrypt;
  gate->get_max64             = (void*)&get_neoscrypt_max64;
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -79,12 +79,8 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
     const uint32_t first_nonce = pdata[19];
     const uint32_t Htarg = ptarget[7];
     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+     uint32_t *noncep = vdata + 73;   // 9*8 + 1

     uint64_t htmax[] = {          0,
                                 0xF,
@@ -117,47 +113,22 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
           uint32_t mask = masks[m];

           do {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+              be32enc( noncep,   n   );
+              be32enc( noncep+2, n+1 );
+              be32enc( noncep+4, n+2 );
+              be32enc( noncep+6, n+3 );

              nist5hash_4way( hash, vdata );

              pdata[19] = n;

-              if ( ( !(hash[7] & mask) )
-                   && fulltest( hash, ptarget ) ) 
+              for ( int i = 0; i < 4; i++ )   // lane i's 8-word hash starts at hash + 8*i
+              if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )   // candidate only when the masked bits are all zero
+                 && fulltest( hash+(i<<3), ptarget ) )
              {
-                 found[0] = true;
-                 num_found++;
-                 nonces[0] = n; 
-                 work_set_target_ratio( work, hash );
-              }
-              if ( ( !((hash+8)[7] & mask) )
-                   && fulltest( hash+8, ptarget ) )
-              {
-                 found[1] = true;
-                 num_found++;
-                 nonces[1] = n+1;
-                 work_set_target_ratio( work, hash+8 );
-              }
-              if ( ( !((hash+16)[7] & mask) )
-                 && fulltest( hash+16, ptarget ) )
-              {
-                 found[2] = true;
-                 num_found++;
-                 nonces[2] = n+2;
-                 work_set_target_ratio( work, hash+16 );
-              }
-              if ( ( !((hash+24)[7] & mask) )
-                   && fulltest( hash+24, ptarget ) )
-              {
-                 found[3] = true;
-                 num_found++;
-                 nonces[3] = n+3;
-                 work_set_target_ratio( work, hash+24 );
+                 pdata[19] = n+i;         
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, hash+(i<<3) );
              }
              n += 4;
           } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -21,6 +21,7 @@ void nist5hash( void *state, const void *input );

 int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );
+void init_nist5_ctx();
 #endif

 #endif
--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -60,7 +60,7 @@ void anime_4way_hash( void *state, const void *input )
    blake512_4way_close( &ctx.blake, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  mm256_zero );
+                                  m256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void anime_4way_hash( void *state, const void *input )
    jh512_4way_close( &ctx.jh, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  mm256_zero );
+                                  m256_zero );

       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void anime_4way_hash( void *state, const void *input )
    skein512_4way_close( &ctx.skein, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  mm256_zero );
+                                  m256_zero );

       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
@@ -145,12 +145,8 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;
-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1
    const uint32_t Htarg = ptarget[7];
    uint64_t htmax[] = {
                0,
@@ -181,42 +177,21 @@ int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,

          do
          {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+             be32enc( noncep,   n   );
+             be32enc( noncep+2, n+1 );
+             be32enc( noncep+4, n+2 );
+             be32enc( noncep+6, n+3 );

-              anime_4way_hash( hash, vdata );
-              pdata[19] = n;
+             anime_4way_hash( hash, vdata );
+             pdata[19] = n;

-             if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) 
+             for ( int i = 0; i < 4; i++ )
+             if ( ( ( (hash+(i<<3))[7] & mask ) == 0 )
+                && fulltest( hash+(i<<3), ptarget ) )
             {
-                found[0] = true;
-                num_found++;
-                nonces[0] = n;
-                work_set_target_ratio( work, hash );
-             }
-             if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
-             {
-                found[1] = true;
-                num_found++;
-                nonces[1] = n+1;
-                work_set_target_ratio( work, hash );
-             }
-             if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
-             {
-                found[2] = true;
-                num_found++;
-                nonces[2] = n+2;
-                work_set_target_ratio( work, hash );
-             }
-             if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
-             {
-                found[3] = true;
-                num_found++;
-                nonces[3] = n+3;
-                work_set_target_ratio( work, hash );
+                pdata[19] = n+i;
+                nonces[ num_found++ ] = n+i;
+                work_set_target_ratio( work, hash+(i<<3) );
             }
             n += 4;
          } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -60,7 +60,7 @@ void quark_4way_hash( void *state, const void *input )
    bmw512_4way_close( &ctx.bmw, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  mm256_zero );
+                                  m256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
       update_and_final_groestl( &ctx.groestl, (char*)hash0,
@@ -97,7 +97,7 @@ void quark_4way_hash( void *state, const void *input )
    jh512_4way_close( &ctx.jh, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  mm256_zero );
+                                  m256_zero );

       blake512_4way_init( &ctx.blake );
       blake512_4way( &ctx.blake, vhash, 64 );
@@ -118,7 +118,7 @@ void quark_4way_hash( void *state, const void *input )
    skein512_4way_close( &ctx.skein, vhash );

    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
-                                  mm256_zero );
+                                  m256_zero );

       keccak512_4way_init( &ctx.keccak );
       keccak512_4way( &ctx.keccak, vhash, 64 );
@@ -145,12 +145,8 @@ int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
    uint32_t n = pdata[19];
    const uint32_t first_nonce = pdata[19];
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;
-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1

    swab32_array( endiandata, pdata, 20 );

@@ -159,42 +155,21 @@ int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,

    do
    {
-       found[0] = found[1] = found[2] = found[3] = false;
-       be32enc( noncep0, n   );
-       be32enc( noncep1, n+1 );
-       be32enc( noncep2, n+2 );
-       be32enc( noncep3, n+3 );
+       be32enc( noncep,   n   );
+       be32enc( noncep+2, n+1 );
+       be32enc( noncep+4, n+2 );
+       be32enc( noncep+6, n+3 );

       quark_4way_hash( hash, vdata );
       pdata[19] = n;

-       if ( ( hash[7] & 0xFFFFFF00 ) == 0 && fulltest( hash, ptarget ) ) 
+       for ( int i = 0; i < 4; i++ )
+       if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
+            && fulltest( hash+(i<<3), ptarget ) )
       {
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          work_set_target_ratio( work, hash );
-       }
-       if ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+8, ptarget ) )
-       {
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-          work_set_target_ratio( work, hash );
-       }
-       if ( ( (hash+16)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+16, ptarget ) )
-       {
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-          work_set_target_ratio( work, hash );
-       }
-       if ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 && fulltest( hash+24, ptarget ) )
-       {
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-          work_set_target_ratio( work, hash );
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
       }
       n += 4;
    } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -0,0 +1,124 @@
+#include "deep-gate.h"
+
+#if defined(DEEP_2WAY)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/shavite/sph_shavite.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+typedef struct
+{  // one hashing context per stage used by deep_2way_hash
+        luffa_2way_context      luffa;     // 2-way luffa state
+        cubehashParam           cube;
+        sph_shavite512_context  shavite;
+        hashState_echo          echo;
+} deep_2way_ctx_holder;
+
+deep_2way_ctx_holder deep_2way_ctx;  // template state; deep_2way_hash copies it on every call
+
+void init_deep_2way_ctx()
+{  // initialise every stage of the shared template context deep_2way_ctx
+        luffa_2way_init( &deep_2way_ctx.luffa, 512 );
+        cubehashInit(&deep_2way_ctx.cube,512,16,32);
+        sph_shavite512_init(&deep_2way_ctx.shavite);
+        init_echo(&deep_2way_ctx.echo, 512);
+}
+
+void deep_2way_hash( void *output, const void *input )
+{  // 2-way deep chain: luffa -> cubehash -> shavite -> echo; writes 2 x 32 bytes to output
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*2] __attribute__ ((aligned (64)));
+     deep_2way_ctx_holder ctx;
+
+     memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) );  // private copy of the template state
+     luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );   // skip 64 bytes per lane (pre-hashed by scanhash_deep_2way)
+     luffa_2way_close( &ctx.luffa, vhash );
+     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );   // split the lanes for the 1-way stages
+
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
+                           (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) );  // restore initial state for lane 1
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &deep_2way_ctx.shavite,
+             sizeof(sph_shavite512_context) );                // restore initial state for lane 1
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) );  // restore initial state for lane 1
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+
+     memcpy( output,    hash0, 32 );   // lane 0 result: first 32 bytes of its hash
+     memcpy( output+32, hash1, 32 );   // lane 1 result follows at byte offset 32
+}
+
+int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{  // tests nonces two per iteration; returns the number stored in work->nonces
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     uint32_t *nonces = work->nonces;
+     int num_found = 0;
+     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
+     const uint32_t Htarg = ptarget[7];
+     uint64_t htmax[] = {          0,        0xF,       0xFF,
+                               0xFFF,     0xFFFF, 0x10000000  };
+     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                          0xFFFFF000, 0xFFFF0000,          0  };  // masks[m] pre-filters hash[7] when Htarg <= htmax[m]
+
+     // swab32_array over all 20 header words, then interleave two copies
+     swab32_array( endiandata, pdata, 20 );
+
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
+
+     luffa_2way_init( &deep_2way_ctx.luffa, 512 );  // NOTE(review): writes the plain global context; confirm this is safe with several miner threads
+     luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 );  // pre-hash the nonce-independent first 64 bytes
+
+     for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )  // first matching entry only (see break); none matches if Htarg > 0x10000000
+     {
+        uint32_t mask = masks[m];
+        do
+        {
+            be32enc( noncep,   n   );  // lane 0 nonce
+            be32enc( noncep+4, n+1 );  // lane 1 nonce
+
+            deep_2way_hash( hash, vdata );
+            pdata[19] = n;
+
+            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
+            {
+               nonces[ num_found++ ] = n;
+               work_set_target_ratio( work, hash );
+            }
+            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )  // lane 1 result starts at word 8
+            {
+               nonces[ num_found++ ] = n+1;
+               work_set_target_ratio( work, hash+8 );
+            }
+            n += 2;
+         } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+         break;
+     }
+     *hashes_done = n - first_nonce + 1;  // NOTE(review): n is already past the last tested nonce; the +1 looks like an overcount
+     return num_found;
+}
+
+#endif
--- a/algo/qubit/deep-gate.c
+++ b/algo/qubit/deep-gate.c
@@ -0,0 +1,17 @@
+#include "deep-gate.h"
+
+bool register_deep_algo( algo_gate_t* gate )
+{  // install the deep scanhash/hash pair selected at compile time; always returns true
+#if defined (DEEP_2WAY)
+  init_deep_2way_ctx();
+  gate->scanhash  = (void*)&scanhash_deep_2way;
+  gate->hash      = (void*)&deep_2way_hash;
+#else
+  init_deep_ctx();
+  gate->scanhash  = (void*)&scanhash_deep;
+  gate->hash      = (void*)&deep_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  return true;
+}
+
--- a/algo/qubit/deep-gate.h
+++ b/algo/qubit/deep-gate.h
@@ -0,0 +1,32 @@
+#ifndef DEEP_GATE_H__
+#define DEEP_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define DEEP_2WAY
+#endif
+
+bool register_deep_algo( algo_gate_t* gate );
+
+#if defined(DEEP_2WAY)
+
+void deep_2way_hash( void *state, const void *input );
+
+int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_deep_2way_ctx();
+
+#endif
+
+void deep_hash( void *state, const void *input );
+
+int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_deep_ctx();
+
+#endif
+
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -1,9 +1,9 @@
-#include "algo-gate-api.h"
+#include "deep-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/luffa/sse2/luffa_for_sse2.h" 
+#include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
@@ -139,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-bool register_deep_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  init_deep_ctx();
-  gate->scanhash = (void*)&scanhash_deep;
-  gate->hash     = (void*)&deep_hash;
-  return true;
-};
-
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -0,0 +1,131 @@
+#include "qubit-gate.h"
+
+#if defined(QUBIT_2WAY)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+typedef struct
+{  // one hashing context per stage used by qubit_2way_hash
+        luffa_2way_context      luffa;     // 2-way luffa state
+        cubehashParam           cube;
+        sph_shavite512_context  shavite;
+        simd_2way_context       simd;      // 2-way simd state
+        hashState_echo          echo;
+} qubit_2way_ctx_holder;
+
+qubit_2way_ctx_holder qubit_2way_ctx;  // template state; qubit_2way_hash copies it on every call
+
+void init_qubit_2way_ctx()
+{  // initialise the template contexts; luffa is set up in scanhash_qubit_2way instead
+        cubehashInit(&qubit_2way_ctx.cube,512,16,32);
+        sph_shavite512_init(&qubit_2way_ctx.shavite);
+        simd_2way_init( &qubit_2way_ctx.simd, 512 );
+        init_echo(&qubit_2way_ctx.echo, 512);
+}
+
+void qubit_2way_hash( void *output, const void *input )
+{  // 2-way qubit chain: luffa -> cubehash -> shavite -> simd -> echo; writes 2 x 32 bytes
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*2] __attribute__ ((aligned (64)));
+     qubit_2way_ctx_holder ctx;
+
+     memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) );  // private copy of the template state
+     luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );     // skip 64 bytes per lane (pre-hashed by scanhash_qubit_2way)
+     luffa_2way_close( &ctx.luffa, vhash );
+     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );     // split the lanes for the 1-way stages
+
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
+                           (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );  // restore initial state for lane 1
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
+             sizeof(sph_shavite512_context) );                  // restore initial state for lane 1
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+
+     mm256_interleave_2x128( vhash, hash0, hash1, 512 );       // re-interleave for the 2-way simd stage
+     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
+     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );  // restore initial state for lane 1
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+
+     memcpy( output,    hash0, 32 );   // lane 0 result: first 32 bytes of its hash
+     memcpy( output+32, hash1, 32 );   // lane 1 result follows at byte offset 32
+}
+
+int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{  // tests nonces two per iteration; returns the number stored in work->nonces
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     uint32_t *nonces = work->nonces;
+     int num_found = 0;
+     uint32_t *noncep = vdata + 32+3;   // 4*8 + 3
+     const uint32_t Htarg = ptarget[7];
+     uint64_t htmax[] = {          0,        0xF,       0xFF,
+                               0xFFF,     0xFFFF, 0x10000000  };
+     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                          0xFFFFF000, 0xFFFF0000,          0  };  // masks[m] pre-filters hash[7] when Htarg <= htmax[m]
+
+     // swab32_array over all 20 header words, then interleave two copies
+     swab32_array( endiandata, pdata, 20 );
+
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
+
+     luffa_2way_init( &qubit_2way_ctx.luffa, 512 );  // NOTE(review): writes the plain global context; confirm this is safe with several miner threads
+     luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 );  // pre-hash the nonce-independent first 64 bytes
+
+     for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )  // first matching entry only (see break); none matches if Htarg > 0x10000000
+     {
+        uint32_t mask = masks[m];
+        do
+        {
+            be32enc( noncep,   n   );  // lane 0 nonce
+            be32enc( noncep+4, n+1 );  // lane 1 nonce
+            qubit_2way_hash( hash, vdata );
+            pdata[19] = n;
+
+
+            if ( !( hash[7] & mask ) && fulltest( hash, ptarget) )
+            {
+               nonces[ num_found++ ] = n;
+               work_set_target_ratio( work, hash );
+            }
+            if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) )  // lane 1 result starts at word 8
+            {
+               pdata[19] = n+1;
+               nonces[ num_found++ ] = n+1;
+               work_set_target_ratio( work, hash+8 );
+            }
+            n += 2;
+         } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+         break;
+     }
+     *hashes_done = n - first_nonce + 1;  // NOTE(review): n is already past the last tested nonce; the +1 looks like an overcount
+     return num_found;
+}
+
+#endif
--- a/algo/qubit/qubit-gate.c
+++ b/algo/qubit/qubit-gate.c
@@ -0,0 +1,17 @@
+#include "qubit-gate.h"
+
+bool register_qubit_algo( algo_gate_t* gate )
+{  // install the qubit scanhash/hash pair selected at compile time; always returns true
+#if defined (QUBIT_2WAY)
+  init_qubit_2way_ctx();
+  gate->scanhash  = (void*)&scanhash_qubit_2way;
+  gate->hash      = (void*)&qubit_2way_hash;
+#else
+  init_qubit_ctx();
+  gate->scanhash  = (void*)&scanhash_qubit;
+  gate->hash      = (void*)&qubit_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  return true;
+}
+
--- a/algo/qubit/qubit-gate.h
+++ b/algo/qubit/qubit-gate.h
@@ -0,0 +1,32 @@
+#ifndef QUBIT_GATE_H__
+#define QUBIT_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define QUBIT_2WAY
+#endif
+
+bool register_qubit_algo( algo_gate_t* gate );
+
+#if defined(QUBIT_2WAY)
+
+void qubit_2way_hash( void *state, const void *input );
+
+int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_qubit_2way_ctx();
+
+#endif
+
+void qubit_hash( void *state, const void *input );
+
+int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_qubit_ctx();
+
+#endif
+
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -1,11 +1,11 @@
-#include "algo-gate-api.h"
+#include "qubit-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "algo/luffa/sse2/luffa_for_sse2.h" 
+#include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
@@ -48,7 +48,7 @@ void qubit_luffa_midstate( const void* input )
    update_luffa( &qubit_luffa_mid, input, 64 );
 }

-void qubithash(void *output, const void *input)
+void qubit_hash(void *output, const void *input)
 {
        unsigned char hash[128] __attribute((aligned(64)));
        #define hashB hash+64
@@ -115,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work,
                {
 	            pdata[19] = ++n;
 		    be32enc(&endiandata[19], n);
-		    qubithash(hash64, endiandata);
+		    qubit_hash(hash64, endiandata);
 #ifndef DEBUG_ALGO
 		    if (!(hash64[7] & mask))
                    {
@@ -151,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work,
 	return 0;
 }

-bool register_qubit_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  init_qubit_ctx();
-  gate->scanhash = (void*)&scanhash_qubit;
-  gate->hash     = (void*)&qubithash;
-  return true;
-};
-
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -0,0 +1,245 @@
+#include "lbry-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/sha/sha2-hash-4way.h"
+#include "ripemd-hash-4way.h"
+
+#define LBRY_INPUT_SIZE 112   // total bytes fed to the first sha256 pass
+#define LBRY_MIDSTATE    64   // leading bytes pre-hashed once by scanhash
+#define LBRY_TAIL ( (LBRY_INPUT_SIZE) - (LBRY_MIDSTATE) )  // bytes left per hash call; parenthesised so it expands safely
+
+#if defined(LBRY_8WAY)
+
+static __thread sha256_8way_context sha256_8w_mid;
+
+void lbry_8way_hash( void* output, const void* input )
+{  // 8-way lbry chain: sha256 x2 -> sha512 -> ripemd160 (both halves) -> sha256 x2
+   uint32_t _ALIGN(64) vhashA[16<<3];
+   uint32_t _ALIGN(64) vhashB[16<<3];
+   uint32_t _ALIGN(64) vhashC[16<<3];
+   uint32_t _ALIGN(32) h0[32];
+   uint32_t _ALIGN(32) h1[32];
+   uint32_t _ALIGN(32) h2[32];
+   uint32_t _ALIGN(32) h3[32];
+   uint32_t _ALIGN(32) h4[32];
+   uint32_t _ALIGN(32) h5[32];
+   uint32_t _ALIGN(32) h6[32];
+   uint32_t _ALIGN(32) h7[32];
+   sha256_8way_context     ctx_sha256 __attribute__ ((aligned (64)));
+   sha512_4way_context     ctx_sha512;
+   ripemd160_8way_context  ctx_ripemd;
+
+   memcpy( &ctx_sha256, &sha256_8w_mid, sizeof(ctx_sha256) );  // resume from the midstate saved by scanhash
+   sha256_8way( &ctx_sha256, input + (LBRY_MIDSTATE<<3), LBRY_TAIL );  // skip LBRY_MIDSTATE bytes in each of 8 lanes
+   sha256_8way_close( &ctx_sha256, vhashA );
+
+   sha256_8way_init( &ctx_sha256 );
+   sha256_8way( &ctx_sha256, vhashA, 32 );
+   sha256_8way_close( &ctx_sha256, vhashA );
+
+   // reinterleave to do sha512 4-way 64 bit twice.
+   mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhashA, 256 );
+   mm256_interleave_4x64( vhashA, h0, h1, h2, h3, 256 );  // lanes 0-3
+   mm256_interleave_4x64( vhashB, h4, h5, h6, h7, 256 );  // lanes 4-7
+
+   sha512_4way_init( &ctx_sha512 );
+   sha512_4way( &ctx_sha512, vhashA, 32 );
+   sha512_4way_close( &ctx_sha512, vhashA );
+
+   sha512_4way_init( &ctx_sha512 );
+   sha512_4way( &ctx_sha512, vhashB, 32 );
+   sha512_4way_close( &ctx_sha512, vhashB );
+
+   // back to 8-way 32 bit
+   mm256_deinterleave_4x64( h0, h1, h2, h3, vhashA, 512 );
+   mm256_deinterleave_4x64( h4, h5, h6, h7, vhashB, 512 );
+   mm256_interleave_8x32( vhashA, h0, h1, h2, h3, h4, h5, h6, h7, 512 );
+
+   ripemd160_8way_init( &ctx_ripemd );
+   ripemd160_8way( &ctx_ripemd, vhashA, 32 );          // first 32 bytes of each lane
+   ripemd160_8way_close( &ctx_ripemd, vhashB );
+
+   ripemd160_8way_init( &ctx_ripemd );
+   ripemd160_8way( &ctx_ripemd, vhashA+(8<<3), 32 );   // next 32 bytes: 8 words x 8 lanes further on
+   ripemd160_8way_close( &ctx_ripemd, vhashC );
+
+   sha256_8way_init( &ctx_sha256 );
+   sha256_8way( &ctx_sha256, vhashB, 20 );             // both ripemd results go into one sha256
+   sha256_8way( &ctx_sha256, vhashC, 20 );
+   sha256_8way_close( &ctx_sha256, vhashA );
+
+   sha256_8way_init( &ctx_sha256 );
+   sha256_8way( &ctx_sha256, vhashA, 32 );
+   sha256_8way_close( &ctx_sha256, vhashA );
+
+   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
+                            output+128, output+160, output+192, output+224,
+                            vhashA, 256 );   // lane i's result lands at byte offset 32*i of output
+}
+
+int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done)
+{  // tests nonces eight per iteration; returns the number stored in work->nonces
+   uint32_t hash[8*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[32*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[27];
+   const uint32_t first_nonce = pdata[27];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t edata[32] __attribute__ ((aligned (64)));
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 216; // 27*8
+
+   uint64_t htmax[] = {          0,        0xF,       0xFF,
+                             0xFFF,     0xFFFF, 0x10000000 };
+   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                        0xFFFFF000, 0xFFFF0000,          0 };
+
+   // we need bigendian data...
+   swab32_array( edata, pdata, 32 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 1024 );
+   sha256_8way_init( &sha256_8w_mid );
+   sha256_8way( &sha256_8w_mid, vdata, LBRY_MIDSTATE );  // midstate reused by every lbry_8way_hash call
+
+   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   {  // bound is the 6 table entries; sizeof(masks) is a byte count and overran both tables
+      uint32_t mask = masks[m];
+      do
+      {
+         be32enc( noncep,   n   );
+         be32enc( noncep+1, n+1 );
+         be32enc( noncep+2, n+2 );
+         be32enc( noncep+3, n+3 );
+         be32enc( noncep+4, n+4 );
+         be32enc( noncep+5, n+5 );
+         be32enc( noncep+6, n+6 );
+         be32enc( noncep+7, n+7 );
+
+         lbry_8way_hash( hash, vdata );
+
+         for ( int i = 0; i < 8; i++ )   // lane i's 8-word result starts at hash + 8*i
+         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
+         {
+            pdata[27] = n+i;
+            nonces[ num_found++ ] = n+i;
+            work_set_target_ratio( work, hash+(i<<3) );
+         }
+         n+=8;
+      } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+      break;   // only the first matching table entry is used
+   }
+
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+
+#elif defined(LBRY_4WAY)
+
+static __thread sha256_4way_context sha256_mid;
+
+void lbry_4way_hash( void* output, const void* input )
+{  // 4-way lbry chain: sha256 x2 -> sha512 -> ripemd160 (both halves) -> sha256 x2
+   sha256_4way_context     ctx_sha256 __attribute__ ((aligned (64)));
+   sha512_4way_context     ctx_sha512;
+   ripemd160_4way_context  ctx_ripemd;
+   uint32_t _ALIGN(64) vhashA[16<<2];
+   uint32_t _ALIGN(64) vhashB[16<<2];
+   uint32_t _ALIGN(64) vhashC[16<<2];
+
+   memcpy( &ctx_sha256, &sha256_mid, sizeof(ctx_sha256) );  // resume from the midstate saved by scanhash
+   sha256_4way( &ctx_sha256, input + (LBRY_MIDSTATE<<2), LBRY_TAIL );  // skip LBRY_MIDSTATE bytes in each of 4 lanes
+   sha256_4way_close( &ctx_sha256, vhashA );
+
+   sha256_4way_init( &ctx_sha256 );
+   sha256_4way( &ctx_sha256, vhashA, 32 );
+   sha256_4way_close( &ctx_sha256, vhashA );
+
+   // sha512 64 bit data, 64 byte output
+   mm256_reinterleave_4x64( vhashB, vhashA, 256 );
+   sha512_4way_init( &ctx_sha512 );
+   sha512_4way( &ctx_sha512, vhashB, 32 );
+   sha512_4way_close( &ctx_sha512, vhashB );
+   mm256_reinterleave_4x32( vhashA, vhashB, 512 );   // back to 32-bit interleave for ripemd/sha256
+
+   ripemd160_4way_init( &ctx_ripemd );
+   ripemd160_4way( &ctx_ripemd, vhashA, 32 );          // first 32 bytes of each lane
+   ripemd160_4way_close( &ctx_ripemd, vhashB );
+
+   ripemd160_4way_init( &ctx_ripemd );
+   ripemd160_4way( &ctx_ripemd, vhashA+(8<<2), 32 );   // next 32 bytes: 8 words x 4 lanes further on
+   ripemd160_4way_close( &ctx_ripemd, vhashC );
+
+   sha256_4way_init( &ctx_sha256 );
+   sha256_4way( &ctx_sha256, vhashB, 20 );             // both ripemd results go into one sha256
+   sha256_4way( &ctx_sha256, vhashC, 20 );
+   sha256_4way_close( &ctx_sha256, vhashA );
+
+   sha256_4way_init( &ctx_sha256 );
+   sha256_4way( &ctx_sha256, vhashA, 32 );
+   sha256_4way_close( &ctx_sha256, vhashA );
+
+   mm_deinterleave_4x32( output, output+32, output+64, output+96, vhashA, 256 );  // lane i at byte offset 32*i
+}
+
+int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done)
+{  // tests nonces four per iteration; returns the number stored in work->nonces
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[32*4] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[27];
+   const uint32_t first_nonce = pdata[27];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t edata[32] __attribute__ ((aligned (64)));
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 108; // 27*4
+
+   uint64_t htmax[] = {          0,        0xF,       0xFF,
+                             0xFFF,     0xFFFF, 0x10000000 };
+   uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                        0xFFFFF000, 0xFFFF0000,          0 };
+
+   // we need bigendian data...
+   swab32_array( edata, pdata, 32 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
+   sha256_4way_init( &sha256_mid );
+   sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );  // midstate reused by every lbry_4way_hash call
+
+   for ( int m = 0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+   {  // bound is the 6 table entries; sizeof(masks) is a byte count and overran both tables
+      uint32_t mask = masks[m];
+      do
+      {
+         be32enc( noncep,   n   );
+         be32enc( noncep+1, n+1 );
+         be32enc( noncep+2, n+2 );
+         be32enc( noncep+3, n+3 );
+
+         lbry_4way_hash( hash, vdata );
+
+         for ( int i = 0; i < 4; i++ )   // lane i's 8-word result starts at hash + 8*i
+         if ( !( (hash+(i<<3))[7] & mask ) && fulltest( hash+(i<<3), ptarget ) )
+         {
+            pdata[27] = n+i;
+            nonces[ num_found++ ] = n+i;
+            work_set_target_ratio( work, hash+(i<<3) );
+         }
+         n+=4;
+      } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+      break;   // only the first matching table entry is used
+   }
+
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+
+#endif
--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -0,0 +1,136 @@
+#include "lbry-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+double lbry_calc_network_diff( struct work *work )
+{  // derives a difficulty value from the header word at LBRY_NBITS_INDEX
+        // sample for diff 43.281 : 1c05ea29
+        // todo: endian reversed on longpoll could be zr5 specific...
+
+   uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
+   uint32_t bits = (nbits & 0xffffff);   // low 24 bits of the swapped word
+   int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
+   double d = (double)0x0000ffff / (double)bits;  // NOTE(review): bits == 0 is not guarded
+
+   for (int m=shift; m < 29; m++) d *= 256.0;   // x256 for each step shift is below 29
+   for (int m=29; m < shift; m++) d /= 256.0;   // /256 for each step shift is above 29
+   if (opt_debug_diff)
+      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
+
+   return d;
+}
+
+// std_le should work but it doesn't
+void lbry_le_build_stratum_request( char *req, struct work *work,
+                                      struct stratum_ctx *sctx )
+{  // formats a mining.submit JSON request into req; sctx is not used here
+   unsigned char *xnonce2str;
+   uint32_t ntime, nonce;
+   char ntimestr[9], noncestr[9];   // 8 hex digits plus terminator
+
+   le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
+   le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
+   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
+   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
+   xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len);  // released by free() below
+   snprintf( req, JSON_BUF_LEN,
+        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
+         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
+   free(xnonce2str);
+}
+
+// Don't use lbry_build_block_header: it can't handle the claim, so the header
+// is built inline in lbry_build_extraheader. Side effect: no gbt support for lbry.
+void lbry_build_block_header( struct work* g_work, uint32_t version,
+                             uint32_t *prevhash, uint32_t *merkle_root,
+                             uint32_t ntime, uint32_t nbits )
+{
+   int i;
+   memset( g_work->data, 0, sizeof(g_work->data) );
+   g_work->data[0] =  version;
+
+   if ( have_stratum )
+      for ( i = 0; i < 8; i++ )
+         g_work->data[1 + i] = le32dec( prevhash + i );
+   else
+      for (i = 0; i < 8; i++)
+         g_work->data[ 8-i ] = le32dec( prevhash + i );   // word order reversed into data[8..1]
+
+   for ( i = 0; i < 8; i++ )
+      g_work->data[9 + i] = be32dec( merkle_root + i );
+
+//   data[17..24] stays zero here (memset above); lbry_build_extraheader
+//   stores the claim in those words. This function has no claim parameter.
+
+   g_work->data[ LBRY_NTIME_INDEX ] = ntime;
+   g_work->data[ LBRY_NBITS_INDEX ] = nbits;
+   g_work->data[28] = 0x80000000;
+}
+
+void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
+{  // builds g_work->data from the stratum job in sctx, claim words included
+   unsigned char merkle_root[64] = { 0 };
+   size_t t;
+   int i;
+
+   algo_gate.gen_merkle_root( merkle_root, sctx );
+   // Increment extranonce2
+   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
+   // Assemble block header
+
+   // The header is assembled inline rather than through
+   // algo_gate.build_block_header(), which has no way to place the
+   // claim words (data[17..24]).
+
+   memset( g_work->data, 0, sizeof(g_work->data) );
+   g_work->data[0] = le32dec( sctx->job.version );
+
+   for ( i = 0; i < 8; i++ )
+      g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
+
+   for ( i = 0; i < 8; i++ )
+      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
+
+   for ( i = 0; i < 8; i++ )   // reuse the function-scope i instead of shadowing it
+        g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
+
+   g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
+   g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
+   g_work->data[28] = 0x80000000;
+}
+
+void lbry_set_target( struct work* work, double job_diff )
+{  // passes job_diff, divided by 256 * opt_diff_factor, to work_set_target
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+int64_t lbry_get_max64() { return 0x1ffffLL; }  // constant installed as gate->get_max64 by register_lbry_algo
+
+bool register_lbry_algo( algo_gate_t* gate )
+{  // installs the lbry entry points and work-layout indices into gate; always returns true
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
+#if defined (LBRY_8WAY)
+  gate->scanhash              = (void*)&scanhash_lbry_8way;
+  gate->hash                  = (void*)&lbry_8way_hash;
+#elif defined (LBRY_4WAY)
+  gate->scanhash              = (void*)&scanhash_lbry_4way;
+  gate->hash                  = (void*)&lbry_4way_hash;
+#else 
+  gate->scanhash              = (void*)&scanhash_lbry;
+  gate->hash                  = (void*)&lbry_hash;
+#endif
+  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
+  gate->get_max64             = (void*)&lbry_get_max64;
+  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
+//  build_block_header is not overridden; the header is built in lbry_build_extraheader
+  gate->build_extraheader     = (void*)&lbry_build_extraheader;
+  gate->set_target            = (void*)&lbry_set_target;
+  gate->ntime_index           = LBRY_NTIME_INDEX;
+  gate->nbits_index           = LBRY_NBITS_INDEX;
+  gate->nonce_index           = LBRY_NONCE_INDEX;
+  gate->work_data_size        = LBRY_WORK_DATA_SIZE;
+  return true;
+}
+
--- a/algo/ripemd/lbry-gate.h
+++ b/algo/ripemd/lbry-gate.h
@@ -0,0 +1,37 @@
+#ifndef LBRY_GATE_H__
+#define LBRY_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// need sha512 2 way AVX x2 or 1 way scalar x4 to support 4way AVX.
+#if defined(__AVX2__)
+  #define LBRY_8WAY
+#endif
+
+#define LBRY_NTIME_INDEX 25
+#define LBRY_NBITS_INDEX 26
+#define LBRY_NONCE_INDEX 27
+#define LBRY_WORK_DATA_SIZE 192
+#define LBRY_WORK_CMP_SIZE 76  // same as default
+
+bool register_lbry_algo( algo_gate_t* gate );
+
+#if defined(LBRY_8WAY)
+
+void lbry_8way_hash( void *state, const void *input );
+int scanhash_lbry_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#elif defined(LBRY_4WAY)
+
+void lbry_4way_hash( void *state, const void *input );
+int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#else
+
+void lbry_hash( void *state, const void *input );
+int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+#endif
+#endif
--- a/algo/ripemd/lbry.c
+++ b/algo/ripemd/lbry.c
@@ -1,19 +1,12 @@
-#include "algo-gate-api.h"
+#include "lbry-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-#include "ripemd/sph_ripemd.h"
-#include "sha/sph_sha2.h"
+#include "sph_ripemd.h"
+#include "algo/sha/sph_sha2.h"
 #include <openssl/sha.h>

-#define LBRY_NTIME_INDEX 25
-#define LBRY_NBITS_INDEX 26
-#define LBRY_NONCE_INDEX 27
-#define LBRY_WORK_DATA_SIZE 192
-#define LBRY_WORK_CMP_SIZE 76  // same as default
-
-
 void lbry_hash(void* output, const void* input)
 {
 #ifndef USE_SPH_SHA
@@ -151,88 +144,3 @@ int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
 	pdata[27] = n;
 	return 0;
 }
-
-double lbry_calc_network_diff( struct work *work )
-{
-        // sample for diff 43.281 : 1c05ea29
-        // todo: endian reversed on longpoll could be zr5 specific...
-
-   uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
-   uint32_t bits = (nbits & 0xffffff);
-   int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
-   double d = (double)0x0000ffff / (double)bits;
-
-   for (int m=shift; m < 29; m++) d *= 256.0;
-   for (int m=29; m < shift; m++) d /= 256.0;
-   if (opt_debug_diff)
-      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
-
-   return d;
-}
-
-// std_le should work but it doesn't
-void lbry_le_build_stratum_request( char *req, struct work *work,
-                                      struct stratum_ctx *sctx )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime, nonce;
-   char ntimestr[9], noncestr[9];
-
-   le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
-   le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len);
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free(xnonce2str);
-}
-
-void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-   unsigned char merkle_root[64] = { 0 };
-   size_t t;
-   int i;
-
-   algo_gate.gen_merkle_root( merkle_root, sctx );
-   // Increment extranonce2 
-   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
-   // Assemble block header 
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] = le32dec( sctx->job.version );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
-   for ( int i = 0; i < 8; i++ )
-        g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
-   g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
-   g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
-   g_work->data[28] = 0x80000000;
-}
-
-void lbry_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-int64_t lbry_get_max64() { return 0x1ffffLL; }
-
-bool register_lbry_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
-  gate->scanhash              = (void*)&scanhash_lbry;
-  gate->hash                  = (void*)&lbry_hash;
-  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
-  gate->get_max64             = (void*)&lbry_get_max64;
-  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
-  gate->build_extraheader     = (void*)&lbry_build_extraheader;
-  gate->set_target            = (void*)&lbry_set_target;
-  gate->ntime_index           = LBRY_NTIME_INDEX;
-  gate->nbits_index           = LBRY_NBITS_INDEX;
-  gate->nonce_index           = LBRY_NONCE_INDEX;
-  gate->work_data_size        = LBRY_WORK_DATA_SIZE;
-  return true;
-}
-
--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -0,0 +1,622 @@
+#include "ripemd-hash-4way.h"
+
+#if defined(__AVX__)
+
+#include <stddef.h>
+#include <string.h>
+
+static const uint32_t IV[5] =   // initial chaining words; the *_init functions broadcast them to every lane
+{ 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
+
+/*
+ * Round constants for RIPEMD-160: Kxy is added in the y-th 16-step group of line x (ROUNDx macros).
+ */
+#define K11  0x00000000   // line 1, group 1
+#define K12  0x5A827999   // line 1, group 2
+#define K13  0x6ED9EBA1   // line 1, group 3
+#define K14  0x8F1BBCDC   // line 1, group 4
+#define K15  0xA953FD4E   // line 1, group 5
+
+#define K21  0x50A28BE6   // line 2, group 1
+#define K22  0x5C4DD124   // line 2, group 2
+#define K23  0x6D703EF3   // line 2, group 3
+#define K24  0x7A6D76E9   // line 2, group 4
+#define K25  0x00000000   // line 2, group 5
+
+// RIPEMD-160 4 way
+// Mixing functions on whole 128-bit vectors. NOTE(review): mm_not is defined outside this file; presumably bitwise NOT.
+#define F1(x, y, z) \
+   _mm_xor_si128( _mm_xor_si128( x, y ), z )   // x ^ y ^ z
+
+#define F2(x, y, z) \
+   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )   // ((y ^ z) & x) ^ z
+
+#define F3(x, y, z) \
+   _mm_xor_si128( _mm_or_si128( x, mm_not( y ) ), z )   // (x | not(y)) ^ z
+
+#define F4(x, y, z) \
+   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )   // ((x ^ y) & z) ^ y
+
+#define F5(x, y, z) \
+   _mm_xor_si128( x, _mm_or_si128( y, mm_not( z ) ) )   // x ^ (y | not(z))
+// One step: a = rotl( a + f(b,c,d) + r + k, s ) + e, then c = rotl( c, 10 ); assumes mm_rotl_32 rotates each 32-bit lane left.
+#define RR(a, b, c, d, e, f, s, r, k) \
+do{ \
+   a = _mm_add_epi32( mm_rotl_32( _mm_add_epi32( _mm_add_epi32( \
+                _mm_add_epi32( a, f( b ,c, d ) ), r ), \
+                                 _mm_set1_epi32( k ) ), s ), e ); \
+   c = mm_rotl_32( c, 10 );\
+} while (0)
+// ROUND1 / ROUND2 apply RR to the variables suffixed 1 / 2 and paste k onto K1 / K2 to select the constant.
+#define ROUND1(a, b, c, d, e, f, s, r, k)  \
+	RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
+
+#define ROUND2(a, b, c, d, e, f, s, r, k)  \
+	RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
+// Compress the block held in sc->buf into the chaining value sc->val, for all four 32-bit lanes at once.
+static void ripemd160_4way_round( ripemd160_4way_context *sc )
+{
+   const __m128i *in = (__m128i*)sc->buf;   // 16 message words, one vector per word
+   __m128i *h  = (__m128i*)sc->val;         // 5 chaining words, updated in place at the end
+   register __m128i A1, B1, C1, D1, E1;     // working variables of line 1 (ROUND1)
+   register __m128i A2, B2, C2, D2, E2;     // working variables of line 2 (ROUND2)
+   __m128i tmp;
+   // Both lines start from the current chaining value.
+   A1 = A2 = h[0];
+   B1 = B2 = h[1];
+   C1 = C2 = h[2];
+   D1 = D2 = h[3];
+   E1 = E2 = h[4];
+   // Line 1, group 1: F1 with K11.
+   ROUND1( A, B, C, D, E, F1, 11, in[ 0], 1 );
+   ROUND1( E, A, B, C, D, F1, 14, in[ 1], 1 );
+   ROUND1( D, E, A, B, C, F1, 15, in[ 2], 1 );
+   ROUND1( C, D, E, A, B, F1, 12, in[ 3], 1 );
+   ROUND1( B, C, D, E, A, F1,  5, in[ 4], 1 );
+   ROUND1( A, B, C, D, E, F1,  8, in[ 5], 1 );
+   ROUND1( E, A, B, C, D, F1,  7, in[ 6], 1 );
+   ROUND1( D, E, A, B, C, F1,  9, in[ 7], 1 );
+   ROUND1( C, D, E, A, B, F1, 11, in[ 8], 1 );
+   ROUND1( B, C, D, E, A, F1, 13, in[ 9], 1 );
+   ROUND1( A, B, C, D, E, F1, 14, in[10], 1 );
+   ROUND1( E, A, B, C, D, F1, 15, in[11], 1 );
+   ROUND1( D, E, A, B, C, F1,  6, in[12], 1 );
+   ROUND1( C, D, E, A, B, F1,  7, in[13], 1 );
+   ROUND1( B, C, D, E, A, F1,  9, in[14], 1 );
+   ROUND1( A, B, C, D, E, F1,  8, in[15], 1 );
+   // Line 1, group 2: F2 with K12.
+   ROUND1( E, A, B, C, D, F2,  7, in[ 7], 2 );
+   ROUND1( D, E, A, B, C, F2,  6, in[ 4], 2 );
+   ROUND1( C, D, E, A, B, F2,  8, in[13], 2 );
+   ROUND1( B, C, D, E, A, F2, 13, in[ 1], 2 );
+   ROUND1( A, B, C, D, E, F2, 11, in[10], 2 );
+   ROUND1( E, A, B, C, D, F2,  9, in[ 6], 2 );
+   ROUND1( D, E, A, B, C, F2,  7, in[15], 2 );
+   ROUND1( C, D, E, A, B, F2, 15, in[ 3], 2 );
+   ROUND1( B, C, D, E, A, F2,  7, in[12], 2 );
+   ROUND1( A, B, C, D, E, F2, 12, in[ 0], 2 );
+   ROUND1( E, A, B, C, D, F2, 15, in[ 9], 2 );
+   ROUND1( D, E, A, B, C, F2,  9, in[ 5], 2 );
+   ROUND1( C, D, E, A, B, F2, 11, in[ 2], 2 );
+   ROUND1( B, C, D, E, A, F2,  7, in[14], 2 );
+   ROUND1( A, B, C, D, E, F2, 13, in[11], 2 );
+   ROUND1( E, A, B, C, D, F2, 12, in[ 8], 2 );
+   // Line 1, group 3: F3 with K13.
+   ROUND1( D, E, A, B, C, F3, 11, in[ 3], 3 );
+   ROUND1( C, D, E, A, B, F3, 13, in[10], 3 );
+   ROUND1( B, C, D, E, A, F3,  6, in[14], 3 );
+   ROUND1( A, B, C, D, E, F3,  7, in[ 4], 3 );
+   ROUND1( E, A, B, C, D, F3, 14, in[ 9], 3 );
+   ROUND1( D, E, A, B, C, F3,  9, in[15], 3 );
+   ROUND1( C, D, E, A, B, F3, 13, in[ 8], 3 );
+   ROUND1( B, C, D, E, A, F3, 15, in[ 1], 3 );
+   ROUND1( A, B, C, D, E, F3, 14, in[ 2], 3 );
+   ROUND1( E, A, B, C, D, F3,  8, in[ 7], 3 );
+   ROUND1( D, E, A, B, C, F3, 13, in[ 0], 3 );
+   ROUND1( C, D, E, A, B, F3,  6, in[ 6], 3 );
+   ROUND1( B, C, D, E, A, F3,  5, in[13], 3 );
+   ROUND1( A, B, C, D, E, F3, 12, in[11], 3 );
+   ROUND1( E, A, B, C, D, F3,  7, in[ 5], 3 );
+   ROUND1( D, E, A, B, C, F3,  5, in[12], 3 );
+   // Line 1, group 4: F4 with K14.
+   ROUND1( C, D, E, A, B, F4, 11, in[ 1], 4 );
+   ROUND1( B, C, D, E, A, F4, 12, in[ 9], 4 );
+   ROUND1( A, B, C, D, E, F4, 14, in[11], 4 );
+   ROUND1( E, A, B, C, D, F4, 15, in[10], 4 );
+   ROUND1( D, E, A, B, C, F4, 14, in[ 0], 4 );
+   ROUND1( C, D, E, A, B, F4, 15, in[ 8], 4 );
+   ROUND1( B, C, D, E, A, F4,  9, in[12], 4 );
+   ROUND1( A, B, C, D, E, F4,  8, in[ 4], 4 );
+   ROUND1( E, A, B, C, D, F4,  9, in[13], 4 );
+   ROUND1( D, E, A, B, C, F4, 14, in[ 3], 4 );
+   ROUND1( C, D, E, A, B, F4,  5, in[ 7], 4 );
+   ROUND1( B, C, D, E, A, F4,  6, in[15], 4 );
+   ROUND1( A, B, C, D, E, F4,  8, in[14], 4 );
+   ROUND1( E, A, B, C, D, F4,  6, in[ 5], 4 );
+   ROUND1( D, E, A, B, C, F4,  5, in[ 6], 4 );
+   ROUND1( C, D, E, A, B, F4, 12, in[ 2], 4 );
+   // Line 1, group 5: F5 with K15.
+   ROUND1( B, C, D, E, A, F5,  9, in[ 4], 5 );
+   ROUND1( A, B, C, D, E, F5, 15, in[ 0], 5 );
+   ROUND1( E, A, B, C, D, F5,  5, in[ 5], 5 );
+   ROUND1( D, E, A, B, C, F5, 11, in[ 9], 5 );
+   ROUND1( C, D, E, A, B, F5,  6, in[ 7], 5 );
+   ROUND1( B, C, D, E, A, F5,  8, in[12], 5 );
+   ROUND1( A, B, C, D, E, F5, 13, in[ 2], 5 );
+   ROUND1( E, A, B, C, D, F5, 12, in[10], 5 );
+   ROUND1( D, E, A, B, C, F5,  5, in[14], 5 );
+   ROUND1( C, D, E, A, B, F5, 12, in[ 1], 5 );
+   ROUND1( B, C, D, E, A, F5, 13, in[ 3], 5 );
+   ROUND1( A, B, C, D, E, F5, 14, in[ 8], 5 );
+   ROUND1( E, A, B, C, D, F5, 11, in[11], 5 );
+   ROUND1( D, E, A, B, C, F5,  8, in[ 6], 5 );
+   ROUND1( C, D, E, A, B, F5,  5, in[15], 5 );
+   ROUND1( B, C, D, E, A, F5,  6, in[13], 5 );
+   // Line 2, group 1: F5 with K21 (line 2 uses the functions in reverse order).
+   ROUND2( A, B, C, D, E, F5,  8, in[ 5], 1 );
+   ROUND2( E, A, B, C, D, F5,  9, in[14], 1 );
+   ROUND2( D, E, A, B, C, F5,  9, in[ 7], 1 );
+   ROUND2( C, D, E, A, B, F5, 11, in[ 0], 1 );
+   ROUND2( B, C, D, E, A, F5, 13, in[ 9], 1 );
+   ROUND2( A, B, C, D, E, F5, 15, in[ 2], 1 );
+   ROUND2( E, A, B, C, D, F5, 15, in[11], 1 );
+   ROUND2( D, E, A, B, C, F5,  5, in[ 4], 1 );
+   ROUND2( C, D, E, A, B, F5,  7, in[13], 1 );
+   ROUND2( B, C, D, E, A, F5,  7, in[ 6], 1 );
+   ROUND2( A, B, C, D, E, F5,  8, in[15], 1 );
+   ROUND2( E, A, B, C, D, F5, 11, in[ 8], 1 );
+   ROUND2( D, E, A, B, C, F5, 14, in[ 1], 1 );
+   ROUND2( C, D, E, A, B, F5, 14, in[10], 1 );
+   ROUND2( B, C, D, E, A, F5, 12, in[ 3], 1 );
+   ROUND2( A, B, C, D, E, F5,  6, in[12], 1 );
+   // Line 2, group 2: F4 with K22.
+   ROUND2( E, A, B, C, D, F4,  9, in[ 6], 2 );
+   ROUND2( D, E, A, B, C, F4, 13, in[11], 2 );
+   ROUND2( C, D, E, A, B, F4, 15, in[ 3], 2 );
+   ROUND2( B, C, D, E, A, F4,  7, in[ 7], 2 );
+   ROUND2( A, B, C, D, E, F4, 12, in[ 0], 2 );
+   ROUND2( E, A, B, C, D, F4,  8, in[13], 2 );
+   ROUND2( D, E, A, B, C, F4,  9, in[ 5], 2 );
+   ROUND2( C, D, E, A, B, F4, 11, in[10], 2 );
+   ROUND2( B, C, D, E, A, F4,  7, in[14], 2 );
+   ROUND2( A, B, C, D, E, F4,  7, in[15], 2 );
+   ROUND2( E, A, B, C, D, F4, 12, in[ 8], 2 );
+   ROUND2( D, E, A, B, C, F4,  7, in[12], 2 );
+   ROUND2( C, D, E, A, B, F4,  6, in[ 4], 2 );
+   ROUND2( B, C, D, E, A, F4, 15, in[ 9], 2 );
+   ROUND2( A, B, C, D, E, F4, 13, in[ 1], 2 );
+   ROUND2( E, A, B, C, D, F4, 11, in[ 2], 2 );
+   // Line 2, group 3: F3 with K23.
+   ROUND2( D, E, A, B, C, F3,  9, in[15], 3 );
+   ROUND2( C, D, E, A, B, F3,  7, in[ 5], 3 );
+   ROUND2( B, C, D, E, A, F3, 15, in[ 1], 3 );
+   ROUND2( A, B, C, D, E, F3, 11, in[ 3], 3 );
+   ROUND2( E, A, B, C, D, F3,  8, in[ 7], 3 );
+   ROUND2( D, E, A, B, C, F3,  6, in[14], 3 );
+   ROUND2( C, D, E, A, B, F3,  6, in[ 6], 3 );
+   ROUND2( B, C, D, E, A, F3, 14, in[ 9], 3 );
+   ROUND2( A, B, C, D, E, F3, 12, in[11], 3 );
+   ROUND2( E, A, B, C, D, F3, 13, in[ 8], 3 );
+   ROUND2( D, E, A, B, C, F3,  5, in[12], 3 );
+   ROUND2( C, D, E, A, B, F3, 14, in[ 2], 3 );
+   ROUND2( B, C, D, E, A, F3, 13, in[10], 3 );
+   ROUND2( A, B, C, D, E, F3, 13, in[ 0], 3 );
+   ROUND2( E, A, B, C, D, F3,  7, in[ 4], 3 );
+   ROUND2( D, E, A, B, C, F3,  5, in[13], 3 );
+   // Line 2, group 4: F2 with K24.
+   ROUND2( C, D, E, A, B, F2, 15, in[ 8], 4 );
+   ROUND2( B, C, D, E, A, F2,  5, in[ 6], 4 );
+   ROUND2( A, B, C, D, E, F2,  8, in[ 4], 4 );
+   ROUND2( E, A, B, C, D, F2, 11, in[ 1], 4 );
+   ROUND2( D, E, A, B, C, F2, 14, in[ 3], 4 );
+   ROUND2( C, D, E, A, B, F2, 14, in[11], 4 );
+   ROUND2( B, C, D, E, A, F2,  6, in[15], 4 );
+   ROUND2( A, B, C, D, E, F2, 14, in[ 0], 4 );
+   ROUND2( E, A, B, C, D, F2,  6, in[ 5], 4 );
+   ROUND2( D, E, A, B, C, F2,  9, in[12], 4 );
+   ROUND2( C, D, E, A, B, F2, 12, in[ 2], 4 );
+   ROUND2( B, C, D, E, A, F2,  9, in[13], 4 );
+   ROUND2( A, B, C, D, E, F2, 12, in[ 9], 4 );
+   ROUND2( E, A, B, C, D, F2,  5, in[ 7], 4 );
+   ROUND2( D, E, A, B, C, F2, 15, in[10], 4 );
+   ROUND2( C, D, E, A, B, F2,  8, in[14], 4 );
+   // Line 2, group 5: F1 with K25.
+   ROUND2( B, C, D, E, A, F1,  8, in[12], 5 );
+   ROUND2( A, B, C, D, E, F1,  5, in[15], 5 );
+   ROUND2( E, A, B, C, D, F1, 12, in[10], 5 );
+   ROUND2( D, E, A, B, C, F1,  9, in[ 4], 5 );
+   ROUND2( C, D, E, A, B, F1, 12, in[ 1], 5 );
+   ROUND2( B, C, D, E, A, F1,  5, in[ 5], 5 );
+   ROUND2( A, B, C, D, E, F1, 14, in[ 8], 5 );
+   ROUND2( E, A, B, C, D, F1,  6, in[ 7], 5 );
+   ROUND2( D, E, A, B, C, F1,  8, in[ 6], 5 );
+   ROUND2( C, D, E, A, B, F1, 13, in[ 2], 5 );
+   ROUND2( B, C, D, E, A, F1,  6, in[13], 5 );
+   ROUND2( A, B, C, D, E, F1,  5, in[14], 5 );
+   ROUND2( E, A, B, C, D, F1, 15, in[ 0], 5 );
+   ROUND2( D, E, A, B, C, F1, 13, in[ 3], 5 );
+   ROUND2( C, D, E, A, B, F1, 11, in[ 9], 5 );
+   ROUND2( B, C, D, E, A, F1, 11, in[11], 5 );
+   // Add both lines to the old chaining value; each sum lands one word earlier than the h[] word it reads.
+   tmp =  _mm_add_epi32( _mm_add_epi32( h[1], C1 ), D2 );   // held back because h[0] is still read below
+   h[1] = _mm_add_epi32( _mm_add_epi32( h[2], D1 ), E2 );
+   h[2] = _mm_add_epi32( _mm_add_epi32( h[3], E1 ), A2 );
+   h[3] = _mm_add_epi32( _mm_add_epi32( h[4], A1 ), B2 );
+   h[4] = _mm_add_epi32( _mm_add_epi32( h[0], B1 ), C2 );
+   h[0] = tmp;
+}
+// Reset a 4-way context: every lane gets the same initial chaining value and a zero byte count.
+void ripemd160_4way_init( ripemd160_4way_context *sc )
+{
+   sc->val[0] = _mm_set1_epi32( IV[0] );   // broadcast each IV word to all four 32-bit lanes
+   sc->val[1] = _mm_set1_epi32( IV[1] );
+   sc->val[2] = _mm_set1_epi32( IV[2] );
+   sc->val[3] = _mm_set1_epi32( IV[3] );
+   sc->val[4] = _mm_set1_epi32( IV[4] );
+   sc->count_high = sc->count_low = 0;     // nothing absorbed yet
+}
+// Absorb len bytes per lane from data, which is read as one __m128i per 4 bytes of that length.
+void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len )
+{
+   __m128i *vdata = (__m128i*)data;
+   size_t ptr;                  // byte offset (per lane) of the next free position in sc->buf
+   const int block_size = 64;   // bytes per lane in one block
+   // NOTE(review): copy counts below use clen>>2, so a len that is not a multiple of 4 drops its remainder bytes.
+   ptr = (unsigned)sc->count_low & (block_size - 1U);   // resume where the previous call stopped
+   while ( len > 0 )
+   {
+      size_t clen;              // bytes per lane consumed in this pass
+      uint32_t clow, clow2;
+
+      clen = block_size - ptr;  // room left in the current block
+      if ( clen > len )
+         clen = len;
+      memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );   // presumably copies clen>>2 vectors -- helper defined elsewhere
+      vdata = vdata + (clen>>2);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == block_size )  // block complete: compress it and start a new one
+      {
+         ripemd160_4way_round( sc );
+         ptr = 0;
+      }
+      clow = sc->count_low;
+      clow2 = clow + clen;
+      sc->count_low = clow2;
+      if ( clow2 < clow )       // low half wrapped around: carry into the high half
+         sc->count_high++;
+   }
+}
+// Pad the pending data, append the bit length, run the final compression and store the 5 state vectors to dst.
+void ripemd160_4way_close( ripemd160_4way_context  *sc, void *dst )
+{
+   unsigned ptr, u;
+   uint32_t low, high;
+   const int block_size = 64;
+   const int pad = block_size - 8;   // byte offset where the 64-bit length field starts
+
+   ptr = (unsigned)sc->count_low & ( block_size - 1U);
+   sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );   // padding marker stored as a whole word, so ptr is taken to be a multiple of 4
+   ptr += 4;
+
+   if ( ptr > pad )   // no room left for the length: finish this block and pad a fresh one
+   {
+       memset_zero_128( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
+       ripemd160_4way_round( sc );
+       memset_zero_128( sc->buf, pad>>2 );
+   }
+   else
+       memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
+   // Convert the byte count to a 64-bit bit count (multiply by 8) and write it as two words, low word first.
+    low = sc->count_low;
+    high = (sc->count_high << 3) | (low >> 29);   // top 3 bits of low carry into high
+    low = low << 3;
+    sc->buf[  pad>>2      ] = _mm_set1_epi32( low  );
+    sc->buf[ (pad>>2) + 1 ] = _mm_set1_epi32( high );
+    ripemd160_4way_round( sc );
+    for (u = 0; u < 5; u ++)
+        casti_m128i( dst, u ) = sc->val[u];   // presumably indexes dst as an array of __m128i -- confirm in the project's macros
+}
+
+#endif
+
+#if defined(__AVX2__)
+
+// Ripemd-160 8 way
+// 256-bit counterparts of F1..F5. NOTE(review): mm256_not is defined outside this file; presumably bitwise NOT.
+#define F8W_1(x, y, z) \
+   _mm256_xor_si256( _mm256_xor_si256( x, y ), z )   // x ^ y ^ z
+
+#define F8W_2(x, y, z) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( y, z ), x ), z )   // ((y ^ z) & x) ^ z
+
+#define F8W_3(x, y, z) \
+   _mm256_xor_si256( _mm256_or_si256( x, mm256_not( y ) ), z )   // (x | not(y)) ^ z
+
+#define F8W_4(x, y, z) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( x, y ), z ), y )   // ((x ^ y) & z) ^ y
+
+#define F8W_5(x, y, z) \
+   _mm256_xor_si256( x, _mm256_or_si256( y, mm256_not( z ) ) )   // x ^ (y | not(z))
+// One step: a = rotl( a + f(b,c,d) + r + k, s ) + e, then c = rotl( c, 10 ); assumes mm256_rotl_32 rotates each 32-bit lane left.
+#define RR_8W(a, b, c, d, e, f, s, r, k) \
+do{ \
+   a = _mm256_add_epi32( mm256_rotl_32( _mm256_add_epi32( _mm256_add_epi32( \
+                _mm256_add_epi32( a, f( b ,c, d ) ), r ), \
+                                 _mm256_set1_epi32( k ) ), s ), e ); \
+   c = mm256_rotl_32( c, 10 );\
+} while (0)
+// ROUND1_8W / ROUND2_8W apply RR_8W to the variables suffixed 1 / 2 and paste k onto K1 / K2 to select the constant.
+#define ROUND1_8W(a, b, c, d, e, f, s, r, k)  \
+        RR_8W(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
+
+#define ROUND2_8W(a, b, c, d, e, f, s, r, k)  \
+        RR_8W(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
+// Compress the block held in sc->buf into the chaining value sc->val, for all eight 32-bit lanes at once.
+static void ripemd160_8way_round( ripemd160_8way_context *sc )
+{
+   const __m256i *in = (__m256i*)sc->buf;   // 16 message words, one vector per word
+   __m256i *h  = (__m256i*)sc->val;         // 5 chaining words, updated in place at the end
+   register __m256i A1, B1, C1, D1, E1;     // working variables of line 1 (ROUND1_8W)
+   register __m256i A2, B2, C2, D2, E2;     // working variables of line 2 (ROUND2_8W)
+   __m256i tmp;
+   // Both lines start from the current chaining value.
+   A1 = A2 = h[0];
+   B1 = B2 = h[1];
+   C1 = C2 = h[2];
+   D1 = D2 = h[3];
+   E1 = E2 = h[4];
+   // Line 1, group 1: F8W_1 with K11.
+   ROUND1_8W( A, B, C, D, E, F8W_1, 11, in[ 0], 1 );
+   ROUND1_8W( E, A, B, C, D, F8W_1, 14, in[ 1], 1 );
+   ROUND1_8W( D, E, A, B, C, F8W_1, 15, in[ 2], 1 );
+   ROUND1_8W( C, D, E, A, B, F8W_1, 12, in[ 3], 1 );
+   ROUND1_8W( B, C, D, E, A, F8W_1,  5, in[ 4], 1 );
+   ROUND1_8W( A, B, C, D, E, F8W_1,  8, in[ 5], 1 );
+   ROUND1_8W( E, A, B, C, D, F8W_1,  7, in[ 6], 1 );
+   ROUND1_8W( D, E, A, B, C, F8W_1,  9, in[ 7], 1 );
+   ROUND1_8W( C, D, E, A, B, F8W_1, 11, in[ 8], 1 );
+   ROUND1_8W( B, C, D, E, A, F8W_1, 13, in[ 9], 1 );
+   ROUND1_8W( A, B, C, D, E, F8W_1, 14, in[10], 1 );
+   ROUND1_8W( E, A, B, C, D, F8W_1, 15, in[11], 1 );
+   ROUND1_8W( D, E, A, B, C, F8W_1,  6, in[12], 1 );
+   ROUND1_8W( C, D, E, A, B, F8W_1,  7, in[13], 1 );
+   ROUND1_8W( B, C, D, E, A, F8W_1,  9, in[14], 1 );
+   ROUND1_8W( A, B, C, D, E, F8W_1,  8, in[15], 1 );
+   // Line 1, group 2: F8W_2 with K12.
+   ROUND1_8W( E, A, B, C, D, F8W_2,  7, in[ 7], 2 );
+   ROUND1_8W( D, E, A, B, C, F8W_2,  6, in[ 4], 2 );
+   ROUND1_8W( C, D, E, A, B, F8W_2,  8, in[13], 2 );
+   ROUND1_8W( B, C, D, E, A, F8W_2, 13, in[ 1], 2 );
+   ROUND1_8W( A, B, C, D, E, F8W_2, 11, in[10], 2 );
+   ROUND1_8W( E, A, B, C, D, F8W_2,  9, in[ 6], 2 );
+   ROUND1_8W( D, E, A, B, C, F8W_2,  7, in[15], 2 );
+   ROUND1_8W( C, D, E, A, B, F8W_2, 15, in[ 3], 2 );
+   ROUND1_8W( B, C, D, E, A, F8W_2,  7, in[12], 2 );
+   ROUND1_8W( A, B, C, D, E, F8W_2, 12, in[ 0], 2 );
+   ROUND1_8W( E, A, B, C, D, F8W_2, 15, in[ 9], 2 );
+   ROUND1_8W( D, E, A, B, C, F8W_2,  9, in[ 5], 2 );
+   ROUND1_8W( C, D, E, A, B, F8W_2, 11, in[ 2], 2 );
+   ROUND1_8W( B, C, D, E, A, F8W_2,  7, in[14], 2 );
+   ROUND1_8W( A, B, C, D, E, F8W_2, 13, in[11], 2 );
+   ROUND1_8W( E, A, B, C, D, F8W_2, 12, in[ 8], 2 );
+   // Line 1, group 3: F8W_3 with K13.
+   ROUND1_8W( D, E, A, B, C, F8W_3, 11, in[ 3], 3 );
+   ROUND1_8W( C, D, E, A, B, F8W_3, 13, in[10], 3 );
+   ROUND1_8W( B, C, D, E, A, F8W_3,  6, in[14], 3 );
+   ROUND1_8W( A, B, C, D, E, F8W_3,  7, in[ 4], 3 );
+   ROUND1_8W( E, A, B, C, D, F8W_3, 14, in[ 9], 3 );
+   ROUND1_8W( D, E, A, B, C, F8W_3,  9, in[15], 3 );
+   ROUND1_8W( C, D, E, A, B, F8W_3, 13, in[ 8], 3 );
+   ROUND1_8W( B, C, D, E, A, F8W_3, 15, in[ 1], 3 );
+   ROUND1_8W( A, B, C, D, E, F8W_3, 14, in[ 2], 3 );
+   ROUND1_8W( E, A, B, C, D, F8W_3,  8, in[ 7], 3 );
+   ROUND1_8W( D, E, A, B, C, F8W_3, 13, in[ 0], 3 );
+   ROUND1_8W( C, D, E, A, B, F8W_3,  6, in[ 6], 3 );
+   ROUND1_8W( B, C, D, E, A, F8W_3,  5, in[13], 3 );
+   ROUND1_8W( A, B, C, D, E, F8W_3, 12, in[11], 3 );
+   ROUND1_8W( E, A, B, C, D, F8W_3,  7, in[ 5], 3 );
+   ROUND1_8W( D, E, A, B, C, F8W_3,  5, in[12], 3 );
+   // Line 1, group 4: F8W_4 with K14.
+   ROUND1_8W( C, D, E, A, B, F8W_4, 11, in[ 1], 4 );
+   ROUND1_8W( B, C, D, E, A, F8W_4, 12, in[ 9], 4 );
+   ROUND1_8W( A, B, C, D, E, F8W_4, 14, in[11], 4 );
+   ROUND1_8W( E, A, B, C, D, F8W_4, 15, in[10], 4 );
+   ROUND1_8W( D, E, A, B, C, F8W_4, 14, in[ 0], 4 );
+   ROUND1_8W( C, D, E, A, B, F8W_4, 15, in[ 8], 4 );
+   ROUND1_8W( B, C, D, E, A, F8W_4,  9, in[12], 4 );
+   ROUND1_8W( A, B, C, D, E, F8W_4,  8, in[ 4], 4 );
+   ROUND1_8W( E, A, B, C, D, F8W_4,  9, in[13], 4 );
+   ROUND1_8W( D, E, A, B, C, F8W_4, 14, in[ 3], 4 );
+   ROUND1_8W( C, D, E, A, B, F8W_4,  5, in[ 7], 4 );
+   ROUND1_8W( B, C, D, E, A, F8W_4,  6, in[15], 4 );
+   ROUND1_8W( A, B, C, D, E, F8W_4,  8, in[14], 4 );
+   ROUND1_8W( E, A, B, C, D, F8W_4,  6, in[ 5], 4 );
+   ROUND1_8W( D, E, A, B, C, F8W_4,  5, in[ 6], 4 );
+   ROUND1_8W( C, D, E, A, B, F8W_4, 12, in[ 2], 4 );
+   // Line 1, group 5: F8W_5 with K15.
+   ROUND1_8W( B, C, D, E, A, F8W_5,  9, in[ 4], 5 );
+   ROUND1_8W( A, B, C, D, E, F8W_5, 15, in[ 0], 5 );
+   ROUND1_8W( E, A, B, C, D, F8W_5,  5, in[ 5], 5 );
+   ROUND1_8W( D, E, A, B, C, F8W_5, 11, in[ 9], 5 );
+   ROUND1_8W( C, D, E, A, B, F8W_5,  6, in[ 7], 5 );
+   ROUND1_8W( B, C, D, E, A, F8W_5,  8, in[12], 5 );
+   ROUND1_8W( A, B, C, D, E, F8W_5, 13, in[ 2], 5 );
+   ROUND1_8W( E, A, B, C, D, F8W_5, 12, in[10], 5 );
+   ROUND1_8W( D, E, A, B, C, F8W_5,  5, in[14], 5 );
+   ROUND1_8W( C, D, E, A, B, F8W_5, 12, in[ 1], 5 );
+   ROUND1_8W( B, C, D, E, A, F8W_5, 13, in[ 3], 5 );
+   ROUND1_8W( A, B, C, D, E, F8W_5, 14, in[ 8], 5 );
+   ROUND1_8W( E, A, B, C, D, F8W_5, 11, in[11], 5 );
+   ROUND1_8W( D, E, A, B, C, F8W_5,  8, in[ 6], 5 );
+   ROUND1_8W( C, D, E, A, B, F8W_5,  5, in[15], 5 );
+   ROUND1_8W( B, C, D, E, A, F8W_5,  6, in[13], 5 );
+   // Line 2, group 1: F8W_5 with K21 (line 2 uses the functions in reverse order).
+   ROUND2_8W( A, B, C, D, E, F8W_5,  8, in[ 5], 1 );
+   ROUND2_8W( E, A, B, C, D, F8W_5,  9, in[14], 1 );
+   ROUND2_8W( D, E, A, B, C, F8W_5,  9, in[ 7], 1 );
+   ROUND2_8W( C, D, E, A, B, F8W_5, 11, in[ 0], 1 );
+   ROUND2_8W( B, C, D, E, A, F8W_5, 13, in[ 9], 1 );
+   ROUND2_8W( A, B, C, D, E, F8W_5, 15, in[ 2], 1 );
+   ROUND2_8W( E, A, B, C, D, F8W_5, 15, in[11], 1 );
+   ROUND2_8W( D, E, A, B, C, F8W_5,  5, in[ 4], 1 );
+   ROUND2_8W( C, D, E, A, B, F8W_5,  7, in[13], 1 );
+   ROUND2_8W( B, C, D, E, A, F8W_5,  7, in[ 6], 1 );
+   ROUND2_8W( A, B, C, D, E, F8W_5,  8, in[15], 1 );
+   ROUND2_8W( E, A, B, C, D, F8W_5, 11, in[ 8], 1 );
+   ROUND2_8W( D, E, A, B, C, F8W_5, 14, in[ 1], 1 );
+   ROUND2_8W( C, D, E, A, B, F8W_5, 14, in[10], 1 );
+   ROUND2_8W( B, C, D, E, A, F8W_5, 12, in[ 3], 1 );
+   ROUND2_8W( A, B, C, D, E, F8W_5,  6, in[12], 1 );
+   // Line 2, group 2: F8W_4 with K22.
+   ROUND2_8W( E, A, B, C, D, F8W_4,  9, in[ 6], 2 );
+   ROUND2_8W( D, E, A, B, C, F8W_4, 13, in[11], 2 );
+   ROUND2_8W( C, D, E, A, B, F8W_4, 15, in[ 3], 2 );
+   ROUND2_8W( B, C, D, E, A, F8W_4,  7, in[ 7], 2 );
+   ROUND2_8W( A, B, C, D, E, F8W_4, 12, in[ 0], 2 );
+   ROUND2_8W( E, A, B, C, D, F8W_4,  8, in[13], 2 );
+   ROUND2_8W( D, E, A, B, C, F8W_4,  9, in[ 5], 2 );
+   ROUND2_8W( C, D, E, A, B, F8W_4, 11, in[10], 2 );
+   ROUND2_8W( B, C, D, E, A, F8W_4,  7, in[14], 2 );
+   ROUND2_8W( A, B, C, D, E, F8W_4,  7, in[15], 2 );
+   ROUND2_8W( E, A, B, C, D, F8W_4, 12, in[ 8], 2 );
+   ROUND2_8W( D, E, A, B, C, F8W_4,  7, in[12], 2 );
+   ROUND2_8W( C, D, E, A, B, F8W_4,  6, in[ 4], 2 );
+   ROUND2_8W( B, C, D, E, A, F8W_4, 15, in[ 9], 2 );
+   ROUND2_8W( A, B, C, D, E, F8W_4, 13, in[ 1], 2 );
+   ROUND2_8W( E, A, B, C, D, F8W_4, 11, in[ 2], 2 );
+   // Line 2, group 3: F8W_3 with K23.
+   ROUND2_8W( D, E, A, B, C, F8W_3,  9, in[15], 3 );
+   ROUND2_8W( C, D, E, A, B, F8W_3,  7, in[ 5], 3 );
+   ROUND2_8W( B, C, D, E, A, F8W_3, 15, in[ 1], 3 );
+   ROUND2_8W( A, B, C, D, E, F8W_3, 11, in[ 3], 3 );
+   ROUND2_8W( E, A, B, C, D, F8W_3,  8, in[ 7], 3 );
+   ROUND2_8W( D, E, A, B, C, F8W_3,  6, in[14], 3 );
+   ROUND2_8W( C, D, E, A, B, F8W_3,  6, in[ 6], 3 );
+   ROUND2_8W( B, C, D, E, A, F8W_3, 14, in[ 9], 3 );
+   ROUND2_8W( A, B, C, D, E, F8W_3, 12, in[11], 3 );
+   ROUND2_8W( E, A, B, C, D, F8W_3, 13, in[ 8], 3 );
+   ROUND2_8W( D, E, A, B, C, F8W_3,  5, in[12], 3 );
+   ROUND2_8W( C, D, E, A, B, F8W_3, 14, in[ 2], 3 );
+   ROUND2_8W( B, C, D, E, A, F8W_3, 13, in[10], 3 );
+   ROUND2_8W( A, B, C, D, E, F8W_3, 13, in[ 0], 3 );
+   ROUND2_8W( E, A, B, C, D, F8W_3,  7, in[ 4], 3 );
+   ROUND2_8W( D, E, A, B, C, F8W_3,  5, in[13], 3 );
+   // Line 2, group 4: F8W_2 with K24.
+   ROUND2_8W( C, D, E, A, B, F8W_2, 15, in[ 8], 4 );
+   ROUND2_8W( B, C, D, E, A, F8W_2,  5, in[ 6], 4 );
+   ROUND2_8W( A, B, C, D, E, F8W_2,  8, in[ 4], 4 );
+   ROUND2_8W( E, A, B, C, D, F8W_2, 11, in[ 1], 4 );
+   ROUND2_8W( D, E, A, B, C, F8W_2, 14, in[ 3], 4 );
+   ROUND2_8W( C, D, E, A, B, F8W_2, 14, in[11], 4 );
+   ROUND2_8W( B, C, D, E, A, F8W_2,  6, in[15], 4 );
+   ROUND2_8W( A, B, C, D, E, F8W_2, 14, in[ 0], 4 );
+   ROUND2_8W( E, A, B, C, D, F8W_2,  6, in[ 5], 4 );
+   ROUND2_8W( D, E, A, B, C, F8W_2,  9, in[12], 4 );
+   ROUND2_8W( C, D, E, A, B, F8W_2, 12, in[ 2], 4 );
+   ROUND2_8W( B, C, D, E, A, F8W_2,  9, in[13], 4 );
+   ROUND2_8W( A, B, C, D, E, F8W_2, 12, in[ 9], 4 );
+   ROUND2_8W( E, A, B, C, D, F8W_2,  5, in[ 7], 4 );
+   ROUND2_8W( D, E, A, B, C, F8W_2, 15, in[10], 4 );
+   ROUND2_8W( C, D, E, A, B, F8W_2,  8, in[14], 4 );
+   // Line 2, group 5: F8W_1 with K25.
+   ROUND2_8W( B, C, D, E, A, F8W_1,  8, in[12], 5 );
+   ROUND2_8W( A, B, C, D, E, F8W_1,  5, in[15], 5 );
+   ROUND2_8W( E, A, B, C, D, F8W_1, 12, in[10], 5 );
+   ROUND2_8W( D, E, A, B, C, F8W_1,  9, in[ 4], 5 );
+   ROUND2_8W( C, D, E, A, B, F8W_1, 12, in[ 1], 5 );
+   ROUND2_8W( B, C, D, E, A, F8W_1,  5, in[ 5], 5 );
+   ROUND2_8W( A, B, C, D, E, F8W_1, 14, in[ 8], 5 );
+   ROUND2_8W( E, A, B, C, D, F8W_1,  6, in[ 7], 5 );
+   ROUND2_8W( D, E, A, B, C, F8W_1,  8, in[ 6], 5 );
+   ROUND2_8W( C, D, E, A, B, F8W_1, 13, in[ 2], 5 );
+   ROUND2_8W( B, C, D, E, A, F8W_1,  6, in[13], 5 );
+   ROUND2_8W( A, B, C, D, E, F8W_1,  5, in[14], 5 );
+   ROUND2_8W( E, A, B, C, D, F8W_1, 15, in[ 0], 5 );
+   ROUND2_8W( D, E, A, B, C, F8W_1, 13, in[ 3], 5 );
+   ROUND2_8W( C, D, E, A, B, F8W_1, 11, in[ 9], 5 );
+   ROUND2_8W( B, C, D, E, A, F8W_1, 11, in[11], 5 );
+   // Add both lines to the old chaining value; each sum lands one word earlier than the h[] word it reads.
+   tmp =  _mm256_add_epi32( _mm256_add_epi32( h[1], C1 ), D2 );   // held back because h[0] is still read below
+   h[1] = _mm256_add_epi32( _mm256_add_epi32( h[2], D1 ), E2 );
+   h[2] = _mm256_add_epi32( _mm256_add_epi32( h[3], E1 ), A2 );
+   h[3] = _mm256_add_epi32( _mm256_add_epi32( h[4], A1 ), B2 );
+   h[4] = _mm256_add_epi32( _mm256_add_epi32( h[0], B1 ), C2 );
+   h[0] = tmp;
+}
+
+// Reset an 8-way context: every lane gets the same initial chaining value and a zero byte count.
+void ripemd160_8way_init( ripemd160_8way_context *sc )
+{
+   sc->val[0] = _mm256_set1_epi32( IV[0] );   // broadcast each IV word to all eight 32-bit lanes
+   sc->val[1] = _mm256_set1_epi32( IV[1] );
+   sc->val[2] = _mm256_set1_epi32( IV[2] );
+   sc->val[3] = _mm256_set1_epi32( IV[3] );
+   sc->val[4] = _mm256_set1_epi32( IV[4] );
+   sc->count_high = sc->count_low = 0;        // nothing absorbed yet
+}
+// Absorb len bytes per lane from data, which is read as one __m256i per 4 bytes of that length.
+void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len )
+{
+   __m256i *vdata = (__m256i*)data;
+   size_t ptr;                  // byte offset (per lane) of the next free position in sc->buf
+   const int block_size = 64;   // bytes per lane in one block
+   // NOTE(review): copy counts below use clen>>2, so a len that is not a multiple of 4 drops its remainder bytes.
+   ptr = (unsigned)sc->count_low & (block_size - 1U);   // resume where the previous call stopped
+   while ( len > 0 )
+   {
+      size_t clen;              // bytes per lane consumed in this pass
+      uint32_t clow, clow2;
+
+      clen = block_size - ptr;  // room left in the current block
+      if ( clen > len )
+         clen = len;
+      memcpy_256( sc->buf + (ptr>>2), vdata, clen>>2 );   // presumably copies clen>>2 vectors -- helper defined elsewhere
+      vdata = vdata + (clen>>2);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == block_size )  // block complete: compress it and start a new one
+      {
+         ripemd160_8way_round( sc );
+         ptr = 0;
+      }
+      clow = sc->count_low;
+      clow2 = clow + clen;
+      sc->count_low = clow2;
+      if ( clow2 < clow )       // low half wrapped around: carry into the high half
+         sc->count_high++;
+   }
+}
+// Pad the pending data, append the bit length, run the final compression and store the 5 state vectors to dst.
+void ripemd160_8way_close( ripemd160_8way_context  *sc, void *dst )
+{
+   unsigned ptr, u;
+   uint32_t low, high;
+   const int block_size = 64;
+   const int pad = block_size - 8;   // byte offset where the 64-bit length field starts
+
+   ptr = (unsigned)sc->count_low & ( block_size - 1U);
+   sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );   // padding marker stored as a whole word, so ptr is taken to be a multiple of 4
+   ptr += 4;
+
+   if ( ptr > pad )   // no room left for the length: finish this block and pad a fresh one
+   {
+       memset_zero_256( sc->buf + (ptr>>2), (block_size - ptr) >> 2 );
+       ripemd160_8way_round( sc );
+       memset_zero_256( sc->buf, pad>>2 );
+   }
+   else
+       memset_zero_256( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
+   // Convert the byte count to a 64-bit bit count (multiply by 8) and write it as two words, low word first.
+    low = sc->count_low;
+    high = (sc->count_high << 3) | (low >> 29);   // top 3 bits of low carry into high
+    low = low << 3;
+    sc->buf[  pad>>2      ] = _mm256_set1_epi32( low  );
+    sc->buf[ (pad>>2) + 1 ] = _mm256_set1_epi32( high );
+    ripemd160_8way_round( sc );
+    for (u = 0; u < 5; u ++)
+        casti_m256i( dst, u ) = sc->val[u];   // presumably indexes dst as an array of __m256i -- confirm in the project's macros
+}
+
+#endif // __AVX2__
+
--- a/algo/ripemd/ripemd-hash-4way.h
+++ b/algo/ripemd/ripemd-hash-4way.h
@@ -0,0 +1,38 @@
+#ifndef RIPEMD_HASH_4WAY_H__
+#define RIPEMD_HASH_4WAY_H__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+
+#if defined(__AVX__)
+
+#include "avxdefs.h"
+// Context for RIPEMD-160 computed on four 32-bit lanes of a __m128i in parallel.
+typedef struct
+{
+   __m128i buf[64>>2];               // pending block: 16 vectors, one per 32-bit message word
+   __m128i val[5];                   // chaining value: one vector per state word
+   uint32_t count_high, count_low;   // two halves of the per-lane count of bytes absorbed
+} __attribute__ ((aligned (64))) ripemd160_4way_context;
+
+void ripemd160_4way_init( ripemd160_4way_context *sc );
+void ripemd160_4way( ripemd160_4way_context *sc, const void *data, size_t len );
+void ripemd160_4way_close( ripemd160_4way_context *sc, void *dst );
+
+#if defined (__AVX2__)
+// Context for RIPEMD-160 computed on eight 32-bit lanes of a __m256i in parallel.
+typedef struct
+{
+   __m256i buf[64>>2];               // pending block: 16 vectors, one per 32-bit message word
+   __m256i val[5];                   // chaining value: one vector per state word
+   uint32_t count_high, count_low;   // two halves of the per-lane count of bytes absorbed
+} __attribute__ ((aligned (64))) ripemd160_8way_context;
+
+void ripemd160_8way_init( ripemd160_8way_context *sc );
+void ripemd160_8way( ripemd160_8way_context *sc, const void *data, size_t len );
+void ripemd160_8way_close( ripemd160_8way_context *sc, void *dst );
+
+
+#endif // __AVX2__
+#endif // __AVX__
+#endif // RIPEMD_HASH_4WAY_H__
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -778,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )

 bool register_scrypt_algo( algo_gate_t* gate )
 {
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
  gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
  gate->scanhash         = (void*)&scanhash_scrypt;
 //  gate->hash             = (void*)&scrypt_1024_1_1_256_24way;
--- a/algo/sha/md-helper-4way.c
+++ b/algo/sha/md-helper-4way.c
@@ -215,18 +215,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, 	unsigned ub, unsigned n,
 #if defined BE64
 #if defined PLW1
    sc->buf[ SPH_MAXPAD>>3 ] =
-                 mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+                 mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
 #elif defined PLW4
    memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
    sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-                mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+                mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
    sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-                mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+                mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
 #else
    sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
-               mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+               mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
    sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
-               mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+               mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
 #endif  // PLW
 #else  // LE64
 #if defined PLW1
@@ -255,7 +255,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, 	unsigned ub, unsigned n,
    for ( u = 0; u < rnum; u ++ )
    {
 #if defined BE64
-       ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
+       ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
 #else  // LE64
       ((__m256i*)dst)[u] = sc->val[u];
 #endif
--- a/algo/sha/sha2-big-4way.c
+++ b/algo/sha/sha2-big-4way.c
@@ -1,247 +0,0 @@
-/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
-/*
- * SHA-384 / SHA-512 implementation.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#include <stddef.h>
-#include <string.h>
-
-#include "sph_sha2.h"
-
-#if SPH_64
-
-#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
-#define MAJ(X, Y, Z)   (((X) & (Y)) | (((X) | (Y)) & (Z)))
-
-#define ROTR64    SPH_ROTR64
-
-#define BSG5_0(x)      (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
-#define BSG5_1(x)      (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
-#define SSG5_0(x)      (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
-#define SSG5_1(x)      (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
-
-static const sph_u64 K512[80] = {
-	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
-	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
-	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
-	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
-	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
-	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
-	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
-	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
-	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
-	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
-	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
-	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
-	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
-	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
-	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
-	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
-	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
-	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
-	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
-	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
-	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
-	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
-	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
-	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
-	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
-	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
-	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
-	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
-	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
-	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
-	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
-	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
-	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
-	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
-	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
-	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
-	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
-	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
-	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
-	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
-};
-
-static const sph_u64 H384[8] = {
-	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
-	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
-	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
-	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
-};
-
-static const sph_u64 H512[8] = {
-	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
-	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
-	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
-	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
-};
-
-/*
- * This macro defines the body for a SHA-384 / SHA-512 compression function
- * implementation. The "in" parameter should evaluate, when applied to a
- * numerical input parameter from 0 to 15, to an expression which yields
- * the corresponding input block. The "r" parameter should evaluate to
- * an array or pointer expression designating the array of 8 words which
- * contains the input and output of the compression function.
- *
- * SHA-512 is hard for the compiler. If the loop is completely unrolled,
- * then the code will be quite huge (possibly more than 100 kB), and the
- * performance will be degraded due to cache misses on the code. We
- * unroll only eight steps, which avoids all needless copies when
- * 64-bit registers are swapped.
- */
-
-#define SHA3_STEP(A, B, C, D, E, F, G, H, i)   do { \
-		sph_u64 T1, T2; \
-		T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \
-		T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \
-		D = SPH_T64(D + T1); \
-		H = SPH_T64(T1 + T2); \
-	} while (0)
-
-#define SHA3_ROUND_BODY(in, r)   do { \
-		int i; \
-		sph_u64 A, B, C, D, E, F, G, H; \
-		sph_u64 W[80]; \
- \
- 		for (i = 0; i < 16; i ++) \
-			W[i] = in(i); \
-		for (i = 16; i < 80; i ++) \
- 			W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \
-				+ SSG5_0(W[i - 15]) + W[i - 16]); \
-		A = (r)[0]; \
-		B = (r)[1]; \
-		C = (r)[2]; \
-		D = (r)[3]; \
-		E = (r)[4]; \
-		F = (r)[5]; \
-		G = (r)[6]; \
-		H = (r)[7]; \
-		for (i = 0; i < 80; i += 8) { \
-			SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \
-			SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \
-			SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \
-			SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \
-			SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \
-			SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \
-			SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \
-			SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \
-		} \
-		(r)[0] = SPH_T64((r)[0] + A); \
-		(r)[1] = SPH_T64((r)[1] + B); \
-		(r)[2] = SPH_T64((r)[2] + C); \
-		(r)[3] = SPH_T64((r)[3] + D); \
-		(r)[4] = SPH_T64((r)[4] + E); \
-		(r)[5] = SPH_T64((r)[5] + F); \
-		(r)[6] = SPH_T64((r)[6] + G); \
-		(r)[7] = SPH_T64((r)[7] + H); \
-	} while (0)
-
-/*
- * One round of SHA-384 / SHA-512. The data must be aligned for 64-bit access.
- */
-static void
-sha3_round(const unsigned char *data, sph_u64 r[8])
-{
-#define SHA3_IN(x)   sph_dec64be_aligned(data + (8 * (x)))
-	SHA3_ROUND_BODY(SHA3_IN, r);
-#undef SHA3_IN
-}
-
-/* see sph_sha3.h */
-void
-sph_sha384_init(void *cc)
-{
-	sph_sha384_context *sc;
-
-	sc = cc;
-	memcpy(sc->val, H384, sizeof H384);
-	sc->count = 0;
-}
-
-/* see sph_sha3.h */
-void
-sph_sha512_init(void *cc)
-{
-	sph_sha512_context *sc;
-
-	sc = cc;
-	memcpy(sc->val, H512, sizeof H512);
-	sc->count = 0;
-}
-
-#define RFUN   sha3_round
-#define HASH   sha384
-#define BE64   1
-#include "md_helper.c"
-
-/* see sph_sha3.h */
-void
-sph_sha384_close(void *cc, void *dst)
-{
-	sha384_close(cc, dst, 6);
-//	sph_sha384_init(cc);
-}
-
-/* see sph_sha3.h */
-void
-sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-	sha384_addbits_and_close(cc, ub, n, dst, 6);
-//	sph_sha384_init(cc);
-}
-
-/* see sph_sha3.h */
-void
-sph_sha512_close(void *cc, void *dst)
-{
-	sha384_close(cc, dst, 8);
-//	sph_sha512_init(cc);
-}
-
-/* see sph_sha3.h */
-void
-sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
-{
-	sha384_addbits_and_close(cc, ub, n, dst, 8);
-//	sph_sha512_init(cc);
-}
-
-/* see sph_sha3.h */
-void
-sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8])
-{
-#define SHA3_IN(x)   msg[x]
-	SHA3_ROUND_BODY(SHA3_IN, val);
-#undef SHA3_IN
-}
-
-#endif
--- a/algo/sha/sha2-hash-4way.c
+++ b/algo/sha/sha2-hash-4way.c
@@ -30,13 +30,568 @@
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */

+#if defined(__AVX__)
+
 #include <stddef.h>
 #include <string.h>

 #include "sha2-hash-4way.h"

+#include <stdio.h>
+
+// SHA-256 32 bit
+// H256: initial hash words; sha256_4way_init / sha256_8way_init broadcast H256[k] into sc->val[k].
+static const sph_u32 H256[8] = {
+        SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
+        SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
+        SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+        SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+static const sph_u32 K256[64] = {   // round constants; the STEP macros read K256[j+i] (i = 0..15, j = 0/16/32/48)
+        SPH_C32(0x428A2F98), SPH_C32(0x71374491),
+        SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
+        SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
+        SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
+        SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
+        SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
+        SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
+        SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
+        SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
+        SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
+        SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
+        SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
+        SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
+        SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
+        SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
+        SPH_C32(0x06CA6351), SPH_C32(0x14292967),
+        SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
+        SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
+        SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
+        SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
+        SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
+        SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
+        SPH_C32(0xD192E819), SPH_C32(0xD6990624),
+        SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
+        SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
+        SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
+        SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
+        SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
+        SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
+        SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
+        SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
+        SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
+};
+
+// SHA-256 4 way
+
+#define SHA2s_MEXP( a, b, c, d ) /* next schedule word: SSG2_1(W[a]) + W[b] + SSG2_0(W[c]) + W[d], per 32-bit lane; expands to a plain expression, the caller supplies the ';' */ \
+     _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
+                    SSG2_1( W[a] ), W[b] ), SSG2_0( W[c] ) ), W[d] )
+
+#define CHs(X, Y, Z) /* bitwise choose: ((Y ^ Z) & X) ^ Z */ \
+   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( Y, Z ), X ), Z ) 
+
+#define MAJs(X, Y, Z) /* bitwise majority: (X & Y) | ((X | Y) & Z) */ \
+   _mm_or_si128( _mm_and_si128( X, Y ), \
+                    _mm_and_si128( _mm_or_si128( X, Y ), Z ) )
+
+#define BSG2_0(x) /* XOR of mm_rotr_32 by 2, 13 and 22; mm_rotr_32 is defined outside this file section, presumably a per-lane rotate right */ \
+   _mm_xor_si128( _mm_xor_si128( \
+        mm_rotr_32(x,  2), mm_rotr_32(x, 13) ), mm_rotr_32( x, 22) )
+
+#define BSG2_1(x) /* XOR of mm_rotr_32 by 6, 11 and 25 */ \
+   _mm_xor_si128( _mm_xor_si128( \
+        mm_rotr_32(x,  6), mm_rotr_32(x, 11) ), mm_rotr_32( x, 25) )
+
+#define SSG2_0(x) /* XOR of mm_rotr_32 by 7 and 18 with a logical right shift by 3 */ \
+   _mm_xor_si128( _mm_xor_si128( \
+        mm_rotr_32(x,  7), mm_rotr_32(x, 18) ), _mm_srli_epi32(x, 3) ) 
+
+#define SSG2_1(x) /* XOR of mm_rotr_32 by 17 and 19 with a logical right shift by 10 */ \
+   _mm_xor_si128( _mm_xor_si128( \
+        mm_rotr_32(x, 17), mm_rotr_32(x, 19) ), _mm_srli_epi32(x, 10) )
+
+#define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) /* one round: reads W[i] and K256[j+i] from the caller's scope, modifies only D and H; callers rotate the argument order instead of moving values */ \
+do { \
+  register __m128i T1, T2; \
+  T1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( \
+       _mm_add_epi32( H, BSG2_1(E) ), CHs(E, F, G) ), \
+                          _mm_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
+  T2 = _mm_add_epi32( BSG2_0(A), MAJs(A, B, C) ); \
+  D  = _mm_add_epi32( D,  T1 ); \
+  H  = _mm_add_epi32( T1, T2 ); \
+} while (0)
+
+static void   // one SHA-256 compression step applied to four 32-bit lanes at once
+sha256_4way_round( __m128i *in, __m128i r[8] )   // in: 16 input vectors (read only); r: 8-vector state, updated in place
+{
+   register  __m128i A, B, C, D, E, F, G, H;
+   __m128i W[16];   // 16-entry message schedule, overwritten in place on each pass of the loop below
+
+   W[ 0] = mm_bswap_32( in[ 0] );   // every input word goes through mm_bswap_32 (defined elsewhere; presumably a per-lane byte swap)
+   W[ 1] = mm_bswap_32( in[ 1] );
+   W[ 2] = mm_bswap_32( in[ 2] );
+   W[ 3] = mm_bswap_32( in[ 3] );
+   W[ 4] = mm_bswap_32( in[ 4] );
+   W[ 5] = mm_bswap_32( in[ 5] );
+   W[ 6] = mm_bswap_32( in[ 6] );
+   W[ 7] = mm_bswap_32( in[ 7] );
+   W[ 8] = mm_bswap_32( in[ 8] );
+   W[ 9] = mm_bswap_32( in[ 9] );
+   W[10] = mm_bswap_32( in[10] );
+   W[11] = mm_bswap_32( in[11] );
+   W[12] = mm_bswap_32( in[12] );
+   W[13] = mm_bswap_32( in[13] );
+   W[14] = mm_bswap_32( in[14] );
+   W[15] = mm_bswap_32( in[15] );
+
+   A = r[0];   // working variables start as a copy of the current state
+   B = r[1];
+   C = r[2];
+   D = r[3];
+   E = r[4];
+   F = r[5];
+   G = r[6];
+   H = r[7];
+
+   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );   // rounds 0..15 use the loaded words directly (j = 0)
+   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );   // arguments rotate by one each step, so no register shuffling is needed
+   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
+   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
+   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
+   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
+   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
+   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
+   SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
+   SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
+   SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
+   SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
+   SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
+   SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
+   SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
+   SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
+
+   for ( int j = 16; j < 64; j += 16 )   // rounds 16..63 in three batches of 16; j is the K256 offset
+   {
+      W[ 0] = SHA2s_MEXP( 14,  9,  1,  0 );   // expand the schedule in place; each new word reads slots from this or the previous batch
+      W[ 1] = SHA2s_MEXP( 15, 10,  2,  1 );
+      W[ 2] = SHA2s_MEXP(  0, 11,  3,  2 );
+      W[ 3] = SHA2s_MEXP(  1, 12,  4,  3 );
+      W[ 4] = SHA2s_MEXP(  2, 13,  5,  4 );
+      W[ 5] = SHA2s_MEXP(  3, 14,  6,  5 );
+      W[ 6] = SHA2s_MEXP(  4, 15,  7,  6 );
+      W[ 7] = SHA2s_MEXP(  5,  0,  8,  7 );
+      W[ 8] = SHA2s_MEXP(  6,  1,  9,  8 );
+      W[ 9] = SHA2s_MEXP(  7,  2, 10,  9 );
+      W[10] = SHA2s_MEXP(  8,  3, 11, 10 );
+      W[11] = SHA2s_MEXP(  9,  4, 12, 11 );
+      W[12] = SHA2s_MEXP( 10,  5, 13, 12 );
+      W[13] = SHA2s_MEXP( 11,  6, 14, 13 );
+      W[14] = SHA2s_MEXP( 12,  7, 15, 14 );
+      W[15] = SHA2s_MEXP( 13,  8,  0, 15 );
+
+      SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
+      SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
+      SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
+      SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
+      SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
+      SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
+      SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
+      SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
+      SHA2s_4WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
+      SHA2s_4WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
+      SHA2s_4WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
+      SHA2s_4WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
+      SHA2s_4WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
+      SHA2s_4WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
+      SHA2s_4WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
+      SHA2s_4WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
+   }
+
+   r[0] = _mm_add_epi32( r[0], A );   // feed-forward: add the working variables back into the state
+   r[1] = _mm_add_epi32( r[1], B );
+   r[2] = _mm_add_epi32( r[2], C );
+   r[3] = _mm_add_epi32( r[3], D );
+   r[4] = _mm_add_epi32( r[4], E );
+   r[5] = _mm_add_epi32( r[5], F );
+   r[6] = _mm_add_epi32( r[6], G );
+   r[7] = _mm_add_epi32( r[7], H );
+}
+
+void sha256_4way_init( sha256_4way_context *sc )   // reset sc for a new hash: zero the byte counter, load the initial state
+{
+   sc->count_high = sc->count_low = 0;
+   sc->val[0] = _mm_set1_epi32( H256[0] );   // each H256 word is broadcast to all four lanes
+   sc->val[1] = _mm_set1_epi32( H256[1] );
+   sc->val[2] = _mm_set1_epi32( H256[2] );
+   sc->val[3] = _mm_set1_epi32( H256[3] );
+   sc->val[4] = _mm_set1_epi32( H256[4] );
+   sc->val[5] = _mm_set1_epi32( H256[5] );
+   sc->val[6] = _mm_set1_epi32( H256[6] );
+   sc->val[7] = _mm_set1_epi32( H256[7] );
+}
+
+void sha256_4way( sha256_4way_context *sc, const void *data, size_t len )   // absorb data into sc; len is counted against the 64-byte block size, i.e. bytes per lane
+{
+   __m128i *vdata = (__m128i*)data;   // data is consumed as __m128i vectors, one vector per 4 bytes of len
+   size_t ptr;
+   const int buf_size = 64;
+
+   ptr = (unsigned)sc->count_low & (buf_size - 1U);   // byte offset already filled in the current block
+   while ( len > 0 )
+   {
+      size_t clen;
+      uint32_t clow, clow2;
+
+      clen = buf_size - ptr;   // room left in the block, capped to the remaining input
+      if ( clen > len )
+         clen = len;
+      memcpy_128( sc->buf + (ptr>>2), vdata, clen>>2 );   // NOTE(review): ptr and clen are shifted right by 2, so len appears to need to be a multiple of 4 - confirm with callers
+      vdata = vdata + (clen>>2);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == buf_size )   // block full: compress it and start a new one
+      {
+         sha256_4way_round( sc->buf, sc->val );
+         ptr = 0;
+      }
+      clow = sc->count_low;
+      clow2 = SPH_T32( clow + clen );
+      sc->count_low = clow2;
+      if ( clow2 < clow )   // low word wrapped around: carry into the high word
+         sc->count_high++;
+   }
+}
+
+void sha256_4way_close( sha256_4way_context *sc, void *dst )   // pad, run the final compression(s), write 8 vectors to dst
+{
+    unsigned ptr, u;
+    uint32_t low, high;
+    const int buf_size = 64;
+    const int pad = buf_size - 8;   // offset of the 8-byte length field at the end of a block
+
+    ptr = (unsigned)sc->count_low & (buf_size - 1U);
+    sc->buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 );   // padding marker in every lane, in the word after the data
+    ptr += 4;
+
+    if ( ptr > pad )   // no room left for the length field: zero-fill, compress, then use a fresh block
+    {
+         memset_zero_128( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
+         sha256_4way_round( sc->buf, sc->val );
+         memset_zero_128( sc->buf, pad >> 2 );
+    }
+    else
+         memset_zero_128( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
+
+    low = sc->count_low;   // convert the 64-bit byte count (high:low) to a bit count by shifting left 3
+    high = (sc->count_high << 3) | (low >> 29);
+    low = low << 3;
+
+    sc->buf[ pad >> 2 ] =   // length words are stored through mm_bswap_32; sha256_4way_round applies mm_bswap_32 again on load
+                 mm_bswap_32( _mm_set1_epi32( high ) );
+    sc->buf[ ( pad+4 ) >> 2 ] =
+                 mm_bswap_32( _mm_set1_epi32( low ) );
+    sha256_4way_round( sc->buf, sc->val );
+
+    for ( u = 0; u < 8; u ++ )   // dst must have room for 8 __m128i; each state vector is written through mm_bswap_32
+       ((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
+}
+
 #if defined(__AVX2__)

+// SHA-256 8 way
+
+#define CHx(X, Y, Z) /* 256-bit bitwise choose: ((Y ^ Z) & X) ^ Z */ \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 
+
+#define MAJx(X, Y, Z) /* 256-bit bitwise majority: (X & Y) | ((X | Y) & Z) */ \
+   _mm256_or_si256( _mm256_and_si256( X, Y ), \
+                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
+
+#define BSG2_0x(x) /* XOR of mm256_rotr_32 by 2, 13 and 22; mm256_rotr_32 is defined outside this file section, presumably a per-lane rotate right */ \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_rotr_32(x,  2), mm256_rotr_32(x, 13) ), mm256_rotr_32( x, 22) )
+
+#define BSG2_1x(x) /* XOR of mm256_rotr_32 by 6, 11 and 25 */ \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_rotr_32(x,  6), mm256_rotr_32(x, 11) ), mm256_rotr_32( x, 25) )
+
+#define SSG2_0x(x) /* XOR of mm256_rotr_32 by 7 and 18 with a logical right shift by 3 */ \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_rotr_32(x,  7), mm256_rotr_32(x, 18) ), _mm256_srli_epi32(x, 3) ) 
+
+#define SSG2_1x(x) /* XOR of mm256_rotr_32 by 17 and 19 with a logical right shift by 10 */ \
+   _mm256_xor_si256( _mm256_xor_si256( \
+       mm256_rotr_32(x, 17), mm256_rotr_32(x, 19) ), _mm256_srli_epi32(x, 10) )
+
+#define SHA2x_MEXP( a, b, c, d ) /* next schedule word: SSG2_1x(W[a]) + W[b] + SSG2_0x(W[c]) + W[d], per 32-bit lane; expands to a plain expression, the caller supplies the ';' */ \
+     _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
+                    SSG2_1x( W[a] ), W[b] ), SSG2_0x( W[c] ) ), W[d] )
+
+#define SHA2s_8WAY_STEP(A, B, C, D, E, F, G, H, i, j) /* one round on 8 lanes: reads W[i] and K256[j+i] from the caller's scope, modifies only D and H */ \
+do { \
+  register __m256i T1, T2; \
+  T1 = _mm256_add_epi32( _mm256_add_epi32( _mm256_add_epi32( \
+       _mm256_add_epi32( H, BSG2_1x(E) ), CHx(E, F, G) ), \
+                          _mm256_set1_epi32( K256[( (j)+(i) )] ) ), W[i] ); \
+  T2 = _mm256_add_epi32( BSG2_0x(A), MAJx(A, B, C) ); \
+  D  = _mm256_add_epi32( D,  T1 ); \
+  H  = _mm256_add_epi32( T1, T2 ); \
+} while (0)
+
+static void   // one SHA-256 compression step applied to eight 32-bit lanes at once
+sha256_8way_round( __m256i *in, __m256i r[8] )   // in: 16 input vectors (read only); r: 8-vector state, updated in place
+{
+   register  __m256i A, B, C, D, E, F, G, H;
+   __m256i W[16];   // 16-entry message schedule, overwritten in place on each pass of the loop below
+
+   W[ 0] = mm256_bswap_32( in[ 0] );   // every input word goes through mm256_bswap_32 (defined elsewhere; presumably a per-lane byte swap)
+   W[ 1] = mm256_bswap_32( in[ 1] );
+   W[ 2] = mm256_bswap_32( in[ 2] );
+   W[ 3] = mm256_bswap_32( in[ 3] );
+   W[ 4] = mm256_bswap_32( in[ 4] );
+   W[ 5] = mm256_bswap_32( in[ 5] );
+   W[ 6] = mm256_bswap_32( in[ 6] );
+   W[ 7] = mm256_bswap_32( in[ 7] );
+   W[ 8] = mm256_bswap_32( in[ 8] );
+   W[ 9] = mm256_bswap_32( in[ 9] );
+   W[10] = mm256_bswap_32( in[10] );
+   W[11] = mm256_bswap_32( in[11] );
+   W[12] = mm256_bswap_32( in[12] );
+   W[13] = mm256_bswap_32( in[13] );
+   W[14] = mm256_bswap_32( in[14] );
+   W[15] = mm256_bswap_32( in[15] );
+
+   A = r[0];   // working variables start as a copy of the current state
+   B = r[1];
+   C = r[2];
+   D = r[3];
+   E = r[4];
+   F = r[5];
+   G = r[6];
+   H = r[7];
+
+   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, 0 );   // rounds 0..15 use the loaded words directly (j = 0)
+
+//printf("sha256 8 step: D= %08lx H= %08lx\n",*(uint32_t*)&D,*(uint32_t*)&H);
+
+   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, 0 );   // arguments rotate by one each step, so no register shuffling is needed
+   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, 0 );
+   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, 0 );
+   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, 0 );
+   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, 0 );
+   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, 0 );
+   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, 0 );
+   SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, 0 );
+   SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  9, 0 );
+   SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, 0 );
+   SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, 0 );
+   SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, 0 );
+   SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, 0 );
+   SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, 0 );
+   SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, 0 );
+
+//printf("sha256 8 step: A= %08lx B= %08lx\n",*(uint32_t*)&A,*(uint32_t*)&B);
+
+   for ( int j = 16; j < 64; j += 16 )   // rounds 16..63 in three batches of 16; j is the K256 offset
+   {
+      W[ 0] = SHA2x_MEXP( 14,  9,  1,  0 );   // expand the schedule in place; each new word reads slots from this or the previous batch
+      W[ 1] = SHA2x_MEXP( 15, 10,  2,  1 );
+      W[ 2] = SHA2x_MEXP(  0, 11,  3,  2 );
+      W[ 3] = SHA2x_MEXP(  1, 12,  4,  3 );
+      W[ 4] = SHA2x_MEXP(  2, 13,  5,  4 );
+      W[ 5] = SHA2x_MEXP(  3, 14,  6,  5 );
+      W[ 6] = SHA2x_MEXP(  4, 15,  7,  6 );
+      W[ 7] = SHA2x_MEXP(  5,  0,  8,  7 );
+      W[ 8] = SHA2x_MEXP(  6,  1,  9,  8 );
+      W[ 9] = SHA2x_MEXP(  7,  2, 10,  9 );
+      W[10] = SHA2x_MEXP(  8,  3, 11, 10 );
+      W[11] = SHA2x_MEXP(  9,  4, 12, 11 );
+      W[12] = SHA2x_MEXP( 10,  5, 13, 12 );
+      W[13] = SHA2x_MEXP( 11,  6, 14, 13 );
+      W[14] = SHA2x_MEXP( 12,  7, 15, 14 );
+      W[15] = SHA2x_MEXP( 13,  8,  0, 15 );
+
+      SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  0, j );
+      SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  1, j );
+      SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F,  2, j );
+      SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E,  3, j );
+      SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D,  4, j );
+      SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C,  5, j );
+      SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B,  6, j );
+      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A,  7, j );
+      SHA2s_8WAY_STEP( A, B, C, D, E, F, G, H,  8, j );
+      SHA2s_8WAY_STEP( H, A, B, C, D, E, F, G,  9, j );
+      SHA2s_8WAY_STEP( G, H, A, B, C, D, E, F, 10, j );
+      SHA2s_8WAY_STEP( F, G, H, A, B, C, D, E, 11, j );
+      SHA2s_8WAY_STEP( E, F, G, H, A, B, C, D, 12, j );
+      SHA2s_8WAY_STEP( D, E, F, G, H, A, B, C, 13, j );
+      SHA2s_8WAY_STEP( C, D, E, F, G, H, A, B, 14, j );
+      SHA2s_8WAY_STEP( B, C, D, E, F, G, H, A, 15, j );
+   }
+
+   r[0] = _mm256_add_epi32( r[0], A );   // feed-forward: add the working variables back into the state
+   r[1] = _mm256_add_epi32( r[1], B );
+   r[2] = _mm256_add_epi32( r[2], C );
+   r[3] = _mm256_add_epi32( r[3], D );
+   r[4] = _mm256_add_epi32( r[4], E );
+   r[5] = _mm256_add_epi32( r[5], F );
+   r[6] = _mm256_add_epi32( r[6], G );
+   r[7] = _mm256_add_epi32( r[7], H );
+}
+
+
+void sha256_8way_init( sha256_8way_context *sc )   // reset sc for a new hash: zero the byte counter, load the initial state
+{
+   sc->count_high = sc->count_low = 0;
+   sc->val[0] = _mm256_set1_epi32( H256[0] );   // each H256 word is broadcast to all eight lanes
+   sc->val[1] = _mm256_set1_epi32( H256[1] );
+   sc->val[2] = _mm256_set1_epi32( H256[2] );
+   sc->val[3] = _mm256_set1_epi32( H256[3] );
+   sc->val[4] = _mm256_set1_epi32( H256[4] );
+   sc->val[5] = _mm256_set1_epi32( H256[5] );
+   sc->val[6] = _mm256_set1_epi32( H256[6] );
+   sc->val[7] = _mm256_set1_epi32( H256[7] );
+}
+
+void sha256_8way( sha256_8way_context *sc, const void *data, size_t len )   // absorb data into sc; len is counted against the 64-byte block size, i.e. bytes per lane
+{
+   __m256i *vdata = (__m256i*)data;   // data is consumed as __m256i vectors, one vector per 4 bytes of len
+   size_t ptr;
+   const int buf_size = 64;
+/*
+printf("sha256 8 update1: len= %d\n", len);
+uint32_t* d = (uint32_t*)data;
+printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
+printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
+printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
+printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[128],d[136],d[144],d[152]);
+printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[160],d[168],d[176],d[184]);
+printf("sha256 8 in: %08lx %08lx %08lx %08lx\n",d[192],d[200],d[208],d[216]);
+*/
+   ptr = (unsigned)sc->count_low & (buf_size - 1U);   // byte offset already filled in the current block
+   while ( len > 0 )
+   {
+      size_t clen;
+      uint32_t clow, clow2;
+
+      clen = buf_size - ptr;   // room left in the block, capped to the remaining input
+      if ( clen > len )
+         clen = len;
+      memcpy_256( sc->buf + (ptr>>2), vdata, clen>>2 );   // NOTE(review): ptr and clen are shifted right by 2, so len appears to need to be a multiple of 4 - confirm with callers
+      vdata = vdata + (clen>>2);
+      ptr += clen;
+      len -= clen;
+      if ( ptr == buf_size )   // block full: compress it and start a new one
+      {
+/*
+printf("sha256 8 update2: compress\n");
+d = (uint32_t*)sc->buf;
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
+d= (uint32_t*)sc->val;
+printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
+*/
+         sha256_8way_round( sc->buf, sc->val );
+/*
+printf("sha256 8 update3\n");
+d= (uint32_t*)sc->val;
+printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
+*/
+         ptr = 0;
+      }
+      clow = sc->count_low;
+      clow2 = SPH_T32( clow + clen );
+      sc->count_low = clow2;
+      if ( clow2 < clow )   // low word wrapped around: carry into the high word
+         sc->count_high++;
+   }
+}
+
+void sha256_8way_close( sha256_8way_context *sc, void *dst )   // pad, run the final compression(s), write 8 vectors to dst
+{
+    unsigned ptr, u;
+    uint32_t low, high;
+    const int buf_size = 64;
+    const int pad = buf_size - 8;   // offset of the 8-byte length field at the end of a block
+
+    ptr = (unsigned)sc->count_low & (buf_size - 1U);
+/*
+printf("sha256 8 close1: ptr= %d\n", ptr);
+uint32_t* d = (uint32_t*)sc->buf;
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
+*/
+
+    sc->buf[ ptr>>2 ] = _mm256_set1_epi32( 0x80 );   // padding marker in every lane, in the word after the data
+    ptr += 4;
+
+    if ( ptr > pad )   // no room left for the length field: zero-fill, compress, then use a fresh block
+    {
+         memset_zero_256( sc->buf + (ptr>>2), (buf_size - ptr) >> 2 );
+
+//printf("sha256 8 close2: compress\n");
+//uint32_t* d = (uint32_t*)sc->buf;
+//printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+
+
+         sha256_8way_round( sc->buf, sc->val );
+
+//d= (uint32_t*)sc->val;
+//printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+
+         memset_zero_256( sc->buf, pad >> 2 );
+    }
+    else
+         memset_zero_256( sc->buf + (ptr>>2), (pad - ptr) >> 2 );
+
+    low = sc->count_low;   // convert the 64-bit byte count (high:low) to a bit count by shifting left 3
+    high = (sc->count_high << 3) | (low >> 29);
+    low = low << 3;
+
+    sc->buf[ pad >> 2 ] =   // length words are stored through mm256_bswap_32; sha256_8way_round applies mm256_bswap_32 again on load
+                 mm256_bswap_32( _mm256_set1_epi32( high ) );
+    sc->buf[ ( pad+4 ) >> 2 ] =
+                 mm256_bswap_32( _mm256_set1_epi32( low ) );
+/*
+d = (uint32_t*)sc->buf;
+printf("sha256 8 close3: compress\n");
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[64],d[72],d[80],d[88]);
+printf("sha256 8 buf: %08lx %08lx %08lx %08lx\n",d[96],d[104],d[112],d[120]);
+d= (uint32_t*)sc->val;
+printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
+*/
+
+    sha256_8way_round( sc->buf, sc->val );
+/*
+printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[0],d[8],d[16],d[24]);
+printf("sha256 8 val: %08lx %08lx %08lx %08lx\n",d[32],d[40],d[48],d[56]);
+*/
+    for ( u = 0; u < 8; u ++ )   // dst must have room for 8 __m256i; each state vector is written through mm256_bswap_32
+       ((__m256i*)dst)[u] = mm256_bswap_32( sc->val[u] );
+}
+
+
+// SHA-512 4 way 64 bit
+
+static const sph_u64 H512[8] = {   // eight 64-bit constants; this patch relocates the table from below K512 to above it, values unchanged
+        SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+        SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+        SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+        SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
 static const sph_u64 K512[80] = {
 	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
 	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
@@ -80,13 +635,6 @@ static const sph_u64 K512[80] = {
 	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
 };

-static const sph_u64 H512[8] = {
-	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
-	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
-	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
-	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
-};
-
 #define CH(X, Y, Z) \
   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 

@@ -112,7 +660,7 @@ static const sph_u64 H512[8] = {

 #define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
 do { \
-  __m256i T1, T2; \
+  register __m256i T1, T2; \
  T1 = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( \
       _mm256_add_epi64( H, BSG5_1(E) ), CH(E, F, G) ), \
                         _mm256_set1_epi64x( K512[i] ) ), W[i] ); \
@@ -125,11 +673,11 @@ static void
 sha512_4way_round( __m256i *in, __m256i r[8] )
 {
   int i;
-   __m256i A, B, C, D, E, F, G, H;
+   register __m256i A, B, C, D, E, F, G, H;
   __m256i W[80];

   for ( i = 0; i < 16; i++ )
-      W[i] = mm256_byteswap_64( in[i] );
+      W[i] = mm256_bswap_64( in[i] );
   for ( i = 16; i < 80; i++ )
      W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
           SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
@@ -182,7 +730,7 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
 {
   __m256i *vdata = (__m256i*)data;
   size_t ptr;
-   int buf_size = 128;
+   const int buf_size = 128;

   ptr = (unsigned)sc->count & (buf_size - 1U);
   while ( len > 0 )
@@ -207,13 +755,12 @@ void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
 void sha512_4way_close( sha512_4way_context *sc, void *dst )
 {
    unsigned ptr, u;
-    int buf_size = 128;
-    int pad = buf_size - 16;
+    const int buf_size = 128;
+    const int pad = buf_size - 16;

    ptr = (unsigned)sc->count & (buf_size - 1U);
    sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 );
    ptr += 8;
-
    if ( ptr > pad )
    {
         memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 );
@@ -224,13 +771,14 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst )
         memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 );

    sc->buf[ pad >> 3 ] =
-                 mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+                 mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
    sc->buf[ ( pad+8 ) >> 3 ] = 
-                 mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+                 mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
    sha512_4way_round( sc->buf, sc->val );

    for ( u = 0; u < 8; u ++ )
-       ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] );
+       ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
 }

-#endif
+#endif  // __AVX2__
+#endif  // __AVX__
--- a/algo/sha/sha2-hash-4way.h
+++ b/algo/sha/sha2-hash-4way.h
@@ -44,51 +44,39 @@
 #include "sph_types.h"
 #include "avxdefs.h"

-#if 0
+#if defined(__AVX__)

-#define SPH_SIZE_sha224   224
+//#define SPH_SIZE_sha256   256

-#define SPH_SIZE_sha256   256
+// SHA-256 4 way

 typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char buf[64];    /* first field, for alignment */
-	sph_u32 val[8];
-#if SPH_64
-	sph_u64 count;
-#else
-	sph_u32 count_high, count_low;
-#endif
-#endif
-} sph_sha224_context;
+   __m128i buf[64>>2];
+   __m128i val[8];
+   uint32_t count_high, count_low;
+} sha256_4way_context;

-typedef sph_sha224_context sph_sha256_context;
-
-void sph_sha224_init(void *cc);
-
-void sph_sha224(void *cc, const void *data, size_t len);
-
-void sph_sha224_close(void *cc, void *dst);
-
-void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
-
-void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);
-
-void sph_sha256_init(void *cc);
-
-void sph_sha256(void *cc, const void *data, size_t len);
-
-void sph_sha256_close(void *cc, void *dst);
-
-void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
-
-void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
-
-#endif
+void sha256_4way_init( sha256_4way_context *sc );
+void sha256_4way( sha256_4way_context *sc, const void *data, size_t len );
+void sha256_4way_close( sha256_4way_context *sc, void *dst );

 #if defined (__AVX2__)

-#define SPH_SIZE_sha512   512
+// SHA-256 8 way
+
+// State carried between sha256_8way_init / sha256_8way / sha256_8way_close.
+// All vector members are 256 bits wide; presumably each vector holds one
+// 32 bit word from each of 8 independent hash lanes -- TODO confirm against
+// the implementation in sha2-hash-4way.c.
+typedef struct {
+   // 64>>2 = 16 vectors of buffered input.
+   __m256i buf[64>>2];
+   // 8 vectors of chaining state.
+   __m256i val[8];
+   // Input length counter split into two 32 bit halves (units not visible
+   // from this header -- verify in the implementation).
+   uint32_t count_high, count_low;
+} sha256_8way_context;
+
+// Streaming interface: initialise the context, feed data, then finalise
+// into dst.
+void sha256_8way_init( sha256_8way_context *sc );
+void sha256_8way( sha256_8way_context *sc, const void *data, size_t len );
+void sha256_8way_close( sha256_8way_context *sc, void *dst );
+
+//#define SPH_SIZE_sha512   512
+
+// SHA-512 4 way

 typedef struct {
   __m256i buf[128>>3];
@@ -102,3 +90,4 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst );

 #endif
 #endif
+#endif
--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -155,7 +155,7 @@ bool register_sha256t_algo( algo_gate_t* gate )
    gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t;
    gate->hash       = (void*)&sha256t_hash;
-    gate->set_target = (void*)&sha256t_set_target;
+//    gate->set_target = (void*)&sha256t_set_target;
    gate->get_max64  = (void*)&get_max64_0x3ffff;
    return true;
 }
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -52,21 +52,6 @@ extern "C"{

 #define C32   SPH_C32

-/*
- * As of round 2 of the SHA-3 competition, the published reference
- * implementation and test vectors are wrong, because they use
- * big-endian AES tables while the internal decoding uses little-endian.
- * The code below follows the specification. To turn it into a code
- * which follows the reference implementation (the one called "BugFix"
- * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
- * the code below (from the '#define AES_BIG_ENDIAN...' to the definition
- * of the AES_ROUND_NOKEY macro) and replace it with the version which
- * is commented out afterwards.
- */
-
-#define AES_BIG_ENDIAN   0
-#include "algo/sha/aes_helper.c"
-
 static const sph_u32 IV512[] = {
 	C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
 	C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
@@ -74,198 +59,19 @@ static const sph_u32 IV512[] = {
 	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
 };

-#define AES_ROUND_NOKEY(x0, x1, x2, x3)   do { \
-		sph_u32 t0 = (x0); \
-		sph_u32 t1 = (x1); \
-		sph_u32 t2 = (x2); \
-		sph_u32 t3 = (x3); \
-		AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
-	} while (0)
+// Partially rotate elements in two 128 bit vectors as one 256 bit vector
+// and return the rotated high 128 bits.
+#if defined(__SSSE3__)

-  
-#define KEY_EXPAND_ELT(k0, k1, k2, k3)   do { \
-		sph_u32 kt; \
-		AES_ROUND_NOKEY(k1, k2, k3, k0); \
-		kt = (k0); \
-		(k0) = (k1); \
-		(k1) = (k2); \
-		(k2) = (k3); \
-		(k3) = kt; \
-	} while (0)
+#define mm_rotr256hi_1x32( hi, lo )  _mm_alignr_epi8( lo, hi, 4 )

+#else  // SSE2

-#if SPH_SMALL_FOOTPRINT_SHAVITE
+#define mm_rotr256hi_1x32( hi, lo ) \
+   _mm_or_si128( _mm_srli_si128( hi,  4 ), \
+                 _mm_slli_si128( lo, 12 ) )

-/*
- * This function assumes that "msg" is aligned for 32-bit access.
- */
-static void
-c512(sph_shavite_big_context *sc, const void *msg)
-{
-	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
-	sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
-	sph_u32 rk[448];
-	size_t u;
-	int r, s;
-
-#if SPH_LITTLE_ENDIAN
-	memcpy(rk, msg, 128);
-#else
-	for (u = 0; u < 32; u += 4) {
-		rk[u + 0] = sph_dec32le_aligned(
-			(const unsigned char *)msg + (u << 2) +  0);
-		rk[u + 1] = sph_dec32le_aligned(
-			(const unsigned char *)msg + (u << 2) +  4);
-		rk[u + 2] = sph_dec32le_aligned(
-			(const unsigned char *)msg + (u << 2) +  8);
-		rk[u + 3] = sph_dec32le_aligned(
-			(const unsigned char *)msg + (u << 2) + 12);
-	}
 #endif
-	u = 32;
-	for (;;) {
-		for (s = 0; s < 4; s ++) {
-			sph_u32 x0, x1, x2, x3;
-
-			x0 = rk[u - 31];
-			x1 = rk[u - 30];
-			x2 = rk[u - 29];
-			x3 = rk[u - 32];
-			AES_ROUND_NOKEY(x0, x1, x2, x3);
-			rk[u + 0] = x0 ^ rk[u - 4];
-			rk[u + 1] = x1 ^ rk[u - 3];
-			rk[u + 2] = x2 ^ rk[u - 2];
-			rk[u + 3] = x3 ^ rk[u - 1];
-			if (u == 32) {
-				rk[ 32] ^= sc->count0;
-				rk[ 33] ^= sc->count1;
-				rk[ 34] ^= sc->count2;
-				rk[ 35] ^= SPH_T32(~sc->count3);
-			} else if (u == 440) {
-				rk[440] ^= sc->count1;
-				rk[441] ^= sc->count0;
-				rk[442] ^= sc->count3;
-				rk[443] ^= SPH_T32(~sc->count2);
-			}
-			u += 4;
-
-			x0 = rk[u - 31];
-			x1 = rk[u - 30];
-			x2 = rk[u - 29];
-			x3 = rk[u - 32];
-			AES_ROUND_NOKEY(x0, x1, x2, x3);
-			rk[u + 0] = x0 ^ rk[u - 4];
-			rk[u + 1] = x1 ^ rk[u - 3];
-			rk[u + 2] = x2 ^ rk[u - 2];
-			rk[u + 3] = x3 ^ rk[u - 1];
-			if (u == 164) {
-				rk[164] ^= sc->count3;
-				rk[165] ^= sc->count2;
-				rk[166] ^= sc->count1;
-				rk[167] ^= SPH_T32(~sc->count0);
-			} else if (u == 316) {
-				rk[316] ^= sc->count2;
-				rk[317] ^= sc->count3;
-				rk[318] ^= sc->count0;
-				rk[319] ^= SPH_T32(~sc->count1);
-			}
-			u += 4;
-		}
-		if (u == 448)
-			break;
-		for (s = 0; s < 8; s ++) {
-			rk[u + 0] = rk[u - 32] ^ rk[u - 7];
-			rk[u + 1] = rk[u - 31] ^ rk[u - 6];
-			rk[u + 2] = rk[u - 30] ^ rk[u - 5];
-			rk[u + 3] = rk[u - 29] ^ rk[u - 4];
-			u += 4;
-		}
-	}
-
-	p0 = sc->h[0x0];
-	p1 = sc->h[0x1];
-	p2 = sc->h[0x2];
-	p3 = sc->h[0x3];
-	p4 = sc->h[0x4];
-	p5 = sc->h[0x5];
-	p6 = sc->h[0x6];
-	p7 = sc->h[0x7];
-	p8 = sc->h[0x8];
-	p9 = sc->h[0x9];
-	pA = sc->h[0xA];
-	pB = sc->h[0xB];
-	pC = sc->h[0xC];
-	pD = sc->h[0xD];
-	pE = sc->h[0xE];
-	pF = sc->h[0xF];
-	u = 0;
-	for (r = 0; r < 14; r ++) {
-#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3)   do { \
-		sph_u32 x0, x1, x2, x3; \
-		x0 = r0 ^ rk[u ++]; \
-		x1 = r1 ^ rk[u ++]; \
-		x2 = r2 ^ rk[u ++]; \
-		x3 = r3 ^ rk[u ++]; \
-		AES_ROUND_NOKEY(x0, x1, x2, x3); \
-		x0 ^= rk[u ++]; \
-		x1 ^= rk[u ++]; \
-		x2 ^= rk[u ++]; \
-		x3 ^= rk[u ++]; \
-		AES_ROUND_NOKEY(x0, x1, x2, x3); \
-		x0 ^= rk[u ++]; \
-		x1 ^= rk[u ++]; \
-		x2 ^= rk[u ++]; \
-		x3 ^= rk[u ++]; \
-		AES_ROUND_NOKEY(x0, x1, x2, x3); \
-		x0 ^= rk[u ++]; \
-		x1 ^= rk[u ++]; \
-		x2 ^= rk[u ++]; \
-		x3 ^= rk[u ++]; \
-		AES_ROUND_NOKEY(x0, x1, x2, x3); \
-		l0 ^= x0; \
-		l1 ^= x1; \
-		l2 ^= x2; \
-		l3 ^= x3; \
-	} while (0)
-
-#define WROT(a, b, c, d)   do { \
-		sph_u32 t = d; \
-		d = c; \
-		c = b; \
-		b = a; \
-		a = t; \
-	} while (0)
-
-		C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);
-		C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);
-
-		WROT(p0, p4, p8, pC);
-		WROT(p1, p5, p9, pD);
-		WROT(p2, p6, pA, pE);
-		WROT(p3, p7, pB, pF);
-
-#undef C512_ELT
-#undef WROT
-	}
-	sc->h[0x0] ^= p0;
-	sc->h[0x1] ^= p1;
-	sc->h[0x2] ^= p2;
-	sc->h[0x3] ^= p3;
-	sc->h[0x4] ^= p4;
-	sc->h[0x5] ^= p5;
-	sc->h[0x6] ^= p6;
-	sc->h[0x7] ^= p7;
-	sc->h[0x8] ^= p8;
-	sc->h[0x9] ^= p9;
-	sc->h[0xA] ^= pA;
-	sc->h[0xB] ^= pB;
-	sc->h[0xC] ^= pC;
-	sc->h[0xD] ^= pD;
-	sc->h[0xE] ^= pE;
-	sc->h[0xF] ^= pF;
-}
-
-#else

 static void
 c512( sph_shavite_big_context *sc, const void *msg )
@@ -284,42 +90,42 @@ c512( sph_shavite_big_context *sc, const void *msg )
   // round
   k00 = m[0];
   x = _mm_xor_si128( p1, k00 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
  
   k01 = m[1];
   x = _mm_xor_si128( x, k01 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );

   k02 = m[2];
   x = _mm_xor_si128( x, k02 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );

   k03 = m[3];
   x = _mm_xor_si128( x, k03 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   p0 = _mm_xor_si128( p0, x );

   k10 = m[4];
   x = _mm_xor_si128( p3, k10 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   
   k11 = m[5];
   x = _mm_xor_si128( x, k11 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );

   k12 = m[6];
   x = _mm_xor_si128( x, k12 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );

   k13 = m[7];
   x = _mm_xor_si128( x, k13 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   p2 = _mm_xor_si128( p2, x );

   for ( r = 0; r < 3; r ++ )
   {
      // round 1, 5, 9
-      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+      k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 ); 

      if ( r == 0 )
@@ -327,8 +133,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); 

      x = _mm_xor_si128( p0, k00 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      if ( r == 1 )
@@ -336,34 +142,34 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count0, sc->count1, sc->count2, sc->count3 ) );

      x = _mm_xor_si128( x, k01 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );

      x = _mm_xor_si128( x, k02 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );

      x = _mm_xor_si128( x, k03 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );
-      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+      k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p2, k10 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );

      x = _mm_xor_si128( x, k11 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );

      x = _mm_xor_si128( x, k12 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      if ( r == 2 )
@@ -371,173 +177,173 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count1, sc->count0, sc->count3, sc->count2 ) );

      x = _mm_xor_si128( x, k13 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p1 = _mm_xor_si128( p1, x );

      // round 2, 6, 10

-      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
+      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13 ) );
      x = _mm_xor_si128( p3, k00 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
+      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00 ) );
      x = _mm_xor_si128( x, k01 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
+      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01 ) );
      x = _mm_xor_si128( x, k02 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
+      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02 ) );
      x = _mm_xor_si128( x, k03 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

      p2 = _mm_xor_si128( p2, x );
-      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
+      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03 ) );
      x = _mm_xor_si128( p1, k10 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
+      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10 ) );
      x = _mm_xor_si128( x, k11 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
+      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11 ) );
      x = _mm_xor_si128( x, k12 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
+      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12 ) );
      x = _mm_xor_si128( x, k13 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p0 = _mm_xor_si128( p0, x );

      // round 3, 7, 11

-      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+      k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 );

      x = _mm_xor_si128( p2, k00 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+      k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      x = _mm_xor_si128( x, k01 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );

      x = _mm_xor_si128( x, k02 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );

      x = _mm_xor_si128( x, k03 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p1 = _mm_xor_si128( p1, x );
-      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+      k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p0, k10 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );

      x = _mm_xor_si128( x, k11 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );

      x = _mm_xor_si128( x, k12 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      x = _mm_xor_si128( x, k13 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );

      // round 4, 8, 12

-      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
+      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13 ) );

      x = _mm_xor_si128( p1, k00 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00 ) );

      x = _mm_xor_si128( x, k01 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01 ) );

      x = _mm_xor_si128( x, k02 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02 ) );

      x = _mm_xor_si128( x, k03 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p0 = _mm_xor_si128( p0, x );
-      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
+      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03 ) );

      x = _mm_xor_si128( p3, k10 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10 ) );

      x = _mm_xor_si128( x, k11 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11 ) );

      x = _mm_xor_si128( x, k12 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12 ) );

      x = _mm_xor_si128( x, k13 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p2 = _mm_xor_si128( p2, x );
   }

   // round 13

-   k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+   k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
   k00 = _mm_xor_si128( k00, k13 );

   x = _mm_xor_si128( p0, k00 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); 
+   x = _mm_aesenc_si128( x, m128_zero );
+   k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) ); 
   k01 = _mm_xor_si128( k01, k00 );

   x = _mm_xor_si128( x, k01 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
   k02 = _mm_xor_si128( k02, k01 );

   x = _mm_xor_si128( x, k02 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
   k03 = _mm_xor_si128( k03, k02 );

   x = _mm_xor_si128( x, k03 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   p3 = _mm_xor_si128( p3, x );
-   k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+   k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
   k10 = _mm_xor_si128( k10, k03 );

   x = _mm_xor_si128( p2, k10 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
   k11 = _mm_xor_si128( k11, k10 );

   x = _mm_xor_si128( x, k11 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
   k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
               ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );

   x = _mm_xor_si128( x, k12 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
   k13 = _mm_xor_si128( k13, k12 );

   x = _mm_xor_si128( x, k13 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   p1 = _mm_xor_si128( p1, x );

   h[0] = _mm_xor_si128( h[0], p2 );
@@ -546,7 +352,6 @@ c512( sph_shavite_big_context *sc, const void *msg )
   h[3] = _mm_xor_si128( h[3], p1 );
 }

-#endif

 static void
 shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
--- a/algo/simd/sse2/nist.c
+++ b/algo/simd/sse2/nist.c
--- a/algo/simd/sse2/nist.h
+++ b/algo/simd/sse2/nist.h
--- a/algo/simd/sse2/simd-compat.h
+++ b/algo/simd/sse2/simd-compat.h
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -0,0 +1,853 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "simd-hash-2way.h"
+
+#if defined (__AVX2__)
+
+// imported from simd_iv.h
+
+// Table of 32 constant 32 bit words. Judging by the name this is the
+// initial chaining value for the 512 bit SIMD hash -- its consumer is not
+// in the visible part of this file, TODO confirm.
+// NOTE(review): the table is neither static nor const; check whether other
+// translation units reference it before tightening the qualifiers.
+uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
+                           0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
+                           0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f,
+                           0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
+                           0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
+                           0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
+                           0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
+                           0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };
+
+/* Twiddle tables */
+
+// Seven rows of 16 bit twiddle factors used by fft64_2way, which multiplies
+// X6..X0 by rows 0..6 between its two FFT_8 passes.
+// Every row lists the same eight values twice, so both 128 bit halves of
+// the 256 bit vector carry identical factors (presumably one half per
+// interleaved hash instance -- TODO confirm).
+static const m256_v16 FFT64_Twiddle[] =
+{
+    {{ 1,    2,    4,    8,   16,   32,   64,  128,
+       1,    2,    4,    8,   16,   32,   64,  128 }},
+    {{ 1,   60,    2,  120,    4,  -17,    8,  -34,
+       1,   60,    2,  120,    4,  -17,    8,  -34 }},
+    {{ 1,  120,    8,  -68,   64,  -30,   -2,   17,
+       1,  120,    8,  -68,   64,  -30,   -2,   17 }},
+    {{ 1,   46,   60,  -67,    2,   92,  120,  123,
+       1,   46,   60,  -67,    2,   92,  120,  123 }},
+    {{ 1,   92,  -17,  -22,   32,  117,  -30,   67,
+       1,   92,  -17,  -22,   32,  117,  -30,   67 }},
+    {{ 1,  -67,  120,  -73,    8,  -22,  -68,  -70,
+       1,  -67,  120,  -73,    8,  -22,  -68,  -70 }},
+    {{ 1,  123,  -34,  -70,  128,   67,   17,   35,
+       1,  123,  -34,  -70,  128,   67,   17,   35 }},
+};
+
+// Eight rows of 16 bit twiddle factors for the 128 point transforms:
+// fft128_2way multiplies the difference half A[i+8] by row i, and
+// fft128_2way_msg multiplies the unpacked message vectors by rows 0..7.
+// As with the other twiddle tables, each row repeats its eight values in
+// both 128 bit halves of the vector.
+static const m256_v16 FFT128_Twiddle[] =
+{
+    {{   1, -118,   46,  -31,   60,  116,  -67,  -61,
+         1, -118,   46,  -31,   60,  116,  -67,  -61 }},
+    {{   2,   21,   92,  -62,  120,  -25,  123, -122,
+         2,   21,   92,  -62,  120,  -25,  123, -122 }},
+    {{   4,   42,  -73, -124,  -17,  -50,  -11,   13,
+         4,   42,  -73, -124,  -17,  -50,  -11,   13 }},
+    {{   8,   84,  111,    9,  -34, -100,  -22,   26,
+         8,   84,  111,    9,  -34, -100,  -22,   26 }},
+    {{  16,  -89,  -35,   18,  -68,   57,  -44,   52,
+        16,  -89,  -35,   18,  -68,   57,  -44,   52 }},
+    {{  32,   79,  -70,   36,  121,  114,  -88,  104,
+        32,   79,  -70,   36,  121,  114,  -88,  104 }},
+    {{  64,  -99,  117,   72,  -15,  -29,   81,  -49,
+        64,  -99,  117,   72,  -15,  -29,   81,  -49 }},
+    {{ 128,   59,  -23, -113,  -30,  -58,  -95,  -98,
+       128,   59,  -23, -113,  -30,  -58,  -95,  -98 }},
+};
+
+// Sixteen rows of 16 bit twiddle factors, each row repeating its eight
+// values in both 128 bit halves of the vector. The table is not referenced
+// in the part of the file visible here; presumably it serves the 256 point
+// message transform (fft256_2way_msg) -- TODO confirm.
+static const m256_v16 FFT256_Twiddle[] =
+{
+    {{   1,   41, -118,   45,   46,   87,  -31,   14,
+         1,   41, -118,   45,   46,   87,  -31,   14 }},
+    {{  60, -110,  116, -127,  -67,   80,  -61,   69,
+        60, -110,  116, -127,  -67,   80,  -61,   69 }},
+    {{   2,   82,   21,   90,   92,  -83,  -62,   28,
+         2,   82,   21,   90,   92,  -83,  -62,   28 }},
+    {{ 120,   37,  -25,    3,  123,  -97, -122, -119,
+       120,   37,  -25,    3,  123,  -97, -122, -119 }},
+    {{   4,  -93,   42,  -77,  -73,   91, -124,   56,
+         4,  -93,   42,  -77,  -73,   91, -124,   56 }},
+    {{ -17,   74,  -50,    6,  -11,   63,   13,   19,
+       -17,   74,  -50,    6,  -11,   63,   13,   19 }},
+    {{   8,   71,   84,  103,  111,  -75,    9,  112,
+         8,   71,   84,  103,  111,  -75,    9,  112 }},
+    {{ -34, -109, -100,   12,  -22,  126,   26,   38,
+       -34, -109, -100,   12,  -22,  126,   26,   38 }},
+    {{  16, -115,  -89,  -51,  -35,  107,   18,  -33,
+        16, -115,  -89,  -51,  -35,  107,   18,  -33 }},
+    {{ -68,   39,   57,   24,  -44,   -5,   52,   76,
+       -68,   39,   57,   24,  -44,   -5,   52,   76 }},
+    {{  32,   27,   79, -102,  -70,  -43,   36,  -66,
+        32,   27,   79, -102,  -70,  -43,   36,  -66 }},
+    {{ 121,   78,  114,   48,  -88,  -10,  104, -105,
+       121,   78,  114,   48,  -88,  -10,  104, -105 }},
+    {{  64,   54,  -99,   53,  117,  -86,   72,  125,
+        64,   54,  -99,   53,  117,  -86,   72,  125 }},
+    {{ -15, -101,  -29,   96,   81,  -20,  -49,   47,
+       -15, -101,  -29,   96,   81,  -20,  -49,   47 }},
+    {{ 128,  108,   59,  106,  -23,   85, -113,   -7,
+       128,  108,   59,  106,  -23,   85, -113,   -7 }},
+    {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94,
+       -30,   55,  -58,  -65,  -95,  -40,  -98,   94 }}
+};
+
+// Immediates for _mm256_shuffle_epi32. With SHUFXOR_n the 32 bit element
+// at index i of each 128 bit lane is taken from index i^n of the source.
+#define SHUFXOR_1 0xb1          /* 0b10110001 */
+#define SHUFXOR_2 0x4e          /* 0b01001110 */
+#define SHUFXOR_3 0x1b          /* 0b00011011 */
+
+// Two level paste so that macro arguments are expanded before ## is applied.
+#define CAT(x, y) x##y
+#define XCAT(x,y) CAT(x,y)
+
+// Shuffle the 32 bit elements of x using SHUFXOR_<s>; s must expand to
+// 1, 2 or 3.
+#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
+
+// imported from vector.c
+
+// Partial reduction modulo 257 of every signed 16 bit lane: computes
+// (x & 255) - (x >> 8) with an arithmetic shift, which is congruent to x
+// because 256 == -1 (mod 257). The result lies in [-127, 383].
+// Note: x is evaluated twice.
+#define REDUCE(x) \
+  _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
+                                         _mm256_srai_epi16( x, 8 ) )
+
+// Subtract 257 from every 16 bit lane that is greater than 128. Applied to
+// the output of REDUCE this brings each lane into [-128, 128].
+// Note: x is evaluated twice.
+#define EXTRA_REDUCE_S(x)\
+  _mm256_sub_epi16( x, \
+         _mm256_and_si256( _mm256_set1_epi16( 257 ), \
+                           _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )
+
+// Full signed reduction of an expression: REDUCE followed by EXTRA_REDUCE_S.
+#define REDUCE_FULL_S( x )  EXTRA_REDUCE_S( REDUCE (x ) )
+
+// In-place variants operating on X(i); the user must define the X() macro
+// (fft64_2way maps X(i) to its local variable Xi).
+#define DO_REDUCE( i )      X(i) = REDUCE( X(i) )
+
+#define DO_REDUCE_FULL_S(i) \
+do { \
+    X(i) = REDUCE( X(i) );                        \
+    X(i) = EXTRA_REDUCE_S( X(i) );                \
+} while(0)
+
+// In-place 64 point number theoretic transform on 16 bit lanes, reduced
+// modulo 257 by the REDUCE macros defined earlier in this file.
+//
+// a points to 8 __m256i vectors (256 bytes) that are read, transformed and
+// written back. The pointer is dereferenced as __m256i, so it presumably
+// must be 32 byte aligned -- TODO confirm callers guarantee this.
+//
+// The transform is done as: 8 parallel decimation-in-frequency FFT_8,
+// a multiplication by FFT64_Twiddle, a 16 bit transpose (INTERLEAVE), then
+// 8 parallel decimation-in-time FFT_8 and a final full signed reduction.
+void fft64_2way( void *a )
+{
+  __m256i* const A = a;
+  register __m256i X0, X1, X2, X3, X4, X5, X6, X7;
+
+// X(i) names the local vector Xi so the DO_REDUCE* macros can address it.
+#define X(i) X##i
+
+  X0 = A[0];
+  X1 = A[1];
+  X2 = A[2];
+  X3 = A[3];
+  X4 = A[4];
+  X5 = A[5];
+  X6 = A[6];
+  X7 = A[7];
+
+#define DO_REDUCE(i)   X(i) = REDUCE( X(i) )
+
+   // Begin with 8 parallel DIF FFT_8
+   //
+   // FFT_8 using w=4 as 8th root of unity
+   //  Unrolled decimation in frequency (DIF) radix-2 NTT.
+   //  Output data is in revbin_permuted order.
+
+  // Left shift counts: shifting by w[n] multiplies by 4^n.
+  static const int w[] = {0, 2, 4, 6};
+
+// DIF butterfly without twiddle: X(j) = X(i) + X(j), X(i) = X(i) - X(j).
+#define BUTTERFLY_0( i,j ) \
+do { \
+    __m256i v = X(j); \
+    X(j) = _mm256_add_epi16( X(i), X(j) ); \
+    X(i) = _mm256_sub_epi16( X(i), v ); \
+} while(0)
+
+// DIF butterfly whose difference output is also multiplied by 4^n.
+#define BUTTERFLY_N( i,j,n ) \
+do { \
+    __m256i v = X(j); \
+    X(j) = _mm256_add_epi16( X(i), X(j) ); \
+    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
+} while(0)
+
+  BUTTERFLY_0( 0, 4 );
+  BUTTERFLY_N( 1, 5, 1 );
+  BUTTERFLY_N( 2, 6, 2 );
+  BUTTERFLY_N( 3, 7, 3 );
+
+  DO_REDUCE( 2 );
+  DO_REDUCE( 3 );
+
+  BUTTERFLY_0( 0, 2 );
+  BUTTERFLY_0( 4, 6 );
+  BUTTERFLY_N( 1, 3, 2 );
+  BUTTERFLY_N( 5, 7, 2 );
+
+  DO_REDUCE( 1 );
+
+  BUTTERFLY_0( 0, 1 );
+  BUTTERFLY_0( 2, 3 );
+  BUTTERFLY_0( 4, 5 );
+  BUTTERFLY_0( 6, 7 );
+
+  /* We don't need to reduce X(7) */
+  DO_REDUCE_FULL_S( 0 );
+  DO_REDUCE_FULL_S( 1 );
+  DO_REDUCE_FULL_S( 2 );
+  DO_REDUCE_FULL_S( 3 );
+  DO_REDUCE_FULL_S( 4 );
+  DO_REDUCE_FULL_S( 5 );
+  DO_REDUCE_FULL_S( 6 );
+
+#undef BUTTERFLY_0
+#undef BUTTERFLY_N
+
+  // Multiply by twiddle factors
+  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
+  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
+  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
+  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
+  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
+  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
+  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );
+
+  // Transpose the FFT state with a revbin order permutation
+  // on the rows and the column.
+  // This will make the full FFT_64 in order.
+#define INTERLEAVE(i,j) \
+  do { \
+    __m256i t1= X(i); \
+    __m256i t2= X(j); \
+    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
+    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
+  } while(0)
+
+  INTERLEAVE( 1, 0 );
+  INTERLEAVE( 3, 2 );
+  INTERLEAVE( 5, 4 );
+  INTERLEAVE( 7, 6 );
+
+  INTERLEAVE( 2, 0 );
+  INTERLEAVE( 3, 1 );
+  INTERLEAVE( 6, 4 );
+  INTERLEAVE( 7, 5 );
+
+  INTERLEAVE( 4, 0 );
+  INTERLEAVE( 5, 1 );
+  INTERLEAVE( 6, 2 );
+  INTERLEAVE( 7, 3 );
+
+#undef INTERLEAVE
+
+   // Finish with 8 parallel DIT FFT_8
+   // FFT_8 using w=4 as 8th root of unity
+   // Unrolled decimation in time (DIT) radix-2 NTT.
+   // Input data is in revbin_permuted order.
+
+// DIT butterfly without twiddle: X(i) = X(j) + X(i), X(j) = X(j) - X(i).
+#define BUTTERFLY_0( i,j ) \
+do { \
+   __m256i u = X(j); \
+   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
+   X(i) = _mm256_add_epi16( u, X(i) ); \
+} while(0)
+
+
+// DIT butterfly that first multiplies X(i) by 4^n.
+#define BUTTERFLY_N( i,j,n ) \
+do { \
+   __m256i u = X(j); \
+   X(i) = _mm256_slli_epi16( X(i), w[n] ); \
+   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
+   X(i) = _mm256_add_epi16( u, X(i) ); \
+} while(0)
+
+  DO_REDUCE( 0 );
+  DO_REDUCE( 1 );
+  DO_REDUCE( 2 );
+  DO_REDUCE( 3 );
+  DO_REDUCE( 4 );
+  DO_REDUCE( 5 );
+  DO_REDUCE( 6 );
+  DO_REDUCE( 7 );
+
+  BUTTERFLY_0( 0, 1 );
+  BUTTERFLY_0( 2, 3 );
+  BUTTERFLY_0( 4, 5 );
+  BUTTERFLY_0( 6, 7 );
+
+  BUTTERFLY_0( 0, 2 );
+  BUTTERFLY_0( 4, 6 );
+  BUTTERFLY_N( 1, 3, 2 );
+  BUTTERFLY_N( 5, 7, 2 );
+
+  DO_REDUCE( 3 );
+
+  BUTTERFLY_0( 0, 4 );
+  BUTTERFLY_N( 1, 5, 1 );
+  BUTTERFLY_N( 2, 6, 2 );
+  BUTTERFLY_N( 3, 7, 3 );
+
+  DO_REDUCE_FULL_S( 0 );
+  DO_REDUCE_FULL_S( 1 );
+  DO_REDUCE_FULL_S( 2 );
+  DO_REDUCE_FULL_S( 3 );
+  DO_REDUCE_FULL_S( 4 );
+  DO_REDUCE_FULL_S( 5 );
+  DO_REDUCE_FULL_S( 6 );
+  DO_REDUCE_FULL_S( 7 );
+
+// Undefine the DIT butterflies by their real names so they do not leak
+// into the rest of the translation unit (they reference the local w[]).
+#undef BUTTERFLY_0
+#undef BUTTERFLY_N
+
+  A[0] = X0;
+  A[1] = X1;
+  A[2] = X2;
+  A[3] = X3;
+  A[4] = X4;
+  A[5] = X5;
+  A[6] = X6;
+  A[7] = X7;
+
+#undef X
+}
+
+// In-place 128 point transform built from two 64 point transforms.
+//
+// a points to 16 __m256i vectors. The lower eight and upper eight vectors
+// are combined with size-2 butterflies (the difference half is also
+// multiplied by FFT128_Twiddle), each half is run through fft64_2way, and
+// the two results are interleaved 16 bits at a time back into a.
+void fft128_2way( void *a )
+{
+  __m256i *const lower = (__m256i*) a;
+  __m256i *const upper = lower + 8;
+  // Holds the sum half so the lower part of a can be overwritten when the
+  // results are interleaved at the end.
+  __m256i sums[8];
+  int k;
+
+  /* Size-2 butterflies */
+  for ( k = 0; k < 8; k++ )
+  {
+    const __m256i u = lower[ k ];
+    const __m256i v = upper[ k ];
+    __m256i s = _mm256_add_epi16( u, v );
+    __m256i d = _mm256_sub_epi16( u, v );
+
+    s = REDUCE_FULL_S( s );
+    d = REDUCE_FULL_S( d );
+    d = _mm256_mullo_epi16( d, FFT128_Twiddle[k].m256i );
+    d = REDUCE_FULL_S( d );
+
+    sums[ k ]  = s;
+    upper[ k ] = d;
+  }
+
+  fft64_2way( sums );
+  fft64_2way( upper );
+
+  /* Transpose (i.e. interleave) */
+  for ( k = 0; k < 8; k++ )
+  {
+    const __m256i even = sums[ k ];
+    const __m256i odd  = upper[ k ];
+
+    lower[ 2*k   ] = _mm256_unpacklo_epi16( even, odd );
+    lower[ 2*k+1 ] = _mm256_unpackhi_epi16( even, odd );
+  }
+}
+
+void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
+{
+  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
+  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
+  // Widens bytes of X[0..3] into A[0..7], twiddled copies go to A[8..15].
+  __m256i *X = (__m256i*)x;
+  __m256i *A = (__m256i*)a;
+  // `final` picks FinalTweak instead of Tweak for the last input vector X[3].
+  // UNPACK: A[2i],A[2i+1] = zero-extended X[i]; A[2i+8],A[2i+9] = REDUCE(* twiddle)
+#define UNPACK( i ) \
+do { \
+    __m256i t = X[i]; \
+    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
+    A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
+    A[2*i+8] = REDUCE(A[2*i+8]); \
+    A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
+    A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
+    A[2*i+9] = REDUCE(A[2*i+9]); \
+} while(0)
+
+    // Variant that tweaks the last butterflies (+tw / -tw) to introduce X^127
+#define UNPACK_TWEAK( i,tw ) \
+do { \
+    __m256i t = X[i]; \
+    __m256i tmp; \
+    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
+    A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
+    A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
+    tmp      = _mm256_unpackhi_epi8( t, m256_zero ); \
+    A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
+    A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
+                                   FFT128_Twiddle[ 2*i+1 ].m256i );\
+    A[2*i+9] = REDUCE( A[ 2*i+9 ] );                       \
+} while(0)
+
+  UNPACK( 0 );
+  UNPACK( 1 );
+  UNPACK( 2 );
+  if ( final )
+    UNPACK_TWEAK( 3, FinalTweak.m256i );
+  else
+    UNPACK_TWEAK( 3, Tweak.m256i );
+
+#undef UNPACK
+#undef UNPACK_TWEAK
+  // a+128 uint16_t is A+8: the plain and the twiddled half go through fft64 apart
+  fft64_2way( a );
+  fft64_2way( a+128 );
+}
+
+void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
+{
+  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
+  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
+  // Widens bytes of X[0..7] into A[0..15], twiddled copies go to A[16..31].
+  __m256i *X = (__m256i*)x;
+  __m256i *A = (__m256i*)a;
+  // `final` picks FinalTweak instead of Tweak for the last input vector X[7].
+  // UNPACK: A[2i],A[2i+1] = zero-extended X[i]; A[2i+16],A[2i+17] = REDUCE(* twiddle)
+#define UNPACK( i ) \
+do { \
+    __m256i t = X[i]; \
+    A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
+    A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
+                                        FFT256_Twiddle[ 2*i ].m256i ); \
+    A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
+    A[ 2*i +  1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
+    A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
+                                        FFT256_Twiddle[ 2*i + 1 ].m256i ); \
+    A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
+} while(0)
+
+   // Variant that tweaks the last butterflies (+tw / -tw) to introduce X^127
+#define UNPACK_TWEAK( i,tw ) \
+do { \
+    __m256i t = X[i]; \
+    __m256i tmp; \
+    A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
+    A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
+                                        FFT256_Twiddle[ 2*i ].m256i ); \
+    A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
+    tmp           = _mm256_unpackhi_epi8( t, m256_zero ); \
+    A[ 2*i +  1 ] = _mm256_add_epi16( tmp, tw ); \
+    A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
+                                        FFT256_Twiddle[ 2*i + 1 ].m256i ); \
+  } while(0)
+  // NOTE(review): unlike fft128_2way_msg, A[2i+17] gets no REDUCE here - confirm
+  UNPACK( 0 );
+  UNPACK( 1 );
+  UNPACK( 2 );
+  UNPACK( 3 );
+  UNPACK( 4 );
+  UNPACK( 5 );
+  UNPACK( 6 );
+  if ( final )
+    UNPACK_TWEAK( 7, FinalTweak.m256i );
+  else
+    UNPACK_TWEAK( 7, Tweak.m256i );
+
+#undef UNPACK
+#undef UNPACK_TWEAK
+  // a+256 uint16_t is A+16: the plain and the twiddled half are transformed apart
+  fft128_2way( a );
+  fft128_2way( a+256 );
+}
+
+void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
+{ // XOR msg into state, run 8 ROUNDs fed from fft, then 4 feed-forward STEPs.
+  register __m256i S0l, S1l, S2l, S3l;
+  register __m256i S0h, S1h, S2h, S3h;
+  __m256i *S = (__m256i*) state;
+  __m256i *M = (__m256i*) msg;
+  __m256i *W = (__m256i*) fft;
+  static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };
+  // Working copy of the state XORed with the message: S<i>l/S<i>h = S[2i]/S[2i+1]
+  S0l = _mm256_xor_si256( S[0], M[0] );
+  S0h = _mm256_xor_si256( S[1], M[1] );
+  S1l = _mm256_xor_si256( S[2], M[2] );
+  S1h = _mm256_xor_si256( S[3], M[3] );
+  S2l = _mm256_xor_si256( S[4], M[4] );
+  S2h = _mm256_xor_si256( S[5], M[5] );
+  S3l = _mm256_xor_si256( S[6], M[6] );
+  S3h = _mm256_xor_si256( S[7], M[7] );
+
+#define S(i) S##i
+  // F_0 selects C where B is set and D elsewhere; F_1 is the bitwise majority.
+#define F_0(B, C, D) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D )
+#define F_1(B, C, D) \
+   _mm256_or_si256( _mm256_and_si256( D, C ),\
+                    _mm256_and_si256( _mm256_or_si256( D,C ), B ) )
+
+#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
+#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
+
+  // We split the round function in two halves
+  // so as to insert some independent computations in between
+  // SUM7_xy is (x + y) mod 7, spelled out so it can be token-pasted.
+#define SUM7_00 0
+#define SUM7_01 1
+#define SUM7_02 2
+#define SUM7_03 3
+#define SUM7_04 4
+#define SUM7_05 5
+#define SUM7_06 6
+
+#define SUM7_10 1
+#define SUM7_11 2
+#define SUM7_12 3
+#define SUM7_13 4
+#define SUM7_14 5
+#define SUM7_15 6
+#define SUM7_16 0
+
+#define SUM7_20 2
+#define SUM7_21 3
+#define SUM7_22 4
+#define SUM7_23 5
+#define SUM7_24 6
+#define SUM7_25 0
+#define SUM7_26 1
+
+#define SUM7_30 3
+#define SUM7_31 4
+#define SUM7_32 5
+#define SUM7_33 6
+#define SUM7_34 0
+#define SUM7_35 1
+#define SUM7_36 2
+
+#define SUM7_40 4
+#define SUM7_41 5
+#define SUM7_42 6
+#define SUM7_43 0
+#define SUM7_44 1
+#define SUM7_45 2
+#define SUM7_46 3
+
+#define SUM7_50 5
+#define SUM7_51 6
+#define SUM7_52 0
+#define SUM7_53 1
+#define SUM7_54 2
+#define SUM7_55 3
+#define SUM7_56 4
+
+#define SUM7_60 6
+#define SUM7_61 0
+#define SUM7_62 1
+#define SUM7_63 2
+#define SUM7_64 3
+#define SUM7_65 4
+#define SUM7_66 5
+  // PERM(z,d,a): d = PERM_n(a), n = SUM7_<z><PERM_START>; XCAT is defined elsewhere
+#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
+  // Each PERM_n applies shufxor (defined elsewhere) and/or swaps the l/h halves.
+#define PERM_0(d,a) /* XOR 1 */ \
+do { \
+    d##l = shufxor( a##l, 1 ); \
+    d##h = shufxor( a##h, 1 ); \
+ } while(0)
+
+#define PERM_1(d,a) /* XOR 6 */ \
+do { \
+    d##l = shufxor( a##h, 2 ); \
+    d##h = shufxor( a##l, 2 ); \
+} while(0)
+
+#define PERM_2(d,a) /* XOR 2 */ \
+do { \
+    d##l = shufxor( a##l, 2 ); \
+    d##h = shufxor( a##h, 2 ); \
+} while(0)
+
+#define PERM_3(d,a) /* XOR 3 */ \
+do { \
+    d##l = shufxor( a##l, 3 ); \
+    d##h = shufxor( a##h, 3 ); \
+} while(0)
+
+#define PERM_4(d,a) /* XOR 5 */ \
+do { \
+    d##l = shufxor( a##h, 1 ); \
+    d##h = shufxor( a##l, 1 ); \
+} while(0)
+
+#define PERM_5(d,a) /* XOR 7 */ \
+do { \
+    d##l = shufxor( a##h, 3 ); \
+    d##h = shufxor( a##l, 3 ); \
+} while(0)
+
+#define PERM_6(d,a) /* XOR 4 */ \
+do { \
+    d##l = a##h; \
+    d##h = a##l; \
+} while(0)
+  // STEP_1: TT = rotl(F(a,b,c) + w + d, s); a = rotl(a, r); then d = PERM(a).
+#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
+do { \
+    TTl  = Fl( a,b,c,fun ); \
+    TTh  = Fh( a,b,c,fun ); \
+    a##l = mm256_rotl_32( a##l, r ); \
+    a##h = mm256_rotl_32( a##h, r ); \
+    w##l = _mm256_add_epi32( w##l, d##l ); \
+    w##h = _mm256_add_epi32( w##h, d##h ); \
+    TTl  = _mm256_add_epi32( TTl, w##l ); \
+    TTh  = _mm256_add_epi32( TTh, w##h ); \
+    TTl  = mm256_rotl_32( TTl, s ); \
+    TTh  = mm256_rotl_32( TTh, s ); \
+    PERM( z,d,a ); \
+} while(0)
+
+#define STEP_1( a,b,c,d,w,fun,r,s,z )   STEP_1_( a,b,c,d,w,fun,r,s,z )
+  // STEP_2: d += TT, completing the step started by STEP_1.
+#define STEP_2_( a,b,c,d,w,fun,r,s ) \
+do { \
+    d##l = _mm256_add_epi32( d##l, TTl ); \
+    d##h = _mm256_add_epi32( d##h, TTh ); \
+} while(0)
+
+#define STEP_2( a,b,c,d,w,fun,r,s )  STEP_2_( a,b,c,d,w,fun,r,s )
+
+#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
+do { \
+    register __m256i TTl, TTh, Wl=w1, Wh=w2; \
+    STEP_1( a,b,c,d,W,fun,r,s,z ); \
+    STEP_2( a,b,c,d,W,fun,r,s ); \
+} while(0);
+  // MSG_l / MSG_h turn an index x into the even / odd vector index of W.
+#define MSG_l(x) (2*(x))
+#define MSG_h(x) (2*(x)+1)
+  // MSG: interleave the 16-bit words of W[a] and W[b], multiply by code[z].
+#define MSG( w,hh,ll,u,z ) \
+do { \
+    int a = MSG_##u(hh); \
+    int b = MSG_##u(ll); \
+    w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
+    w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
+    w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \
+    w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
+} while(0)
+  // ROUND: four steps with the roles of S0..S3 rotating, MSG loads in between.
+#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
+do { \
+    register __m256i W0l, W1l, W2l, W3l, TTl; \
+    register __m256i W0h, W1h, W2h, W3h, TTh; \
+    MSG( W0, h0, l0, u0, z ); \
+    STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
+    MSG( W1, h1, l1, u1, z ); \
+    STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
+    STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
+    MSG( W2,h2,l2,u2,z ); \
+    STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
+    STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
+    MSG( W3,h3,l3,u3,z ); \
+    STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
+    STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
+    STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
+} while(0)
+
+   // 4 rounds with code 185
+#define PERM_START 0
+   ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0, 3,  23, 17, 27, 0);
+#undef PERM_START
+#define PERM_START 4
+   ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1, 3,  23, 17, 27, 0);
+#undef PERM_START
+#define PERM_START 1
+   ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22, 7,  0);
+#undef PERM_START
+#define PERM_START 5
+   ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22, 7,  0);
+#undef PERM_START
+
+   // 4 rounds with code 233
+#define PERM_START 2
+   ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
+#undef PERM_START
+#define PERM_START 6
+   ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
+#undef PERM_START
+#define PERM_START 3
+   ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
+#undef PERM_START
+#define PERM_START 0
+   ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
+#undef PERM_START
+
+   // 1 round as feed-forward; S[0..7] still holds the state as it was on entry
+#define PERM_START 4
+   STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0 );
+   STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
+   STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
+   STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3 );
+
+   S[0] = S0l;  S[1] = S0h;  S[2] = S1l;  S[3] = S1h;
+   S[4] = S2l;  S[5] = S2h;  S[6] = S3l;  S[7] = S3h;
+
+#undef PERM_START
+#undef STEP_1
+#undef STEP_2
+#undef STEP
+#undef ROUND
+}
+
+void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
+{
+   // Scratch for the expanded message: 32 vectors of 16-bit words
+   m256_v16 expanded[32];
+   fft256_2way_msg( (uint16_t*) expanded[0].u16, m, final );
+   rounds512_2way( state->A, m, (uint16_t*) expanded[0].u16 );
+}
+
+// imported from nist.c
+
+// Reset `state` for a new hash: record the sizes, zero the bit counter and
+// load the IV, copying each group of four IV words into both 128-bit lanes.
+int simd_2way_init( simd_2way_context *state, int hashbitlen )
+{
+  __m256i *iv_dst = (__m256i*)state->A;
+
+  state->count      = 0;
+  state->blocksize  = 128*8;   // bits per lane
+  state->n_feistels = 8;
+  state->hashbitlen = hashbitlen;
+
+  for ( int v = 0, w = 0; v < 8; v++, w += 4 )
+    iv_dst[v] = _mm256_set_epi32(
+         SIMD_IV_512[w+3], SIMD_IV_512[w+2], SIMD_IV_512[w+1], SIMD_IV_512[w],
+         SIMD_IV_512[w+3], SIMD_IV_512[w+2], SIMD_IV_512[w+1], SIMD_IV_512[w] );
+  return 0;
+}
+
+// Absorb `databitlen` bits per lane from the 2-way interleaved input `data`;
+// partial blocks are staged in state->buffer. Always returns 0.
+int simd_2way_update( simd_2way_context *state, const void *data,
+                             int databitlen )
+{
+  // Byte cursor: pointer arithmetic on `const void *` is not ISO C.
+  const uint8_t *in = (const uint8_t*) data;
+  int bs      = state->blocksize;
+  int current = state->count & (bs - 1);
+  while ( databitlen > 0 )
+  {
+    if ( current == 0 && databitlen >= bs )
+    {
+      // We can hash the data directly from the input buffer.
+      SIMD_2way_Compress( state, in, 0 );
+      databitlen -= bs;
+      in += 2*(bs/8);
+      state->count += bs;
+    }
+    else
+    {
+      // Copy a chunk of data to the buffer
+      int len = bs - current;
+      if ( databitlen < len )
+      {
+        memcpy( state->buffer + 2*(current/8), in, 2*((databitlen+7)/8) );
+        state->count += databitlen;
+        return 0;
+      }
+      memcpy( state->buffer + 2*(current/8), in, 2*(len/8) );
+      state->count += len;
+      databitlen -= len;
+      in += 2*(len/8);
+      current = 0;
+      SIMD_2way_Compress( state, state->buffer, 0 );
+    }
+  }
+  return 0;
+}
+
+// Finish the hash: flush any buffered partial block (zero padded), compress a
+// final block carrying the bit count in both lanes, then copy the digest out.
+int simd_2way_close( simd_2way_context *state, void *hashval )
+{
+  const int lane_bytes = state->blocksize / 8;
+  // Flag for the last block: 2 when fewer than 16384 bits were hashed, else 1
+  const int final_flag = ( state->count < 16384 ) ? 2 : 1;
+  int pending = state->count & (state->blocksize - 1);
+
+  // Zero-pad and compress whatever is still sitting in the buffer
+  if ( pending != 0 )
+  {
+    pending = ( pending+7 ) / 8;   // bits -> bytes, rounded up
+    memset( state->buffer + 2*pending, 0, 2*( lane_bytes - pending ) );
+    SIMD_2way_Compress( state, state->buffer, 0 );
+  }
+
+  // Last block: the bit count, least significant byte first, at offsets 0 and 16
+  memset( state->buffer, 0, 2*lane_bytes );
+  uint64_t bits = state->count;
+  for ( int k = 0; k < 8; k++, bits >>= 8 )
+  {
+    const uint8_t b = bits & 0xff;
+    state->buffer[ k    ] = b;
+    state->buffer[ k+16 ] = b;
+  }
+
+  SIMD_2way_Compress( state, state->buffer, final_flag );
+  memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );
+
+  return 0;
+}
+
+// One-shot variant: absorbs `databitlen` bits per lane from `data`, then
+// finalizes and copies the digests from state->A to `hashval`. Returns 0.
+int simd_2way_update_close( simd_2way_context *state, void *hashval,
+                            const void *data, int databitlen )
+{
+  // Byte cursor: pointer arithmetic on `const void *` is not ISO C.
+  const uint8_t *in = (const uint8_t*) data;
+  int bs = state->blocksize;  // bits in one lane
+  int current = state->count & (bs - 1);
+  int isshort = 1;
+  int i;
+  uint64_t l;
+
+  while ( databitlen > 0 )
+  {
+    if ( current == 0 && databitlen >= bs )
+    {
+      // We can hash the data directly from the input buffer.
+      SIMD_2way_Compress( state, in, 0 );
+      databitlen -= bs;
+      in += 2*( bs/8 );
+      state->count += bs;
+    }
+    else
+    {
+      // Copy a chunk of data to the buffer
+      int len = bs - current;
+      if ( databitlen < len )
+      {
+        memcpy( state->buffer + 2*( current/8 ), in, 2*( (databitlen+7)/8 ) );
+        state->count += databitlen;
+        break;
+      }
+      memcpy( state->buffer + 2*(current/8), in, 2*(len/8) );
+      state->count += len;
+      databitlen -= len;
+      in += 2*( len/8 );
+      current = 0;
+      SIMD_2way_Compress( state, state->buffer, 0 );
+    }
+  }
+
+  current = state->count & (state->blocksize - 1);
+
+  // If there is still some data in the buffer, hash it
+  if ( current )
+  {
+    current = ( current+7 ) / 8;
+    memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) );
+    SIMD_2way_Compress( state, state->buffer, 0 );
+  }
+
+  // Input the message length as the last block
+  memset( state->buffer, 0, 2*( state->blocksize/8 ) );
+  l = state->count;
+  for ( i = 0; i < 8; i++ )
+  {
+    state->buffer[ i    ] = l & 0xff;
+    state->buffer[ i+16 ] = l & 0xff;
+    l >>= 8;
+  }
+  if ( state->count < 16384 )
+    isshort = 2;
+
+  SIMD_2way_Compress( state, state->buffer, isshort );
+  memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );
+  return 0;
+}
+
+#endif
--- a/algo/simd/simd-hash-2way.h
+++ b/algo/simd/simd-hash-2way.h
@@ -0,0 +1,27 @@
+#ifndef SIMD_HASH_2WAY_H__
+#define SIMD_HASH_2WAY_H__ 1
+
+#include "simd-compat.h"
+
+#if defined(__AVX2__)
+
+#include "avxdefs.h"
+
+typedef struct {
+  uint32_t A[ 32*2 ] __attribute__((aligned(64)));  // hash state, both lanes
+  uint8_t buffer[ 128*2 ] __attribute__((aligned(64))); // pending block, x2 lanes
+  uint64_t count;           // bits absorbed so far, counted once (not per lane)
+  unsigned int hashbitlen;  // digest size in bits, as passed to init
+  unsigned int blocksize;   // block size in bits; init sets 128*8
+  unsigned int n_feistels;  // init sets 8
+  
+} simd_2way_context;
+// Lengths are in bits per lane; presumably data is 128-bit interleaved - confirm
+int simd_2way_init( simd_2way_context *state, int hashbitlen );
+int simd_2way_update( simd_2way_context *state, const void *data,
+                      int databitlen );
+int simd_2way_close( simd_2way_context *state, void *hashval );
+int simd_2way_update_close( simd_2way_context *state, void *hashval,
+                            const void *data, int databitlen );
+#endif
+#endif
--- a/algo/simd/sse2/simd_iv.h
+++ b/algo/simd/sse2/simd_iv.h
@@ -1,3 +1,6 @@
+#if !defined(SIMD_IV_H__)
+#define SIMD_IV_H__
+
 u32 IV_224[] = {
  0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53,
  0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96,
@@ -25,3 +28,5 @@ u32 IV_512[] = {
  0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
  0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
 };
+
+#endif
--- a/algo/simd/sse2/defs_x5.h
+++ b/algo/simd/sse2/defs_x5.h
@@ -1,23 +0,0 @@
-
-#ifndef DEFS_X5_H__
-#define DEFS_X5_H__
-#include <emmintrin.h>
-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
-typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
-
-typedef unsigned char uint8;
-typedef unsigned int uint32;
-typedef unsigned long long uint64;
-
-typedef struct {
-    uint32 buffer[8]; /* Buffer to be hashed */
-    __m128i chainv[10];   /* Chaining values */
-    uint64 bitlen[2]; /* Message length in bits */
-    uint32 rembitlen; /* Length of buffer data to be hashed */
-    int hashbitlen;
-} hashState_luffa;
-
-
-typedef unsigned char byte;
-#endif
--- a/algo/simd/sse2/sph_types.h
+++ b/algo/simd/sse2/sph_types.h
--- a/algo/simd/sse2/vector.c
+++ b/algo/simd/sse2/vector.c
@@ -63,13 +63,13 @@ MAYBE_INLINE void fft64(void *a) {
  v16* const A = a;

  register v16 X0, X1, X2, X3, X4, X5, X6, X7;
-
+/*
 #if V16_SIZE == 8
 #define X(i) A[i]
 #elif V16_SIZE == 4
 #define X(i) A[2*i]
 #endif
-
+*/
 #define X(i) X##i

  X0 = A[0];
@@ -623,6 +623,11 @@ void rounds(u32* state, const unsigned char* msg, short* fft) {
  STEP(S(1), S(2), S(3), S(0), S[3], 0, 25,  4, 20);

  S[0] = S(0);  S[1] = S(1);  S[2] = S(2);  S[3] = S(3);
+
+#undef ROUND
+#undef STEP
+#undef STEP_1
+#undef STEP_2
 }


@@ -849,24 +854,32 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
   */
 #define PERM_START 0
  ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0, 3,  23, 17, 27, 0);
+#undef PERM_START
 #define PERM_START 4
  ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1, 3,  23, 17, 27, 0);
+#undef PERM_START
 #define PERM_START 1
  ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22, 7,  0);
+#undef PERM_START
 #define PERM_START 5
  ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22, 7,  0);
+#undef PERM_START

  /*
   * 4 rounds with code 233
   */
 #define PERM_START 2
  ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
+#undef PERM_START
 #define PERM_START 6
  ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
+#undef PERM_START
 #define PERM_START 3
  ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
+#undef PERM_START
 #define PERM_START 0
  ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
+#undef PERM_START


  /*
@@ -877,9 +890,15 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) {
  STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1);
  STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2);
  STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3);
+#undef PERM_START

  S[0] = S0l;  S[1] = S0h;  S[2] = S1l;  S[3] = S1h;
  S[4] = S2l;  S[5] = S2h;  S[6] = S3l;  S[7] = S3h;
+
+#undef ROUND
+#undef STEP
+#undef STEP_1
+#undef STEP_2
 }

 void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	3363d61524	v3.8.4.1	2018-03-22 14:28:03 -04:00
Jay D Dee	20fe05054c	v3.8.4	2018-03-18 12:51:03 -04:00
Jay D Dee	157508bd07	v3.8.3.3	2018-02-25 14:15:07 -05:00
Jay D Dee	c24a4bdbc2	v3.8.3.2	2018-02-24 14:36:19 -05:00
Jay D Dee	59c7848d91	v3.8.3.1	2018-02-23 15:45:32 -05:00
Jay D Dee	3c02653dbe	v3.8.3	2018-02-23 12:39:15 -05:00
Jay D Dee	502ed0b1fe	v3.8.2.1	2018-02-17 13:52:24 -05:00
Jay D Dee	d60a268972	v3.8.2	2018-02-15 14:48:50 -05:00
Jay D Dee	e4265a6f11	v3.8.1.1	2018-02-09 23:30:14 -05:00
Jay D Dee	a28daca3ce	v3.8.1	2018-02-07 16:38:45 -05:00