v3.8.1

v3.8.0.1
v3.8.0
2025-09-17 23:44:27 +00:00 · 2018-02-07 16:38:45 -05:00 · 2018-02-05 22:10:18 -05:00 · 2018-01-23 21:02:16 -05:00 · 2018-01-16 15:11:44 -05:00 · 2018-01-08 22:04:43 -05:00
206 changed files with 17697 additions and 4922 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,7 +22,6 @@ cpuminer_SOURCES = \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
-  crypto/blake2s.c \
  crypto/oaes_lib.c \
  crypto/c_keccak.c \
  crypto/c_groestl.c \
@@ -38,7 +37,6 @@ cpuminer_SOURCES = \
  algo/argon2/ar2/cores.c \
  algo/argon2/ar2/ar2-scrypt-jane.c \
  algo/argon2/ar2/blake2b.c \
-  algo/axiom.c \
  algo/blake/sph_blake.c \
  algo/blake/blake-hash-4way.c \
  algo/blake/blake-gate.c \
@@ -46,9 +44,12 @@ cpuminer_SOURCES = \
  algo/blake/blake-4way.c \
  algo/blake/sph_blake2b.c \
  algo/blake/blake2b.c \
+  algo/blake/sph-blake2s.c \
  algo/blake/blake2s.c \
+  algo/blake/blakecoin-gate.c \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
+  algo/blake/blakecoin-4way.c \
  algo/blake/decred-gate.c \
  algo/blake/decred.c \
  algo/blake/decred-4way.c \
@@ -56,6 +57,7 @@ cpuminer_SOURCES = \
  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
  algo/bmw/sph_bmw.c \
+  algo/bmw/bmw-hash-4way.c \
  algo/bmw/bmw256.c \
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
@@ -63,10 +65,8 @@ cpuminer_SOURCES = \
  algo/cryptonight/cryptonight.c\
  algo/cubehash/sph_cubehash.c \
  algo/cubehash/sse2/cubehash_sse2.c\
-  algo/drop.c \
  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
-  algo/fresh.c \
  algo/gost/sph_gost.c \
  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
@@ -75,11 +75,12 @@ cpuminer_SOURCES = \
  algo/groestl/aes_ni/hash-groestl256.c \
  algo/fugue/sph_fugue.c \
  algo/hamsi/sph_hamsi.c \
-  algo/haval/haval.c\
+  algo/hamsi/hamsi-hash-4way.c \
+  algo/haval/haval.c \
+  algo/haval/haval-hash-4way.c \
  algo/heavy/sph_hefty1.c \
  algo/heavy/heavy.c \
  algo/heavy/bastion.c \
-  algo/hmq1725.c \
  algo/hodl/aes.c \
  algo/hodl/hodl-gate.c \
  algo/hodl/hodl-wolf.c \
@@ -99,41 +100,57 @@ cpuminer_SOURCES = \
  algo/lbry.c \
  algo/luffa/sph_luffa.c \
  algo/luffa/luffa.c \
-  algo/luffa/sse2/luffa_for_sse2.c \
+  algo/luffa/luffa_for_sse2.c \
+  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
+  algo/lyra2/lyra2rev2-gate.c \
  algo/lyra2/lyra2rev2.c \
+  algo/lyra2/lyra2rev2-4way.c \
  algo/lyra2/lyra2re.c \
  algo/lyra2/lyra2z-gate.c \
  algo/lyra2/lyra2z.c \
  algo/lyra2/lyra2z-4way.c \
  algo/lyra2/lyra2z330.c \
+  algo/lyra2/lyra2h-gate.c \
  algo/lyra2/lyra2h.c \
+  algo/lyra2/lyra2h-4way.c \
  algo/m7m.c \
-  algo/neoscrypt.c \
+  algo/neoscrypt/neoscrypt.c \
  algo/nist5/nist5-gate.c \
  algo/nist5/nist5-4way.c \
  algo/nist5/nist5.c \
+  algo/nist5/zr5.c \
  algo/pluck.c \
-  algo/polytimos/polytimos-gate.c \
-  algo/polytimos/polytimos.c \
+  algo/quark/quark-gate.c \
  algo/quark/quark.c \
+  algo/quark/quark-4way.c \
+  algo/quark/anime-gate.c \
+  algo/quark/anime.c \
+  algo/quark/anime-4way.c \
+  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
+  algo/qubit/qubit-2way.c \
+  algo/qubit/deep-gate.c \
+  algo/qubit/deep-2way.c \
  algo/qubit/deep.c \
  algo/ripemd/sph_ripemd.c \
  algo/scrypt.c \
  algo/scryptjane/scrypt-jane.c \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
+  algo/sha/sha2-hash-4way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t.c \
  algo/shabal/sph_shabal.c \
+  algo/shabal/shabal-hash-4way.c \
  algo/shavite/sph_shavite.c \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite.c \
  algo/simd/sph_simd.c \
-  algo/simd/sse2/nist.c \
-  algo/simd/sse2/vector.c \
+  algo/simd/nist.c \
+  algo/simd/vector.c \
+  algo/simd/simd-hash-2way.c \
  algo/skein/sph_skein.c \
  algo/skein/skein-hash-4way.c \
  algo/skein/skein.c \
@@ -142,15 +159,9 @@ cpuminer_SOURCES = \
  algo/skein/skein2.c \
  algo/skein/skein2-4way.c \
  algo/skein/skein2-gate.c \
-  algo/skunk.c \
  algo/sm3/sm3.c \
+  algo/sm3/sm3-hash-4way.c \
  algo/tiger/sph_tiger.c \
-  algo/timetravel.c \
-  algo/timetravel10.c \
-  algo/tribus/tribus-gate.c \
-  algo/tribus/tribus.c \
-  algo/tribus/tribus-4way.c \
-  algo/veltor.c \
  algo/whirlpool/sph_whirlpool.c \
  algo/whirlpool/whirlpool-hash-4way.c \
  algo/whirlpool/whirlpool-gate.c \
@@ -159,21 +170,65 @@ cpuminer_SOURCES = \
  algo/whirlpool/whirlpoolx.c \
  algo/x11/x11-gate.c \
  algo/x11/x11.c \
-  algo/x11/x11evo.c \
+  algo/x11/x11-4way.c \
+  algo/x11/x11gost-gate.c \
  algo/x11/x11gost.c \
+  algo/x11/x11gost-4way.c \
+  algo/x11/c11-gate.c \
  algo/x11/c11.c \
-  algo/x11/phi1612.c \
+  algo/x11/c11-4way.c \
+  algo/x11/tribus-gate.c \
+  algo/x11/tribus.c \
+  algo/x11/tribus-4way.c \
+  algo/x11/timetravel-gate.c \
+  algo/x11/timetravel.c \
+  algo/x11/timetravel-4way.c \
+  algo/x11/timetravel10-gate.c \
+  algo/x11/timetravel10.c \
+  algo/x11/timetravel10-4way.c \
+  algo/x11/fresh.c \
+  algo/x11/x11evo.c \
+  algo/x11/x11evo-4way.c \
+  algo/x11/x11evo-gate.c \
+  algo/x13/x13-gate.c \
  algo/x13/x13.c \
+  algo/x13/x13-4way.c \
+  algo/x13/x13sm3-gate.c \
  algo/x13/x13sm3.c \
+  algo/x13/x13sm3-4way.c \
+  algo/x13/phi1612-gate.c \
+  algo/x13/phi1612.c \
+  algo/x13/phi1612-4way.c \
+  algo/x13/skunk-gate.c \
+  algo/x13/skunk-4way.c \
+  algo/x13/skunk.c \
+  algo/x13/drop.c \
+  algo/x14/x14-gate.c \
  algo/x14/x14.c \
+  algo/x14/x14-4way.c \
+  algo/x14/veltor-gate.c \
+  algo/x14/veltor.c \
+  algo/x14/veltor-4way.c \
+  algo/x14/polytimos-gate.c \
+  algo/x14/polytimos.c \
+  algo/x14/polytimos-4way.c \
+  algo/x14/axiom.c \
+  algo/x15/x15-gate.c \
  algo/x15/x15.c \
+  algo/x15/x15-4way.c \
+  algo/x17/x17-gate.c \
  algo/x17/x17.c \
-  algo/xevan.c \
+  algo/x17/x17-4way.c \
+  algo/x17/xevan-gate.c \
+  algo/x17/xevan.c \
+  algo/x17/xevan-4way.c \
+  algo/x17/x16r-gate.c \
+  algo/x17/x16r.c \
+  algo/x17/x16r-4way.c \
+  algo/x17/hmq1725.c \
  algo/yescrypt/yescrypt.c \
-  algo/yescrypt/sha256_Y.c\
-  algo/yescrypt/yescrypt-simd.c\
-  algo/zr5.c
-
+  algo/yescrypt/sha256_Y.c \
+  algo/yescrypt/yescrypt-simd.c

 disable_flags =

--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ See file RELEASE_NOTES for change log and compile instructions.
 Supported Algorithms
 --------------------

+                          anime        Animecoin
                          argon2
                          axiom        Shabal-256 MemoHash
                          bastion
@@ -68,7 +69,7 @@ Supported Algorithms
                          timetravel10 Bitcore
                          tribus       Denarius (DNR)
                          vanilla      blake256r8vnl (VCash)
-                          veltor
+                          veltor       (VLT)
                          whirlpool
                          whirlpoolx
                          x11          Dash
@@ -78,9 +79,11 @@ Supported Algorithms
                          x13sm3       hsr (Hshare)
                          x14          X14
                          x15          X15
+                          x16r         Ravencoin
                          x17
                          xevan        Bitsend
                          yescrypt     Globalboost-Y (BSTY)
+                          yescryptr8   BitZeny (ZNY)
                          yescryptr16  Yenten (YTN)
                          zr5          Ziftr

@@ -96,13 +99,16 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
 Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
 performance.

+ARM CPUs are not supported.
+
 2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
 Centos are known to work and have all dependencies in their repositories.
 Others may work but may require more effort.
 64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

-3. Stratum pool, cpuminer-opt only supports stratum minning. Some algos
-may work wallet mining but there are no guarantees.
+macOS (OS X) is not supported.
+
+3. Stratum pool. Some algos may also work for wallet mining using getwork.

 Errata
 ------
@@ -132,10 +138,13 @@ output from the miner showing the startup and any errors.
 Donations
 ---------

-I do not do this for money but I have a donation address if users
-are so inclined.
+cpuminer-opt has no fees of any kind but donations are accepted.

-bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
+BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
+ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
+LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
+BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
+BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

 Happy mining!

--- a/README.txt
+++ b/README.txt
@@ -17,17 +17,20 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.

-Exe name                  Compile opts         Arch name
+Exe name                Compile flags              Arch name

-cpuminer-sse2.exe         -march=core2         Core2   
-cpuminer-sse42.exe        -march=corei7        Nehalem
-cpuminer-aes-sse42.exe    -maes -msse4.2"      Westmere
-cpuminer-aes-avx.exe      -march=corei7-avx"   Sandybridge, Ivybridge
-cpuminer-aes-avx2.exe     "-march=core-avx2"   Haswell, Broadwell, Skylake, Kabylake
-cpuminer-4way.exe         "-march=core-avx2 -DFOUR_WAY"
+cpuminer-sse2.exe      "-march=core2"              Core2, Nehalem   
+cpuminer-aes-sse42.exe "-maes -msse4.2"            Westmere
+cpuminer-aes-avx.exe   "-march=corei7-avx"         Sandybridge, Ivybridge
+cpuminer-avx2.exe      "-march=core-avx2"          Haswell...
+cpuminer-avx2-sha.exe  "-march=core-avx2 -msha"    Ryzen
+
+If you like this software feel free to donate:
+
+BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
+ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
+LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
+BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
+BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

-4way requires a CPU with AES and AVX2. It is still under development and
-only a few algos are supported. See change log in RELEASE_NOTES in source
-package for supported algos.

-There is no binary support available for SHA on AMD Ryzen CPUs.
--- a/61
+++ b/61
@@ -27,8 +27,9 @@ Compile Instructions

 Requirements:

-Intel Core2 or newer, or AMD Steamroller or newer CPU.
-64 bit Linux or Windows operating system.
+Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
+supported.
+64 bit Linux or Windows operating system. Apple is not supported.

 Building on linux prerequisites:

@@ -91,20 +92,14 @@ SPH may give slightly better performance on algos that use sha256 when using
 openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
 better than SPH.

-DFOUR_WAY
-
-4 way will give much better performance on supported algos with CPUs
-that have AVX2 and should only be used on CPUs with AVX2. 4 way algo
-support will be added incrementally, see change log below for supported algos.
- 
 Start mining.

 ./cpuminer -a algo -o url -u username -p password

 Windows

-The following in how the Windows binary releases are built. It's old and
-not very good but it works, for me anyway.
+Precompiled Windows binaries are built on a Linux host using Mingw
+with a more recent compiler than the following Windows hosted procedure.

 Building on Windows prerequisites:

@@ -136,7 +131,7 @@ or similar Windows program.
 In msys shell cd to miner directory.
 cd /c/path/to/cpuminer-opt

-Run winbuild.sh to build on Windows or execute the following commands.
+Run build.sh to build on Windows or execute the following commands.

 ./autogen.sh
 CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
@@ -148,9 +143,9 @@ cpuminer.exe -a algo -o url -u user -p password

 The following tips may be useful for older AMD CPUs.

-AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
-supported by cpuminer-opt due to an incompatible implementation of SSE2 on
-these CPUs. Some algos may crash the miner with an invalid instruction.
+AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are
+not supported by cpuminer-opt due to an incompatible implementation of SSE2
+on these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.

 Some users with AMD CPUs without AES_NI have reported problems compiling
@@ -164,6 +159,42 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.8.1
+
+Fixes x16r on CPUs with only SSE2.
+More Optimizations for X algos, qubit & deep.
+Corrected algo optimizations for scrypt and yescrypt, no new optimizations.
+
+v3.8.0.1
+
+Fixed x16r AVX2 low hash rate.
+
+v3.8.0
+
+4way is no longer a separate feature, it is included in AVX2.
+Added x16r algo for Ravencoin, anime algo for Animecoin.
+More 4way optimizations for X13 and up.
+Tweaked CPU affinity to better support more than 64 CPUs.
+Fixed compile problem on some old AMD CPUs.
+
+v3.7.10
+
+4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10,
+   x11evo, blakecoin.
+Faster x13sm3 (hsr).
+Added share difficulty to accepted message.
+
+v3.7.9
+
+Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
+Additional 4way optimizations for X algos.
+New algo yescryptr8 for BitZeny, not to be confused with original
+yescrypt Globalboost-Y.
+
+v3.7.8
+
+Partial 4way optimization for most X algos including c11, xevan, phi, hsr.
+
 v3.7.7

 Fixed regression caused by 64 CPU support.
@@ -182,7 +213,7 @@ New algo keccakc for Creative coin with 4way optimizations
 Rewrote some AVX/AVX2 code for more consistent implementation and some
 optimizing.

-Enhanced capabilities check to support 4way, mor eprecise reporting of
+Enhanced capabilities check to support 4way, more precise reporting of
 features (not all algos use SSE2), and better error messages when using
 an incompatible pre-built version (Windows users).

--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -16,7 +16,7 @@
 #include <memory.h>
 #include <unistd.h>
 #include <openssl/sha.h>
-#include "miner.h"
+//#include "miner.h"
 #include "algo-gate-api.h"

 // Define null and standard functions.
@@ -155,6 +155,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   switch (algo)
   {
+     case ALGO_ANIME:        register_anime_algo       ( gate ); break;
     case ALGO_ARGON2:       register_argon2_algo      ( gate ); break;
     case ALGO_AXIOM:        register_axiom_algo       ( gate ); break;
     case ALGO_BASTION:      register_bastion_algo     ( gate ); break;
@@ -211,14 +212,16 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_WHIRLPOOLX:   register_whirlpoolx_algo  ( gate ); break;
     case ALGO_X11:          register_x11_algo         ( gate ); break;
     case ALGO_X11EVO:       register_x11evo_algo      ( gate ); break;
-     case ALGO_X11GOST:      register_sib_algo         ( gate ); break;
+     case ALGO_X11GOST:      register_x11gost_algo     ( gate ); break;
     case ALGO_X13:          register_x13_algo         ( gate ); break;
     case ALGO_X13SM3:       register_x13sm3_algo      ( gate ); break;
     case ALGO_X14:          register_x14_algo         ( gate ); break;
     case ALGO_X15:          register_x15_algo         ( gate ); break;
+     case ALGO_X16R:         register_x16r_algo        ( gate ); break;
     case ALGO_X17:          register_x17_algo         ( gate ); break;
     case ALGO_XEVAN:        register_xevan_algo       ( gate ); break;
     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
+     case ALGO_YESCRYPTR8:   register_yescryptr8_algo  ( gate ); break;
     case ALGO_YESCRYPTR16:  register_yescryptr16_algo ( gate ); break;
     case ALGO_ZR5:          register_zr5_algo         ( gate ); break;
    default:
@@ -278,6 +281,7 @@ const char* const algo_alias_map[][2] =
 {
 //   alias                proper
  { "bitcore",           "timetravel10" },
+  { "bitzeny",           "yescryptr8"   },
  { "blake256r8",        "blakecoin"    },
  { "blake256r8vnl",     "vanilla"      },
  { "blake256r14",       "blake"        },
@@ -300,10 +304,9 @@ const char* const algo_alias_map[][2] =
 //  { "sia",               "blake2b"      },
  { "sib",               "x11gost"      },
  { "timetravel8",       "timetravel"   },
-  { "yes",               "yescrypt"     },
  { "ziftr",             "zr5"          },
  { "yenten",            "yescryptr16"  },
-  { "yescryptr8",        "yescrypt"     },
+  { "yescryptr8k",       "yescrypt"     },
  { "zcoin",             "lyra2z"       },
  { "zoin",              "lyra2z330"    },
  { NULL,                NULL           }   
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -1,7 +1,6 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <stdint.h>
-
 #include "miner.h"

 /////////////////////////////
@@ -91,7 +90,7 @@ typedef  uint32_t set_t;
 #define AVX_OPT         4
 #define AVX2_OPT        8
 #define SHA_OPT      0x10
-#define FOUR_WAY_OPT 0x20
+//#define FOUR_WAY_OPT 0x20

 // return set containing all elements from sets a & b
 inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -213,7 +212,8 @@ int64_t get_max64_0x3fffffLL();
 int64_t get_max64_0x1ffff();
 int64_t get_max64_0xffffLL();

-void std_set_target   ( struct work *work, double job_diff );
+void std_set_target(    struct work *work, double job_diff );
+void alt_set_target(    struct work* work, double job_diff );
 void scrypt_set_target( struct work *work, double job_diff );

 bool std_le_work_decode( const json_t *val, struct work *work );
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -1,31 +1,22 @@
 #include "blake-gate.h"
-#include "sph_blake.h"
+
+#if defined (BLAKE_4WAY)
+
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>

-#if defined (BLAKE_4WAY)
+blake256r14_4way_context blake_ctx;

 void blakehash_4way(void *state, const void *input)
 {
-     uint32_t vhash[4*4] __attribute__ ((aligned (64)));
-     uint32_t hash0[4] __attribute__ ((aligned (32)));
-     uint32_t hash1[4] __attribute__ ((aligned (32)));
-     uint32_t hash2[4] __attribute__ ((aligned (32)));
-     uint32_t hash3[4] __attribute__ ((aligned (32)));
-     blake256_4way_context ctx;
-
-     blake256_4way_init( &ctx );
-     blake256_4way( &ctx, input, 16 );
-     blake256_4way_close( &ctx, vhash );
-
-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash1, 32 );
-     memcpy( state+96, hash1, 32 );
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256r14_4way_context ctx;
+     memcpy( &ctx, &blake_ctx, sizeof ctx );
+     blake256r14_4way( &ctx, input + (64<<2), 16 );
+     blake256r14_4way_close( &ctx, vhash );
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -36,21 +27,24 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
-//   uint32_t HTarget = ptarget[7];
+   uint32_t HTarget = ptarget[7];
   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;

-//   if (opt_benchmark)
-//      HTarget = 0x7f;
+   if (opt_benchmark)
+      HTarget = 0x7f;

   // we need big endian data...
   swab32_array( edata, pdata, 20 );

   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

+   blake256r14_4way_init( &blake_ctx );
+   blake256r14_4way( &blake_ctx, vdata, 64 );
+
   uint32_t *noncep = vdata + 76;   // 19*4
   do {
      found[0] = found[1] = found[2] = found[3] = false;
@@ -61,45 +55,36 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

      blakehash_4way( hash, vdata );

-      if ( hash[7] == 0 )
+      if (  hash[7] <= HTarget && fulltest( hash, ptarget ) )
      {
-         if ( fulltest( hash, ptarget ) )
-         {
-             found[0] = true;
-             num_found++;
-             nonces[0] = n;
-             pdata[19] = n;
-         }
+          found[0] = true;
+          num_found++;
+          nonces[0] = n;
+          pdata[19] = n;
+          work_set_target_ratio( work, hash );
      }
-      if ( (hash+8)[7] == 0 ) 
+      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
      {
-         if ( fulltest( hash+8, ptarget ) ) 
-         {
-             found[1] = true;
-             num_found++;
-             nonces[1] = n+1;
-         }
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
      }
-      if ( (hash+16)[7] == 0 )
+      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
      {
-          if ( fulltest( hash+8, ptarget ) )
-          {
-              found[2] = true;
-              num_found++;
-              nonces[2] = n+2;
-          }
+           found[2] = true;
+           num_found++;
+           nonces[2] = n+2;
+           work_set_target_ratio( work, hash+16 );
      }
-      if ( (hash+24)[7] == 0 )
+      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
      {
-         if ( fulltest( hash+8, ptarget ) )
-         {
-              found[3] = true;
-              num_found++;
-              nonces[3] = n+3;
-         }
+           found[3] = true;
+           num_found++;
+           nonces[3] = n+3;
+           work_set_target_ratio( work, hash+24 );
      }
-       n += 4;
-      *hashes_done = n - first_nonce + 1;
+      n += 4;

   } while ( (num_found == 0) && (n < max_nonce) 
             && !work_restart[thr_id].restart );
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -7,6 +7,7 @@ int64_t blake_get_max64 ()

 bool register_blake_algo( algo_gate_t* gate )
 {
+  gate->optimizations = AVX2_OPT;
  gate->get_max64 = (void*)&blake_get_max64;
 //#if defined (__AVX2__) && defined (FOUR_WAY)
 //   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
@@ -14,7 +15,6 @@ bool register_blake_algo( algo_gate_t* gate )
 //  gate->hash      = (void*)&blakehash_8way;
 #if defined(BLAKE_4WAY)
  four_way_not_tested();
-  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_blake_4way;
  gate->hash      = (void*)&blakehash_4way;
 #else
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__AVX2__)
  #define BLAKE_4WAY
 #endif

--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
@@ -36,7 +36,6 @@
 #include <string.h>
 #include <limits.h>

-//#include "sph_blake.h"
 #include "blake-hash-4way.h"

 #ifdef __cplusplus
@@ -79,6 +78,8 @@ static const sph_u64 IV512[8] = {

 #if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64

+// Blake-256 4 & 8 way, Blake-512 4way
+
 static const unsigned sigma[16][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
 	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
@@ -98,18 +99,6 @@ static const unsigned sigma[16][16] = {
 	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
 };

-/*
-  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
- 14 10  4  8  9 15 13  6  1 12  0  2 11  7  5  3
- 11  8 12  0  5  2 15 13 10 14  3  6  7  1  9  4
-  7  9  3  1 13 12 11 14  2  6  5 10  4  0 15  8
-  9  0  5  7  2  4 10 15 14  1 11 12  6  8  3 13
-  2 12  6 10  0 11  8  3  4 13  7  5 15 14  1  9
- 12  5  1 15 14 13  4 10  0  7  6  3  9  2  8 11
- 13 11  7 14 12  1  3  9  5  0 15  4  8  6  2 10
-  6 15 14  9 11  3  0  8 12  2 13  7  1  4 10  5
- 10  2  8  4  7  6  1  5 15 11  9 14  3 12 13  0
-*/
 #endif

 #define Z00   0
@@ -286,6 +275,8 @@ static const unsigned sigma[16][16] = {
 #define Mx_(n)      Mx__(n)
 #define Mx__(n)     M ## n

+// Blake-256 4 & 8 way
+
 #define CSx(r, i)   CSx_(Z ## r ## i)
 #define CSx_(n)     CSx__(n)
 #define CSx__(n)    CS ## n
@@ -324,6 +315,8 @@ static const sph_u32 CS[16] = {

 #if defined(__AVX2__)

+// Blake-512 4 way
+
 #define CBx(r, i)   CBx_(Z ## r ## i)
 #define CBx_(n)     CBx__(n)
 #define CBx__(n)    CB ## n
@@ -414,6 +407,35 @@ do { \

 #if defined (__AVX2__)

+// BLAKE256 8 WAY
+
+#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \
+do { \
+   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
+                 _mm256_set1_epi32( c1 ), m0 ), b ), a ); \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \
+                 _mm256_set1_epi32( c0 ), m1 ), b ), a ); \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ), 8 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ), 7 ); \
+} while (0)
+
+#define ROUND_S_8WAY(r)   do { \
+        GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
+        GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
+        GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
+        GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
+        GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
+        GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
+        GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
+        GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
+} while (0)
+
+// Blake-512 4 way
+
 #define GB_4WAY(m0, m1, c0, c1, a, b, c, d)   do { \
   a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \
                 _mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \
@@ -504,14 +526,9 @@ do { \
 		(state)->T1 = T1; \
 	} while (0)

-//#define BLAKE32_ROUNDS 8
-#ifndef BLAKE32_ROUNDS
-#define BLAKE32_ROUNDS 14
-#endif
-
 #if SPH_COMPACT_BLAKE_32

-#define COMPRESS32_4WAY   do { \
+#define COMPRESS32_4WAY( rounds )   do { \
 	__m128i M[16]; \
 	__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
 	__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
@@ -536,23 +553,23 @@ do { \
                          , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M[0x0] = mm_byteswap_32( *(buf +  0) ); \
-	M[0x1] = mm_byteswap_32( *(buf +  1) ); \
-	M[0x2] = mm_byteswap_32( *(buf +  2) ); \
-	M[0x3] = mm_byteswap_32( *(buf +  3) ); \
-	M[0x4] = mm_byteswap_32( *(buf +  4) ); \
-	M[0x5] = mm_byteswap_32( *(buf +  5) ); \
-	M[0x6] = mm_byteswap_32( *(buf +  6) ); \
-	M[0x7] = mm_byteswap_32( *(buf +  7) ); \
-	M[0x8] = mm_byteswap_32( *(buf +  8) ); \
-	M[0x9] = mm_byteswap_32( *(buf +  9) ); \
-	M[0xA] = mm_byteswap_32( *(buf + 10) ); \
-	M[0xB] = mm_byteswap_32( *(buf + 11) ); \
-	M[0xC] = mm_byteswap_32( *(buf + 12) ); \
-	M[0xD] = mm_byteswap_32( *(buf + 13) ); \
-	M[0xE] = mm_byteswap_32( *(buf + 14) ); \
-	M[0xF] = mm_byteswap_32( *(buf + 15) ); \
-	for (r = 0; r < BLAKE32_ROUNDS; r ++) \
+	M[0x0] = mm_bswap_32( *(buf +  0) ); \
+	M[0x1] = mm_bswap_32( *(buf +  1) ); \
+	M[0x2] = mm_bswap_32( *(buf +  2) ); \
+	M[0x3] = mm_bswap_32( *(buf +  3) ); \
+	M[0x4] = mm_bswap_32( *(buf +  4) ); \
+	M[0x5] = mm_bswap_32( *(buf +  5) ); \
+	M[0x6] = mm_bswap_32( *(buf +  6) ); \
+	M[0x7] = mm_bswap_32( *(buf +  7) ); \
+	M[0x8] = mm_bswap_32( *(buf +  8) ); \
+	M[0x9] = mm_bswap_32( *(buf +  9) ); \
+	M[0xA] = mm_bswap_32( *(buf + 10) ); \
+	M[0xB] = mm_bswap_32( *(buf + 11) ); \
+	M[0xC] = mm_bswap_32( *(buf + 12) ); \
+	M[0xD] = mm_bswap_32( *(buf + 13) ); \
+	M[0xE] = mm_bswap_32( *(buf + 14) ); \
+	M[0xF] = mm_bswap_32( *(buf + 15) ); \
+	for (r = 0; r < rounds; r ++) \
 		ROUND_S_4WAY(r); \
        H0 = _mm_xor_si128( _mm_xor_si128( \
                                   _mm_xor_si128( S0, V0 ), V8 ), H0 ); \
@@ -576,85 +593,194 @@ do { \

 // current impl

-#define COMPRESS32_4WAY   do { \
-	__m128i M0, M1, M2, M3, M4, M5, M6, M7; \
-	__m128i M8, M9, MA, MB, MC, MD, ME, MF; \
-	__m128i V0, V1, V2, V3, V4, V5, V6, V7; \
-	__m128i V8, V9, VA, VB, VC, VD, VE, VF; \
-	V0 = H0; \
-	V1 = H1; \
-	V2 = H2; \
-	V3 = H3; \
-	V4 = H4; \
-	V5 = H5; \
-	V6 = H6; \
-	V7 = H7; \
-        V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
-        V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
-        VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
-        VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
-        VC = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                            _mm_set_epi32( CS4, CS4, CS4, CS4 ) ); \
-        VD = _mm_xor_si128( _mm_set_epi32( T0, T0, T0, T0 ), \
-                            _mm_set_epi32( CS5, CS5, CS5, CS5 ) ); \
-        VE = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                            _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
-        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
-                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M0 = mm_byteswap_32( * buf ); \
-	M1 = mm_byteswap_32( *(buf+1) ); \
-	M2 = mm_byteswap_32( *(buf+2) ); \
-	M3 = mm_byteswap_32( *(buf+3) ); \
-	M4 = mm_byteswap_32( *(buf+4) ); \
-	M5 = mm_byteswap_32( *(buf+5) ); \
-	M6 = mm_byteswap_32( *(buf+6) ); \
-	M7 = mm_byteswap_32( *(buf+7) ); \
-	M8 = mm_byteswap_32( *(buf+8) ); \
-	M9 = mm_byteswap_32( *(buf+9) ); \
-	MA = mm_byteswap_32( *(buf+10) ); \
-	MB = mm_byteswap_32( *(buf+11) ); \
-	MC = mm_byteswap_32( *(buf+12) ); \
-	MD = mm_byteswap_32( *(buf+13) ); \
-	ME = mm_byteswap_32( *(buf+14) ); \
-	MF = mm_byteswap_32( *(buf+15) ); \
-	ROUND_S_4WAY(0); \
-	ROUND_S_4WAY(1); \
-	ROUND_S_4WAY(2); \
-	ROUND_S_4WAY(3); \
-	ROUND_S_4WAY(4); \
-	ROUND_S_4WAY(5); \
-	ROUND_S_4WAY(6); \
-	ROUND_S_4WAY(7); \
-	if (BLAKE32_ROUNDS == 14) { \
-	ROUND_S_4WAY(8); \
-	ROUND_S_4WAY(9); \
-	ROUND_S_4WAY(0); \
-	ROUND_S_4WAY(1); \
-	ROUND_S_4WAY(2); \
-	ROUND_S_4WAY(3); \
-	} \
-        H0 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
-        H1 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
-        H2 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
-        H3 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
-        H4 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
-        H5 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
-        H6 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
-        H7 = _mm_xor_si128( _mm_xor_si128( \
-                                _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
-	} while (0)
+// One Blake-256 compression step on four interleaved lanes (unrolled form).
+// Expands in the caller's scope and expects these names to exist there:
+//   H0..H7  chaining value vectors (updated in place)
+//   S0..S3  salt vectors
+//   T0, T1  scalar bit counters, broadcast to all lanes
+//   buf     pointer to 16 __m128i message words; each is byte swapped with
+//           mm_bswap_32 as it is loaded into M0..MF
+// V0..V7 start as H0..H7; V8..VB are the salt xor CS0..CS3; VC..VF are the
+// counters xor CS4..CS7. Rounds 0..7 always run; when rounds == 14 six more
+// are run with round indexes 8, 9, 0, 1, 2, 3. Any other value of rounds
+// therefore gives 8 rounds. Finally Hi ^= V(i) ^ V(i+8) ^ S(i mod 4).
+#define COMPRESS32_4WAY( rounds ) \
+do { \
+   __m128i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m128i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m128i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m128i V8, V9, VA, VB, VC, VD, VE, VF; \
+   V0 = H0; \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = _mm_xor_si128( S0, _mm_set_epi32( CS0, CS0, CS0, CS0 ) ); \
+   V9 = _mm_xor_si128( S1, _mm_set_epi32( CS1, CS1, CS1, CS1 ) ); \
+   VA = _mm_xor_si128( S2, _mm_set_epi32( CS2, CS2, CS2, CS2 ) ); \
+   VB = _mm_xor_si128( S3, _mm_set_epi32( CS3, CS3, CS3, CS3 ) ); \
+   VC = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS4 ) ); \
+   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
+   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
+   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
+   M0 = mm_bswap_32( * buf ); \
+   M1 = mm_bswap_32( *(buf+1) ); \
+   M2 = mm_bswap_32( *(buf+2) ); \
+   M3 = mm_bswap_32( *(buf+3) ); \
+   M4 = mm_bswap_32( *(buf+4) ); \
+   M5 = mm_bswap_32( *(buf+5) ); \
+   M6 = mm_bswap_32( *(buf+6) ); \
+   M7 = mm_bswap_32( *(buf+7) ); \
+   M8 = mm_bswap_32( *(buf+8) ); \
+   M9 = mm_bswap_32( *(buf+9) ); \
+   MA = mm_bswap_32( *(buf+10) ); \
+   MB = mm_bswap_32( *(buf+11) ); \
+   MC = mm_bswap_32( *(buf+12) ); \
+   MD = mm_bswap_32( *(buf+13) ); \
+   ME = mm_bswap_32( *(buf+14) ); \
+   MF = mm_bswap_32( *(buf+15) ); \
+   ROUND_S_4WAY(0); \
+   ROUND_S_4WAY(1); \
+   ROUND_S_4WAY(2); \
+   ROUND_S_4WAY(3); \
+   ROUND_S_4WAY(4); \
+   ROUND_S_4WAY(5); \
+   ROUND_S_4WAY(6); \
+   ROUND_S_4WAY(7); \
+   if (rounds == 14) \
+   { \
+      ROUND_S_4WAY(8); \
+      ROUND_S_4WAY(9); \
+      ROUND_S_4WAY(0); \
+      ROUND_S_4WAY(1); \
+      ROUND_S_4WAY(2); \
+      ROUND_S_4WAY(3); \
+   } \
+   H0 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V8, V0 ), S0 ), H0 ); \
+   H1 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( V9, V1 ), S1 ), H1 ); \
+   H2 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VA, V2 ), S2 ), H2 ); \
+   H3 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VB, V3 ), S3 ), H3 ); \
+   H4 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VC, V4 ), S0 ), H4 ); \
+   H5 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VD, V5 ), S1 ), H5 ); \
+   H6 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VE, V6 ), S2 ), H6 ); \
+   H7 = _mm_xor_si128( _mm_xor_si128( _mm_xor_si128( VF, V7 ), S3 ), H7 ); \
+} while (0)

 #endif

 #if defined (__AVX2__)

+// Blake-256 8 way
+
+// Declares the local working state used by the 8 way Blake-256 macros:
+// eight __m256i chaining vectors H0..H7, four __m256i salt vectors S0..S3
+// and the scalar counters T0, T1. The expansion already ends with a
+// semicolon, so it is written without one at the point of use.
+#define DECL_STATE32_8WAY \
+   __m256i H0, H1, H2, H3, H4, H5, H6, H7; \
+   __m256i S0, S1, S2, S3; \
+   sph_u32 T0, T1;
+
+// Loads the locals declared by DECL_STATE32_8WAY from a context pointer:
+// H[0..7] -> H0..H7, S[0..3] -> S0..S3, T0/T1 -> T0/T1.
+// The argument is evaluated once per field, so pass a side-effect free
+// expression.
+#define READ_STATE32_8WAY(state) \
+do { \
+   H0 = (state)->H[0]; \
+   H1 = (state)->H[1]; \
+   H2 = (state)->H[2]; \
+   H3 = (state)->H[3]; \
+   H4 = (state)->H[4]; \
+   H5 = (state)->H[5]; \
+   H6 = (state)->H[6]; \
+   H7 = (state)->H[7]; \
+   S0 = (state)->S[0]; \
+   S1 = (state)->S[1]; \
+   S2 = (state)->S[2]; \
+   S3 = (state)->S[3]; \
+   T0 = (state)->T0; \
+   T1 = (state)->T1; \
+} while (0)
+
+// Stores the locals H0..H7, S0..S3, T0 and T1 back into the context
+// pointed to by the argument (the inverse of READ_STATE32_8WAY).
+// The argument is evaluated once per field, so pass a side-effect free
+// expression.
+#define WRITE_STATE32_8WAY(state) \
+do { \
+   (state)->H[0] = H0; \
+   (state)->H[1] = H1; \
+   (state)->H[2] = H2; \
+   (state)->H[3] = H3; \
+   (state)->H[4] = H4; \
+   (state)->H[5] = H5; \
+   (state)->H[6] = H6; \
+   (state)->H[7] = H7; \
+   (state)->S[0] = S0; \
+   (state)->S[1] = S1; \
+   (state)->S[2] = S2; \
+   (state)->S[3] = S3; \
+   (state)->T0 = T0; \
+   (state)->T1 = T1; \
+} while (0)
+
+// One Blake-256 compression step on eight interleaved lanes.
+// Expands in the caller's scope and expects these names to exist there:
+//   H0..H7  chaining value vectors (updated in place)
+//   S0..S3  salt vectors
+//   T0, T1  scalar bit counters, broadcast to all lanes
+//   buf     pointer to 16 __m256i message words; each is byte swapped with
+//           mm256_bswap_32 as it is loaded into M0..MF
+// V0..V7 start as H0..H7; V8..VB are the salt xor CS0..CS3; VC..VF are the
+// counters xor CS4..CS7. Rounds 0..7 always run; when rounds == 14 six more
+// are run with round indexes 8, 9, 0, 1, 2, 3. Any other value of rounds
+// therefore gives 8 rounds. Finally Hi ^= V(i) ^ V(i+8) ^ S(i mod 4).
+#define COMPRESS32_8WAY( rounds ) \
+do { \
+   __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
+   __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
+   __m256i V0, V1, V2, V3, V4, V5, V6, V7; \
+   __m256i V8, V9, VA, VB, VC, VD, VE, VF; \
+   V0 = H0; \
+   V1 = H1; \
+   V2 = H2; \
+   V3 = H3; \
+   V4 = H4; \
+   V5 = H5; \
+   V6 = H6; \
+   V7 = H7; \
+   V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \
+   V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \
+   VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \
+   VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \
+   VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \
+   VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \
+   VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \
+   VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \
+   M0 = mm256_bswap_32( * buf ); \
+   M1 = mm256_bswap_32( *(buf+1) ); \
+   M2 = mm256_bswap_32( *(buf+2) ); \
+   M3 = mm256_bswap_32( *(buf+3) ); \
+   M4 = mm256_bswap_32( *(buf+4) ); \
+   M5 = mm256_bswap_32( *(buf+5) ); \
+   M6 = mm256_bswap_32( *(buf+6) ); \
+   M7 = mm256_bswap_32( *(buf+7) ); \
+   M8 = mm256_bswap_32( *(buf+8) ); \
+   M9 = mm256_bswap_32( *(buf+9) ); \
+   MA = mm256_bswap_32( *(buf+10) ); \
+   MB = mm256_bswap_32( *(buf+11) ); \
+   MC = mm256_bswap_32( *(buf+12) ); \
+   MD = mm256_bswap_32( *(buf+13) ); \
+   ME = mm256_bswap_32( *(buf+14) ); \
+   MF = mm256_bswap_32( *(buf+15) ); \
+   ROUND_S_8WAY(0); \
+   ROUND_S_8WAY(1); \
+   ROUND_S_8WAY(2); \
+   ROUND_S_8WAY(3); \
+   ROUND_S_8WAY(4); \
+   ROUND_S_8WAY(5); \
+   ROUND_S_8WAY(6); \
+   ROUND_S_8WAY(7); \
+   if (rounds == 14) \
+   { \
+      ROUND_S_8WAY(8); \
+      ROUND_S_8WAY(9); \
+      ROUND_S_8WAY(0); \
+      ROUND_S_8WAY(1); \
+      ROUND_S_8WAY(2); \
+      ROUND_S_8WAY(3); \
+   } \
+   H0 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), \
+                                                              S0 ), H0 ); \
+   H1 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), \
+                                                              S1 ), H1 ); \
+   H2 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), \
+                                                              S2 ), H2 ); \
+   H3 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), \
+                                                              S3 ), H3 ); \
+   H4 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), \
+                                                              S0 ), H4 ); \
+   H5 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), \
+                                                              S1 ), H5 ); \
+   H6 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), \
+                                                              S2 ), H6 ); \
+   H7 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), \
+                                                              S3 ), H7 ); \
+} while (0)
+
+
+// Blake-512 4 way
+
 #define DECL_STATE64_4WAY \
 	__m256i H0, H1, H2, H3, H4, H5, H6, H7; \
        __m256i S0, S1, S2, S3; \
@@ -722,22 +848,22 @@ do { \
                               _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \
        VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
                               _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \
-	M[0x0] = mm256_byteswap_64( *(buf+0) ); \
-	M[0x1] = mm256_byteswap_64( *(buf+1) ); \
-	M[0x2] = mm256_byteswap_64( *(buf+2) ); \
-	M[0x3] = mm256_byteswap_64( *(buf+3) ); \
-	M[0x4] = mm256_byteswap_64( *(buf+4) ); \
-	M[0x5] = mm256_byteswap_64( *(buf+5) ); \
-	M[0x6] = mm256_byteswap_64( *(buf+6) ); \
-	M[0x7] = mm256_byteswap_64( *(buf+7) ); \
-	M[0x8] = mm256_byteswap_64( *(buf+8) ); \
-	M[0x9] = mm256_byteswap_64( *(buf+9) ); \
-	M[0xA] = mm256_byteswap_64( *(buf+10) ); \
-	M[0xB] = mm256_byteswap_64( *(buf+11) ); \
-	M[0xC] = mm256_byteswap_64( *(buf+12) ); \
-	M[0xD] = mm256_byteswap_64( *(buf+13) ); \
-	M[0xE] = mm256_byteswap_64( *(buf+14) ); \
-	M[0xF] = mm256_byteswap_64( *(buf+15) ); \
+	M[0x0] = mm256_bswap_64( *(buf+0) ); \
+	M[0x1] = mm256_bswap_64( *(buf+1) ); \
+	M[0x2] = mm256_bswap_64( *(buf+2) ); \
+	M[0x3] = mm256_bswap_64( *(buf+3) ); \
+	M[0x4] = mm256_bswap_64( *(buf+4) ); \
+	M[0x5] = mm256_bswap_64( *(buf+5) ); \
+	M[0x6] = mm256_bswap_64( *(buf+6) ); \
+	M[0x7] = mm256_bswap_64( *(buf+7) ); \
+	M[0x8] = mm256_bswap_64( *(buf+8) ); \
+	M[0x9] = mm256_bswap_64( *(buf+9) ); \
+	M[0xA] = mm256_bswap_64( *(buf+10) ); \
+	M[0xB] = mm256_bswap_64( *(buf+11) ); \
+	M[0xC] = mm256_bswap_64( *(buf+12) ); \
+	M[0xD] = mm256_bswap_64( *(buf+13) ); \
+	M[0xE] = mm256_bswap_64( *(buf+14) ); \
+	M[0xF] = mm256_bswap_64( *(buf+15) ); \
 	for (r = 0; r < 16; r ++) \
 		ROUND_B_4WAY(r); \
        H0 = _mm256_xor_si256( _mm256_xor_si256( \
@@ -787,22 +913,22 @@ do { \
                            _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) );  \
     VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \
                            _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) );  \
-     M0 = mm256_byteswap_64( *(buf + 0) ); \
-     M1 = mm256_byteswap_64( *(buf + 1) ); \
-     M2 = mm256_byteswap_64( *(buf + 2) ); \
-     M3 = mm256_byteswap_64( *(buf + 3) ); \
-     M4 = mm256_byteswap_64( *(buf + 4) ); \
-     M5 = mm256_byteswap_64( *(buf + 5) ); \
-     M6 = mm256_byteswap_64( *(buf + 6) ); \
-     M7 = mm256_byteswap_64( *(buf + 7) ); \
-     M8 = mm256_byteswap_64( *(buf + 8) ); \
-     M9 = mm256_byteswap_64( *(buf + 9) ); \
-     MA = mm256_byteswap_64( *(buf + 10) ); \
-     MB = mm256_byteswap_64( *(buf + 11) ); \
-     MC = mm256_byteswap_64( *(buf + 12) ); \
-     MD = mm256_byteswap_64( *(buf + 13) ); \
-     ME = mm256_byteswap_64( *(buf + 14) ); \
-     MF = mm256_byteswap_64( *(buf + 15) ); \
+     M0 = mm256_bswap_64( *(buf + 0) ); \
+     M1 = mm256_bswap_64( *(buf + 1) ); \
+     M2 = mm256_bswap_64( *(buf + 2) ); \
+     M3 = mm256_bswap_64( *(buf + 3) ); \
+     M4 = mm256_bswap_64( *(buf + 4) ); \
+     M5 = mm256_bswap_64( *(buf + 5) ); \
+     M6 = mm256_bswap_64( *(buf + 6) ); \
+     M7 = mm256_bswap_64( *(buf + 7) ); \
+     M8 = mm256_bswap_64( *(buf + 8) ); \
+     M9 = mm256_bswap_64( *(buf + 9) ); \
+     MA = mm256_bswap_64( *(buf + 10) ); \
+     MB = mm256_bswap_64( *(buf + 11) ); \
+     MC = mm256_bswap_64( *(buf + 12) ); \
+     MD = mm256_bswap_64( *(buf + 13) ); \
+     ME = mm256_bswap_64( *(buf + 14) ); \
+     MF = mm256_bswap_64( *(buf + 15) ); \
     ROUND_B_4WAY(0); \
     ROUND_B_4WAY(1); \
     ROUND_B_4WAY(2); \
@@ -841,19 +967,20 @@ do { \

 #endif

-static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
+static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 };

 static void
 blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv,
-                   const sph_u32 *salt)
+                   const sph_u32 *salt, int rounds )
 {
-        int i;
-        for ( i = 0; i < 8; i++ )
-           sc->H[i] = _mm_set_epi32( iv[i], iv[i], iv[i], iv[i] );
-        for ( i = 0; i < 4; i++ )
-           sc->S[i] = _mm_set_epi32( salt[i], salt[i], salt[i], salt[i] );
-	sc->T0 = sc->T1 = 0;
-	sc->ptr = 0;
+   int i;
+   for ( i = 0; i < 8; i++ )
+      sc->H[i] = _mm_set1_epi32( iv[i] );
+   for ( i = 0; i < 4; i++ )
+      sc->S[i] = _mm_set1_epi32( salt[i] );
+   sc->T0 = sc->T1 = 0;
+   sc->ptr = 0;
+   sc->rounds = rounds;
 }

 static void
@@ -891,7 +1018,7 @@ blake32_4way( blake_4way_small_context *sc, const void *data, size_t len )
           {
 		if ( ( T0 = SPH_T32(T0 + 512) ) < 512 )
 			T1 = SPH_T32(T1 + 1);
-		COMPRESS32_4WAY;
+                COMPRESS32_4WAY( sc->rounds );
 		ptr = 0;
 	   }
 	}
@@ -914,61 +1041,176 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,

   ptr = sc->ptr;
   bit_len = ((unsigned)ptr << 3);
-//   unsigned z = 0x80 >> n;
-//   unsigned zz = ((ub & -z) | z) & 0xFF;
-//   u.buf[ptr>>2] = _mm_set_epi32( zz, zz, zz, zz );
   u.buf[ptr>>2] = _mm_set1_epi32( 0x80 );
   tl = sc->T0 + bit_len;
   th = sc->T1;

   if ( ptr == 0 )
   {
-	sc->T0 = SPH_C32(0xFFFFFE00);
-	sc->T1 = SPH_C32(0xFFFFFFFF);
+	sc->T0 = SPH_C32(0xFFFFFE00UL);
+	sc->T1 = SPH_C32(0xFFFFFFFFUL);
   }
   else if ( sc->T0 == 0 )
   {
-	sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
+	sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
 	sc->T1 = SPH_T32(sc->T1 - 1);
   } 
   else
 	sc->T0 -= 512 - bit_len;

-//   if ( ptr <= 48 )
   if ( ptr <= 52 )
   {
       memset_zero_128( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
-//       memset_zero_128( u.buf + (ptr>>2) + 1, (48 - ptr) >> 2 );
       if (out_size_w32 == 8)
           u.buf[52>>2] = _mm_or_si128( u.buf[52>>2],
-                                    _mm_set_epi32( 0x010000000, 0x01000000,
-                                                   0x010000000, 0x01000000 ) );
-       *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
-       *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
+                                        _mm_set1_epi32( 0x01000000UL ) );
+       *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
+       *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
       blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
   }
   else
   {
 	memset_zero_128( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
 	blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr );
-	sc->T0 = SPH_C32(0xFFFFFE00);
-	sc->T1 = SPH_C32(0xFFFFFFFF);
+	sc->T0 = SPH_C32(0xFFFFFE00UL);
+	sc->T1 = SPH_C32(0xFFFFFFFFUL);
 	memset_zero_128( u.buf, 56>>2 );
       if (out_size_w32 == 8)
-           u.buf[52>>2] = _mm_set_epi32( 0x010000000, 0x01000000,
-                                         0x010000000, 0x01000000 );
-        *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set_epi32( th, th, th, th ) );
-        *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set_epi32( tl, tl, tl, tl ) );
+           u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
+        *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
+        *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
 	blake32_4way( sc, u.buf, 64 );
   }
   out = (__m128i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
-        out[k] = mm_byteswap_32( sc->H[k] );
-//        out[k] =  sc->H[k];
+        out[k] = mm_bswap_32( sc->H[k] );
 }

 #if defined (__AVX2__)

+// Blake-256 8 way
+
+// All-zero salt passed to blake32_8way_init by the public 8 way init
+// functions. blake32_8way_init reads only salt[0..3]; the array holds 8
+// entries.
+static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+// Prepare an 8 way Blake-256 context.
+//   sc      context to initialize
+//   iv      8 initial chaining words, each broadcast to all eight lanes
+//   salt    4 salt words, each broadcast to all eight lanes
+//   rounds  round count stored in the context and later handed to
+//           COMPRESS32_8WAY by blake32_8way
+// The bit counters and the buffer fill position are reset to zero.
+static void
+blake32_8way_init( blake_8way_small_context *sc, const sph_u32 *iv,
+                   const sph_u32 *salt, int rounds )
+{
+   int w;
+
+   sc->rounds = rounds;
+   sc->ptr    = 0;
+   sc->T1     = 0;
+   sc->T0     = 0;
+
+   for ( w = 0; w < 4; w++ )
+      sc->S[w] = _mm256_set1_epi32( salt[w] );
+   for ( w = 0; w < 8; w++ )
+      sc->H[w] = _mm256_set1_epi32( iv[w] );
+}
+
+// Absorb message data into an 8 way Blake-256 context.
+//   sc    context; sc->buf holds up to 64 bytes per lane as 16 __m256i words
+//   data  interleaved input, read as an array of __m256i (one 32 bit word
+//         from each of the eight lanes per vector)
+//   len   byte count per lane; lengths and offsets are shifted right by 2
+//         to index vectors, so it is expected to be a multiple of 4
+// Every time 64 bytes per lane have been buffered the bit counter T0 is
+// advanced by 512 (carrying into T1 on wrap) and the block is compressed
+// with sc->rounds rounds.
+static void
+blake32_8way( blake_8way_small_context *sc, const void *data, size_t len )
+{
+   const size_t block_len = 64;   // bytes per lane in one full block
+   __m256i *src = (__m256i*)data;
+   __m256i *buf = sc->buf;        // this name is referenced by COMPRESS32_8WAY
+   size_t ptr = sc->ptr;
+   DECL_STATE32_8WAY
+
+   // Not enough to complete a block: just buffer it, state is untouched.
+   if ( len < block_len - ptr )
+   {
+      memcpy_256( buf + (ptr>>2), src, len>>2 );
+      sc->ptr = ptr + len;
+      return;
+   }
+
+   READ_STATE32_8WAY(sc);
+   while ( len > 0 )
+   {
+      const size_t room = block_len - ptr;
+      const size_t take = ( room > len ) ? len : room;
+
+      memcpy_256( buf + (ptr>>2), src, take>>2 );
+      src += take>>2;
+      len -= take;
+      ptr += take;
+      if ( ptr != block_len )
+         continue;
+
+      // Full block: bump the 64 bit counter held in T1:T0, then compress.
+      T0 = SPH_T32( T0 + 512 );
+      if ( T0 < 512 )
+         T1 = SPH_T32( T1 + 1 );
+      COMPRESS32_8WAY( sc->rounds );
+      ptr = 0;
+   }
+   WRITE_STATE32_8WAY(sc);
+   sc->ptr = ptr;
+}
+
+// Pad, finalize and output an 8 way Blake-256 hash.
+//   sc            context holding the chaining state and buffered bytes
+//   ub, n         not referenced in this function
+//   dst           output, written as out_size_w32 __m256i vectors (word k of
+//                 all eight lanes in vector k), byte swapped with
+//                 mm256_bswap_32
+//   out_size_w32  number of 32 bit words of output per lane; the extra
+//                 0x01 marker before the length is only added when it is 8
+// The padding is built in a local block and fed through blake32_8way, which
+// adds 512 to T0 before each compress; sc->T0/sc->T1 are adjusted first to
+// account for that.
+static void
+blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n,
+                    void *dst, size_t out_size_w32 )
+{
+   union {
+        __m256i buf[16];
+        sph_u32 dummy;
+   } u;
+   size_t ptr, k;
+   unsigned bit_len;
+   sph_u32 th, tl;
+   __m256i *out;
+
+   ptr = sc->ptr;
+   bit_len = ((unsigned)ptr << 3);
+   // First padding word; COMPRESS32_8WAY byte swaps it on load, turning
+   // 0x00000080 into 0x80000000.
+   u.buf[ptr>>2] = _mm256_set1_epi32( 0x80 );
+   // Total message length in bits (high word th, low word tl), captured
+   // before the counters are adjusted below.
+   tl = sc->T0 + bit_len;
+   th = sc->T1;
+
+   // Pre-subtract what blake32_8way will add for the padding block.
+   if ( ptr == 0 )
+   {
+        sc->T0 = SPH_C32(0xFFFFFE00UL);
+        sc->T1 = SPH_C32(0xFFFFFFFFUL);
+   }
+   else if ( sc->T0 == 0 )
+   {
+        sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len;
+        sc->T1 = SPH_T32(sc->T1 - 1);
+   }
+   else
+        sc->T0 -= 512 - bit_len;
+
+   if ( ptr <= 52 )
+   {
+       // Padding and length fit in the current block.
+       memset_zero_256( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 );
+       if (out_size_w32 == 8)
+           u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2],
+                                           _mm256_set1_epi32( 0x01000000UL ) );
+       // Length words are stored pre-swapped because the compress macro
+       // byte swaps every word of the block.
+       *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
+       *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
+       blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
+   }
+   else
+   {
+        // No room for the length: finish this block with zeros, then
+        // compress a second block that carries only marker and length.
+        memset_zero_256( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 );
+        blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr );
+        sc->T0 = SPH_C32(0xFFFFFE00UL);
+        sc->T1 = SPH_C32(0xFFFFFFFFUL);
+        memset_zero_256( u.buf, 56>>2 );
+       if (out_size_w32 == 8)
+           u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL );
+        *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) );
+        *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) );
+        blake32_8way( sc, u.buf, 64 );
+   }
+   out = (__m256i*)dst;
+   for ( k = 0; k < out_size_w32; k++ )
+        out[k] = mm256_bswap_32( sc->H[k] );
+}
+
+// Blake-512 4 way
+
 static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 };

 static void
@@ -977,9 +1219,9 @@ blake64_4way_init( blake_4way_big_context *sc, const sph_u64 *iv,
 {
        int i;
        for ( i = 0; i < 8; i++ )
-           sc->H[i] = _mm256_set_epi64x( iv[i], iv[i], iv[i], iv[i] );
+           sc->H[i] = _mm256_set1_epi64x( iv[i] );
        for ( i = 0; i < 4; i++ )
-           sc->S[i] = _mm256_set_epi64x( salt[i], salt[i], salt[i], salt[i] );
+           sc->S[i] = _mm256_set1_epi64x( salt[i] );
        sc->T0 = sc->T1 = 0;
        sc->ptr = 0;
 }
@@ -1051,12 +1293,12 @@ blake64_4way_close( blake_4way_big_context *sc,
   th = sc->T1;
   if (ptr == 0 )
   {
-	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
-	sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
+	sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
   }
   else if ( sc->T0 == 0 )
   {
-	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len;
+	sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL) + bit_len;
 	sc->T1 = SPH_T64(sc->T1 - 1);
   } 
   else
@@ -1068,13 +1310,10 @@ blake64_4way_close( blake_4way_big_context *sc,
       memset_zero_256( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
          u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
-                                    _mm256_set_epi64x( 0x0100000000000000,
-                                                       0x0100000000000000,
-                                                       0x0100000000000000,
-                                                       0x0100000000000000 ) );
-       *(u.buf+(112>>3)) = mm256_byteswap_64(
+                                 _mm256_set1_epi64x( 0x0100000000000000ULL ) );
+       *(u.buf+(112>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(u.buf+(120>>3)) = mm256_byteswap_64(
+       *(u.buf+(120>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );

       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
@@ -1084,33 +1323,32 @@ blake64_4way_close( blake_4way_big_context *sc,
       memset_zero_256( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );

       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
-       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
-       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
+       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00ULL);
+       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFFULL);
       memset_zero_256( u.buf, 112>>3 ); 
       if ( out_size_w64 == 8 )
-           u.buf[104>>3] = _mm256_set_epi64x( 0x0100000000000000,
-                                              0x0100000000000000,
-                                              0x0100000000000000,
-                                              0x0100000000000000 );
-
-       *(u.buf+(112>>3)) = mm256_byteswap_64(
+           u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL );
+       *(u.buf+(112>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( th, th, th, th ) );
-       *(u.buf+(120>>3)) = mm256_byteswap_64(
+       *(u.buf+(120>>3)) = mm256_bswap_64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );

       blake64_4way( sc, u.buf, 128 );
   }
   out = (__m256i*)dst;
   for ( k = 0; k < out_size_w64; k++ )
-       out[k] = mm256_byteswap_64( sc->H[k] );
+       out[k] = mm256_bswap_64( sc->H[k] );
 }

 #endif

+// Blake-256 4 way & 8 way
+
+// default 14 rounds, backward compatibility
 void
 blake256_4way_init(void *cc)
 {
-	blake32_4way_init(cc, IV256, salt_zero_small);
+   blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 );
 }

 void
@@ -1122,15 +1360,110 @@ blake256_4way(void *cc, const void *data, size_t len)
 void
 blake256_4way_close(void *cc, void *dst)
 {
-	blake256_4way_addbits_and_close(cc, 0, 0, dst);
+        blake32_4way_close(cc, 0, 0, dst, 8);
+}
+
+#if defined(__AVX2__)
+void
+blake256_8way_init(void *cc)
+{
+   blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 );
 }

 void
-blake256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+blake256_8way(void *cc, const void *data, size_t len)
 {
-	blake32_4way_close(cc, ub, n, dst, 8);
+        blake32_8way(cc, data, len);
 }

+// Finalize an 8 way Blake-256 context: no extra bits, 8 output words per
+// lane written to dst.
+void blake256_8way_close( void *cc, void *dst )
+{
+   blake_8way_small_context *ctx = (blake_8way_small_context*)cc;
+   blake32_8way_close( ctx, 0, 0, dst, 8 );
+}
+
+#endif
+
+// 14 rounds Blake, Decred
+// Initialize a 4 way Blake-256 context with IV256, the zero salt and an
+// explicit round count of 14.
+void
+blake256r14_4way_init( void *cc )
+{
+   blake_4way_small_context *ctx = (blake_4way_small_context*)cc;
+   blake32_4way_init( ctx, IV256, salt_zero_4way_small, 14 );
+}
+
+// Feed len bytes per lane of interleaved data into a 4 way context.
+void blake256r14_4way( void *cc, const void *data, size_t len )
+{
+   blake_4way_small_context *ctx = (blake_4way_small_context*)cc;
+   blake32_4way( ctx, data, len );
+}
+
+// Finalize a 4 way context: no extra bits, 8 output words per lane
+// written to dst.
+void blake256r14_4way_close( void *cc, void *dst )
+{
+   blake_4way_small_context *ctx = (blake_4way_small_context*)cc;
+   blake32_4way_close( ctx, 0, 0, dst, 8 );
+}
+
+#if defined(__AVX2__)
+
+// Initialize an 8 way Blake-256 context with IV256, the zero salt and an
+// explicit round count of 14.
+void
+blake256r14_8way_init( void *cc )
+{
+   blake_8way_small_context *ctx = (blake_8way_small_context*)cc;
+   blake32_8way_init( ctx, IV256, salt_zero_8way_small, 14 );
+}
+
+// Feed len bytes per lane of interleaved data into an 8 way context.
+void blake256r14_8way( void *cc, const void *data, size_t len )
+{
+   blake_8way_small_context *ctx = (blake_8way_small_context*)cc;
+   blake32_8way( ctx, data, len );
+}
+
+// Finalize an 8 way context: no extra bits, 8 output words per lane
+// written to dst.
+void blake256r14_8way_close( void *cc, void *dst )
+{
+   blake_8way_small_context *ctx = (blake_8way_small_context*)cc;
+   blake32_8way_close( ctx, 0, 0, dst, 8 );
+}
+
+#endif
+
+// 8 rounds Blakecoin, Vanilla
+// Initialize a 4 way Blake-256 context with IV256, the zero salt and a
+// reduced round count of 8.
+void
+blake256r8_4way_init( void *cc )
+{
+   blake_4way_small_context *ctx = (blake_4way_small_context*)cc;
+   blake32_4way_init( ctx, IV256, salt_zero_4way_small, 8 );
+}
+
+// Feed len bytes per lane of interleaved data into a 4 way context.
+void blake256r8_4way( void *cc, const void *data, size_t len )
+{
+   blake_4way_small_context *ctx = (blake_4way_small_context*)cc;
+   blake32_4way( ctx, data, len );
+}
+
+// Finalize a 4 way context: no extra bits, 8 output words per lane
+// written to dst.
+void blake256r8_4way_close( void *cc, void *dst )
+{
+   blake_4way_small_context *ctx = (blake_4way_small_context*)cc;
+   blake32_4way_close( ctx, 0, 0, dst, 8 );
+}
+
+#if defined (__AVX2__)
+
+// Initialize an 8 way Blake-256 context with IV256, the zero salt and a
+// reduced round count of 8.
+void
+blake256r8_8way_init( void *cc )
+{
+   blake_8way_small_context *ctx = (blake_8way_small_context*)cc;
+   blake32_8way_init( ctx, IV256, salt_zero_8way_small, 8 );
+}
+
+// Feed len bytes per lane of interleaved data into an 8 way context.
+void blake256r8_8way( void *cc, const void *data, size_t len )
+{
+   blake_8way_small_context *ctx = (blake_8way_small_context*)cc;
+   blake32_8way( ctx, data, len );
+}
+
+// Finalize an 8 way context: no extra bits, 8 output words per lane
+// written to dst.
+void blake256r8_8way_close( void *cc, void *dst )
+{
+   blake_8way_small_context *ctx = (blake_8way_small_context*)cc;
+   blake32_8way_close( ctx, 0, 0, dst, 8 );
+}
+
+#endif
+
+// Blake-512 4 way
+
 #if defined (__AVX2__)

 void
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -35,7 +35,9 @@
 */

 #ifndef __BLAKE_HASH_4WAY__
-#define __BLAKE_HASH_4WAY___
+#define __BLAKE_HASH_4WAY__
+
+#ifdef __AVX__

 #ifdef __cplusplus
 extern "C"{
@@ -45,41 +47,75 @@ extern "C"{
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"

-/**
- * Output size (in bits) for BLAKE-256.
- */
 #define SPH_SIZE_blake256   256

-#if SPH_64
-
-/**
- * Output size (in bits) for BLAKE-512.
- */
 #define SPH_SIZE_blake512   512

-#endif
+// With AVX only Blake-256 4 way is available.
+// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
+
+// Blake-256 4 way

-#ifdef __AVX__
 typedef struct {
-        __m128i buf[16] __attribute__ ((aligned (64)));
-        __m128i H[8];
-        __m128i S[4];    
-        size_t ptr;
-	sph_u32 T0, T1;
+   __m128i buf[16] __attribute__ ((aligned (64)));
+   __m128i H[8];
+   __m128i S[4];    
+   size_t ptr;
+   sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_4way_small_context;

+// Default 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
-
 void blake256_4way_init(void *cc);
 void blake256_4way(void *cc, const void *data, size_t len);
 void blake256_4way_close(void *cc, void *dst);
-void blake256_4way_addbits_and_close(
-        void *cc, unsigned ub, unsigned n, void *dst);

-#endif
+// 14 rounds, blake, decred
+typedef blake_4way_small_context blake256r14_4way_context;
+void blake256r14_4way_init(void *cc);
+void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_4way_small_context blake256r8_4way_context;
+void blake256r8_4way_init(void *cc);
+void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__

+// Blake-256 8 way
+
+// State of an 8 way (eight parallel lanes) Blake-256 computation.
+// Each __m256i holds one 32 bit word from every lane.
+typedef struct {
+   // Message block being filled: 16 words (64 bytes) per lane.
+   __m256i buf[16] __attribute__ ((aligned (64)));
+   // Chaining value, 8 words per lane.
+   __m256i H[8];
+   // Salt, 4 words per lane.
+   __m256i S[4];
+   // Bytes per lane currently held in buf.
+   size_t ptr;
+   // Bit counter, low (T0) and high (T1) words; shared by all lanes.
+   sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
+} blake_8way_small_context;
+
+// Default 14 rounds
+typedef blake_8way_small_context blake256_8way_context;
+void blake256_8way_init(void *cc);
+void blake256_8way(void *cc, const void *data, size_t len);
+void blake256_8way_close(void *cc, void *dst);
+
+// 14 rounds, blake, decred
+typedef blake_8way_small_context blake256r14_8way_context;
+void blake256r14_8way_init(void *cc);
+void blake256r14_8way(void *cc, const void *data, size_t len);
+void blake256r14_8way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_8way_small_context blake256r8_8way_context;
+void blake256r8_8way_init(void *cc);
+void blake256r8_8way(void *cc, const void *data, size_t len);
+void blake256r8_8way_close(void *cc, void *dst);
+
+// Blake-512 4 way
+
 typedef struct {
        __m256i buf[16] __attribute__ ((aligned (64)));
        __m256i H[8];
@@ -103,3 +139,5 @@ void blake512_4way_addbits_and_close(
 #endif

 #endif
+
+#endif
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -3,7 +3,7 @@
 #include <string.h>
 #include <stdint.h>

-#include "crypto/blake2s.h"
+#include "sph-blake2s.h"

 static __thread blake2s_state s_midstate;
 static __thread blake2s_state s_ctx;
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -0,0 +1,106 @@
+#include "blakecoin-gate.h"
+
+#if defined (BLAKECOIN_4WAY)
+
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+// Midstate after the first 64 bytes of the block header, prepared by
+// scanhash_blakecoin_4way and copied by blakecoin_4way_hash for every nonce
+// group. Each mining thread runs its own scan loop and rewrites this state,
+// so it is kept per thread (and private to this file), the same way
+// blake2s.c keeps its s_midstate; a plain global would be raced on by
+// concurrent scanning threads.
+static __thread blake256r8_4way_context blakecoin_ctx;
+
+// Hash the tail of four interleaved 80 byte headers.
+//   state  output: four 32 byte hashes stored back to back (lane k at byte
+//          offset 32*k)
+//   input  4x32 interleaved header data; the first 64 bytes of every lane
+//          (64<<2 interleaved bytes) are skipped because they are already
+//          absorbed in blakecoin_ctx, the remaining 16 bytes are hashed
+// Note: the byte offsets are applied to void pointers, which relies on the
+// GNU C extension that treats sizeof(void) as 1.
+void blakecoin_4way_hash(void *state, const void *input)
+{
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256r8_4way_context ctx;
+     memcpy( &ctx, &blakecoin_ctx, sizeof ctx );
+     blake256r8_4way( &ctx, input + (64<<2), 16 );
+     blake256r8_4way_close( &ctx, vhash );
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+}
+
+// Scan nonces four at a time for a Blakecoin/Vanilla share.
+//   thr_id       index into work_restart[] for this thread
+//   work         work->data[0..19] is the header (nonce at word 19);
+//                work->nonces[] / work->nfound[] receive the per lane results
+//   max_nonce    scanning stops once the running nonce reaches this value
+//   hashes_done  receives n - first_nonce + 1, or 0 in the spin workaround
+// Returns the number of lanes whose hash passed fulltest in the last
+// iteration.
+int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   uint32_t *noncep = vdata + 76;   // 19*4: nonce word of lane 0
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
+   uint32_t n = first_nonce;
+   int num_found = 0;
+   int lane;
+
+   // The hash works on big endian words: swap the header, then replicate
+   // it into all four lanes.
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   // Absorb the first 64 bytes once; blakecoin_4way_hash resumes from here.
+   blake256r8_4way_init( &blakecoin_ctx );
+   blake256r8_4way( &blakecoin_ctx, vdata, 64 );
+
+   for (;;)
+   {
+      // Lane k tries nonce n+k.
+      for ( lane = 0; lane < 4; lane++ )
+      {
+         found[lane] = false;
+         be32enc( noncep + lane, n + lane );
+      }
+
+      blakecoin_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( lane = 0; lane < 4; lane++ )
+      {
+         uint32_t *lane_hash = hash + 8*lane;
+
+         // Cheap top word test first, full target comparison second.
+         if ( lane_hash[7] > HTarget || !fulltest( lane_hash, ptarget ) )
+            continue;
+         found[lane] = true;
+         num_found++;
+         nonces[lane] = n + lane;
+         work_set_target_ratio( work, lane_hash );
+      }
+      n += 4;
+
+      if ( num_found != 0 || n >= max_nonce
+           || work_restart[thr_id].restart )
+         break;
+   }
+
+   *hashes_done = n - first_nonce + 1;
+
+   // Workaround to prevent a flood of hash reports when the nonce range is
+   // exhausted and the thread is spinning while it waits for new work.
+   if ( ( n >= max_nonce ) && ( *hashes_done < 10 ) )
+   {
+      *hashes_done = 0;
+      sleep(1);
+   }
+
+   return num_found;
+}
+
+#endif
+
--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -0,0 +1,70 @@
+#include "blakecoin-gate.h"
+#include <memory.h>
+
+// Returns the constant 0x7ffffLL; installed as gate->get_max64 by register_vanilla_algo. (changed to get_max64_0x3fffffLL in cpuminer-multi-decred)
+int64_t blakecoin_get_max64 ()
+{
+  return 0x7ffffLL;   // presumably an upper bound on hashes per scan -- confirm against the algo-gate API
+//  return 0x3fffffLL;   (alternative value, currently disabled)
+}
+
+// Blakecoin 4 way hashes so fast it runs out of nonces. This get_new_work
+// variant is an attempt to solve this, but the result may be to rehash old
+// nonces until new work is received. Not currently installed: the assignment in register_vanilla_algo is commented out.
+void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id,
+                     uint32_t *end_nonce_ptr, bool clean_job )
+{
+   uint32_t *nonceptr = algo_gate.get_nonceptr( work->data );
+   // work: this thread's work, refreshed in place; g_work: the work it is refreshed from; *end_nonce_ptr: receives the end of this thread's nonce range.
+//   if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) )
+//      algo_gate.stratum_gen_work( &stratum, g_work );
+   // Take a fresh copy of g_work when the data differs, the nonce range is used up, or a clean job with a different job_id arrived.
+   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
+   || ( *nonceptr >= *end_nonce_ptr )
+   || ( (  work->job_id != g_work->job_id ) && clean_job ) )   // NOTE(review): compares job_id values directly; if job_id is a string pointer this is an address comparison -- confirm
+/*
+   if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size )
+      && ( clean_job || ( *nonceptr >= *end_nonce_ptr )
+         || ( work->job_id != g_work->job_id ) ) )
+*/
+   {
+     work_free( work );
+     work_copy( work, g_work );   // NOTE(review): nonceptr was taken before this copy; assumes work->data keeps its address -- TODO confirm
+     *nonceptr = 0xffffffffU / opt_n_threads * thr_id;   // start of this thread's equal slice of the 32-bit nonce space
+     if ( opt_randomize )
+       *nonceptr += ( (uint32_t)rand() * 4U ) / opt_n_threads;   // multiply as uint32_t: rand()*4 in signed int overflows (undefined behaviour) for rand() >= 2^29
+     *end_nonce_ptr = ( 0xffffffffU / opt_n_threads ) * (thr_id+1) - 0x20;   // end of the slice, 0x20 short of the next thread's start
+// try incrementing the xnonce to change the data
+//     for ( int i = 0; i < work->xnonce2_size && !( ++work->xnonce2[i] ); i++ );
+   }
+   else
+       ++(*nonceptr);   // same work: just advance the nonce by one
+}
+
+
+// Fills in the algo gate for vanilla. vanilla uses default gen merkle root, otherwise identical to blakecoin. Always returns true.
+bool register_vanilla_algo( algo_gate_t* gate )
+{
+#if defined(BLAKECOIN_4WAY)   // 4-way scanhash/hash pair, chosen at compile time
+//  four_way_not_tested();
+  gate->scanhash  = (void*)&scanhash_blakecoin_4way;
+  gate->hash      = (void*)&blakecoin_4way_hash;
+//  gate->get_new_work = (void*)&bc4w_get_new_work;   (custom work refresh left disabled)
+//  blakecoin_4way_init( &blake_4way_init_ctx );
+#else   // scalar fallback when BLAKECOIN_4WAY is not defined
+  gate->scanhash = (void*)&scanhash_blakecoin;
+  gate->hash     = (void*)&blakecoinhash;
+//  blakecoin_init( &blake_init_ctx );
+#endif
+  gate->optimizations = AVX2_OPT;   // set in both build variants, including the scalar one
+  gate->get_max64 = (void*)&blakecoin_get_max64;
+  return true;
+}
+
+bool register_blakecoin_algo( algo_gate_t* gate )   // blakecoin = the vanilla gate setup plus a SHA256 merkle-root generator; always returns true
+{
+  register_vanilla_algo( gate );   // return value not checked
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;   // the only field that differs from vanilla
+  return true;
+}
+
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -0,0 +1,21 @@
+#ifndef __BLAKECOIN_GATE_H__   // include guard. NOTE(review): names starting with a double underscore are reserved for the implementation
+#define __BLAKECOIN_GATE_H__
+// Declarations of the blakecoin/vanilla hash and scanhash entry points (scalar and 4-way).
+#include "algo-gate-api.h"
+#include <stdint.h>
+// Enable the 4-way implementation whenever the compiler targets AVX2.
+#if defined(__AVX2__)
+  #define BLAKECOIN_4WAY
+#endif
+// 4-way entry points, declared only when BLAKECOIN_4WAY is defined.
+#if defined (BLAKECOIN_4WAY)
+void blakecoin_4way_hash(void *state, const void *input);
+int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+// Scalar entry points, declared in every build.
+void blakecoinhash( void *state, const void *input );
+int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif   // __BLAKECOIN_GATE_H__
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "blakecoin-gate.h"
 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"

@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
 SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
 }
 */
-
+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blakecoin_get_max64 ()
 {
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  return true;
 }
-
+*/
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -1,5 +1,4 @@
 #include "decred-gate.h"
-#include "sph_blake.h"
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
@@ -9,7 +8,6 @@
 #if defined (DECRED_4WAY)

 static __thread blake256_4way_context blake_mid;
-static __thread bool ctx_midstate_done = false;

 void decred_hash_4way( void *state, const void *input )
 {
@@ -18,50 +16,14 @@ void decred_hash_4way( void *state, const void *input )
     uint32_t hash1[8] __attribute__ ((aligned (32)));
     uint32_t hash2[8] __attribute__ ((aligned (32)));
     uint32_t hash3[8] __attribute__ ((aligned (32)));
-     blake256_4way_context ctx __attribute__ ((aligned (64)));
-
-     sph_blake256_context ctx2 __attribute__ ((aligned (64)));
-     uint32_t hash[16] __attribute__ ((aligned (64)));
-     uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
-
-     mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
-
     void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
+     blake256_4way_context ctx __attribute__ ((aligned (64)));

     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
     blake256_4way( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
-/*
-     sph_blake256_init( &ctx2 );
-     sph_blake256( &ctx2, sin0, 180 );
-     sph_blake256_close( &ctx2, hash );
-*/
-/*
-     blake256_4way_init( &ctx );
-     blake256_4way( &ctx, input, 180 );
-     blake256_4way_close( &ctx, vhash );
-*/
-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-/*
-        for ( int i = 0; i < 8; i++ )
-          if ( hash[i] != hash0[i] )
-            printf(" hash mismatch, i = %u\n",i);
-
-printf("hash:  %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
-                             *(hash+2), *(hash+3) );
-printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
-                             *(hash0+2), *(hash0+3) );
-printf("\n");
-*/
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash2, 32 );
-     memcpy( state+96, hash3, 32 );
-
-//     memcpy( state, hash, 32 );
-
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -69,21 +31,21 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
-        uint32_t _ALIGN(64) edata[48];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
-        uint32_t n = first_nonce;
-        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
+   uint32_t _ALIGN(64) edata[48];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
+   uint32_t n = first_nonce;
+   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
   int num_found = 0;

-        ctx_midstate_done = false;
-        memcpy( edata, pdata, 180 );
+   // copy to buffer guaranteed to be aligned.
+   memcpy( edata, pdata, 180 );

   // use the old way until  new way updated for size.
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
+   mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );

   blake256_4way_init( &blake_mid );
   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
@@ -106,22 +68,13 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
          nonces[0] = n;
          pdata[DECRED_NONCE_INDEX] = n;
      }
-/*
      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
      {
-printf("found 1\n");          
-
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
-printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
-printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
-
          work_set_target_ratio( work, hash+8 );
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
      }
-*/
      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
      {
          work_set_target_ratio( work, hash+16 );
@@ -129,24 +82,15 @@ printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[
          num_found++;
          nonces[2] = n+2;
      }
-/*
+
      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
      {
-printf("found 3\n");          
-
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
-printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
-printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
-
          work_set_target_ratio( work, hash+24 );
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
      }
-*/
-      n += 2;
-//      n += 4;
+      n += 4;
  } while ( (num_found == 0) && (n < max_nonce) 
            && !work_restart[thr_id].restart );

--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -145,15 +145,13 @@ bool register_decred_algo( algo_gate_t* gate )
 {
 #if defined(DECRED_4WAY)
  four_way_not_tested();
-  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_decred_4way;
  gate->hash      = (void*)&decred_hash_4way;
 #else
-  gate->optimizations = SSE2_OPT;
  gate->scanhash  = (void*)&scanhash_decred;
  gate->hash      = (void*)&decred_hash;
 #endif
-
+  gate->optimizations = AVX2_OPT;
  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
  gate->display_extra_data    = (void*)&decred_decode_extradata;
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -18,7 +18,7 @@
 //                         uint64_t *hashes_done );
 #endif

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__AVX2__)
  #define DECRED_4WAY
 #endif

--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -1,4 +1,7 @@
 #include "pentablake-gate.h"
+
+#if defined (__AVX2__)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,8 +12,6 @@

 //#define DEBUG_ALGO

-#ifdef PENTABLAKE_4WAY
-
 extern void pentablakehash_4way( void *output, const void *input )
 {
 	unsigned char _ALIGN(32) hash[128];
--- a/algo/blake/pentablake-gate.c
+++ b/algo/blake/pentablake-gate.c
@@ -9,7 +9,7 @@ bool register_pentablake_algo( algo_gate_t* gate )
    gate->scanhash  = (void*)&scanhash_pentablake;
    gate->hash      = (void*)&pentablakehash;
 #endif
-    gate->optimizations = FOUR_WAY_OPT;
+    gate->optimizations = AVX2_OPT;
    gate->get_max64 = (void*)&get_max64_0x3ffff;
    return true;
 };
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__AVX2__)
  #define PENTABLAKE_4WAY
 #endif

--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -16,7 +16,7 @@
 #include <stdio.h>

 #include "algo/sha/sph_types.h"
-#include "crypto/blake2s.h"
+#include "sph-blake2s.h"

 static const uint32_t blake2s_IV[8] =
 {
--- a/algo/blake/sph-blake2s.h
+++ b/algo/blake/sph-blake2s.h
--- a/algo/bmw/bmw-hash-4way.c
+++ b/algo/bmw/bmw-hash-4way.c
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -0,0 +1,95 @@
+/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * 4-way BMW interface. BMW (aka "Blue Midnight Wish") is a family of
+ * functions which differ by their output size; this header declares
+ * the 4-way variants for output sizes 256 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     bmw-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef BMW_HASH_H__
+#define BMW_HASH_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#ifdef __AVX2__
+
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_bmw256   256
+
+#define SPH_SIZE_bmw512   512
+
+typedef struct {   // context for the "small" 4-way BMW variant; aliased below as bmw256_4way_context
+   __m128i buf[64];   // input buffer of 128-bit vectors; presumably one 32-bit word from each of 4 lanes per vector -- TODO confirm. NOTE(review): 64 entries here vs 16 in bmw_4way_big_context; verify the intended size
+   __m128i H[16];   // 16 vectors of hash state; presumably the chaining value -- confirm against bmw-hash-4way.c
+   size_t ptr;   // presumably the current fill position in buf -- confirm
+   sph_u32 bit_count;  // assume bit_count fits in 32 bits
+} bmw_4way_small_context;
+// Alias of the small context under the bmw256 name.
+typedef bmw_4way_small_context bmw256_4way_context;
+
+typedef struct {   // context for the "big" 4-way BMW variant; aliased below as bmw512_4way_context
+   __m256i buf[16];   // input buffer of 256-bit vectors; presumably one 64-bit word from each of 4 lanes per vector -- TODO confirm
+   __m256i H[16];   // 16 vectors of hash state; presumably the chaining value -- confirm against bmw-hash-4way.c
+   size_t ptr;   // presumably the current fill position in buf -- confirm
+   sph_u64 bit_count;   // 64-bit counter, unlike the 32-bit one in the small context
+} bmw_4way_big_context;
+// Alias of the big context under the bmw512 name.
+typedef bmw_4way_big_context bmw512_4way_context;
+
+void bmw256_4way_init(void *cc);   // cc is untyped here; presumably a bmw256_4way_context* -- confirm against bmw-hash-4way.c
+// Takes len bytes of data; NOTE(review): whether len counts bytes per lane or in total is not visible here -- confirm.
+void bmw256_4way(void *cc, const void *data, size_t len);
+// Presumably finalizes the hash and writes the result to dst -- confirm.
+void bmw256_4way_close(void *cc, void *dst);
+// As close, with extra parameters ub and n; presumably n trailing bits taken from ub -- confirm.
+void bmw256_4way_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+// bmw512 set: same four signatures as the bmw256 set above; cc is presumably a bmw512_4way_context* -- confirm.
+void bmw512_4way_init(void *cc);
+
+void bmw512_4way(void *cc, const void *data, size_t len);
+
+void bmw512_4way_close(void *cc, void *dst);
+
+void bmw512_4way_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -3,7 +3,8 @@
 #include "cryptonight.h"
 #include "miner.h"
 #include "crypto/c_keccak.h"
-#include "avxdefs.h"
+#include <immintrin.h>
+//#include "avxdefs.h"

 void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
 void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -10,6 +10,10 @@
 #endif
 #include "cubehash_sse2.h"
 #include "algo/sha/sha3-defs.h"
+#include <stdbool.h>
+#include <unistd.h>
+#include <memory.h>
+#include "avxdefs.h"

 static void transform( cubehashParam *sp )
 {
@@ -125,6 +129,18 @@ static void transform( cubehashParam *sp )
 #endif
 }  // transform

+// Cubehash context initializing is very expensive.
+// Cache the initial value for faster reinitializing. cubehashInit fills this cache, then copies it to the caller's context.
+cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));   // one file-level instance, not thread-local; zero until the first cubehashInit call
+// Restore *sp from the cache: a plain copy of the state left by the most recent cubehashInit call. Always returns SUCCESS.
+int cubehashReinit( cubehashParam *sp )
+{
+   memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );   // NOTE(review): the cache holds whatever parameters the last cubehashInit used; confirm it is never initialized concurrently or with differing parameters
+   return SUCCESS;
+
+}
+
+// Initialize the cache then copy to sp.
 int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
 {
    int i;
@@ -135,24 +151,26 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)

    /* Sanity checks */
    if ( rounds <= 0 || rounds > 32 )
-         rounds = CUBEHASH_ROUNDS;
+       rounds = CUBEHASH_ROUNDS;
    if ( blockbytes <= 0 || blockbytes >= 256)
-         blockbytes = CUBEHASH_BLOCKBYTES;
+       blockbytes = CUBEHASH_BLOCKBYTES;

    // all sizes of __m128i
-    sp->hashlen   = hashbitlen/128;
-    sp->blocksize = blockbytes/16;
-    sp->rounds    = rounds;
-    sp->pos       = 0;
+    cube_ctx_cache.hashlen   = hashbitlen/128;
+    cube_ctx_cache.blocksize = blockbytes/16;
+    cube_ctx_cache.rounds    = rounds;
+    cube_ctx_cache.pos       = 0;

    for ( i = 0; i < 8; ++i )
-         sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
+       cube_ctx_cache.x[i] = _mm_setzero_si128();;

-    sp->x[0] = _mm_set_epi32( 0, rounds, blockbytes, hashbitlen / 8 );
+    cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
+                                         hashbitlen / 8 );

    for ( i = 0; i < 10; ++i )
-         transform(sp);
-//    sp->pos = 0;
+       transform( &cube_ctx_cache );
+
+    memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
    return SUCCESS;
 }

--- a/algo/cubehash/sse2/cubehash_sse2.h
+++ b/algo/cubehash/sse2/cubehash_sse2.h
@@ -29,6 +29,8 @@ extern "C" {
 #endif

 int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
+// reinitialize context with same parameters, much faster.
+int cubehashReinit( cubehashParam* sp );

 int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);

--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -0,0 +1,935 @@
+/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
+/*
+ * Hamsi implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+//#include "miner.h"
+#include "hamsi-hash-4way.h"
+
+#if defined(__AVX2__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
+ * table lookup during message expansion (1 to 8, inclusive). If we note
+ * w the number of bits per message word (w=32 for Hamsi-224/256, w=64
+ * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
+ * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
+ * then we will get t tables (where t=ceil(w/n)) of individual size
+ * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
+ * n=5, there are 7 tables, but the last one uses only two bits on
+ * input, not five).
+ *
+ * Also, we read t rows of r words from RAM. Words in a given row are
+ * concatenated in RAM in that order, so most of the cost is about
+ * reading the first row word; comparatively, cache misses are thus
+ * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
+ *
+ * When n=1, tables are "special" in that we omit the first entry of
+ * each table (which always contains 0), so that total table size is
+ * halved.
+ *
+ * We thus have the following (size1 is the cumulative table size of
+ * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
+ * are for Hamsi-224/256 and Hamsi-384/512, respectively).
+ *
+ *   n      size1      size2    t1    t2
+ * ---------------------------------------
+ *   1       1024       4096    32    64
+ *   2       2048       8192    16    32
+ *   3       2688      10880    11    22
+ *   4       4096      16384     8    16
+ *   5       6272      25600     7    13
+ *   6      10368      41984     6    11
+ *   7      16896      73856     5    10
+ *   8      32768     131072     4     8
+ *
+ * So there is a trade-off: a lower n makes the tables fit better in
+ * L1 cache, but increases the number of memory accesses. The optimal
+ * value depends on the amount of available L1 cache and the relative
+ * impact of a cache miss.
+ *
+ * Experimentally, in ideal benchmark conditions (which are not necessarily
+ * realistic with regards to L1 cache contention), it seems that n=8 is
+ * the best value on "big" architectures (those with 32 kB or more of L1
+ * cache), while n=4 is better on "small" architectures. This was tested
+ * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
+ * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
+ * (8 kB L1 cache).
+ *
+ * Note: with n=1, the 32 tables (actually implemented as one big table)
+ * are read entirely and sequentially, regardless of the input data,
+ * thus avoiding any data-dependent table access pattern.
+ */
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+//#include "hamsi-helper-4way.c"
+
+static const sph_u32 IV512[] = {   // 16 words; presumably the Hamsi-512 initial chaining value -- confirm against its use. Read as big-endian ASCII the words spell "steelpark Arenberg 10, bus 2446, B-3001 Leuven-Heverlee, Belgium"
+	SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
+	SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
+	SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
+	SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
+	SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
+	SPH_C32(0x6769756d)
+};
+
+static const sph_u32 alpha_n[] = {   // 32 constant words; presumably the round constants for the normal (non-final) rounds -- confirm against their use
+	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
+};
+
+static const sph_u32 alpha_f[] = {   // 32 constant words, same length as alpha_n; presumably the round constants for the final rounds -- confirm against their use
+	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
+	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
+	SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
+	SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
+};
+
+// imported from hamsi helper
+
+/* Note: this table lists bits within each byte from least
+   significant to most significant. */
+static const sph_u32 T512[64][16] = {
+	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
+	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
+	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
+	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
+	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
+	  SPH_C32(0x9e69af68) },
+	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
+	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
+	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
+	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
+	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
+	  SPH_C32(0x0c26f262) },
+	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
+	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
+	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
+	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
+	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
+	  SPH_C32(0xdc24e61f) },
+	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
+	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
+	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
+	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
+	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
+	  SPH_C32(0x3daac2da) },
+	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
+	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
+	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
+	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
+	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
+	  SPH_C32(0x78cace29) },
+	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
+	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
+	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
+	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
+	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
+	  SPH_C32(0x2dd1f9ab) },
+	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
+	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
+	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
+	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
+	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
+	  SPH_C32(0xbf2c0be2) },
+	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
+	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
+	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
+	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
+	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
+	  SPH_C32(0x32219526) },
+	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
+	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
+	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
+	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
+	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
+	  SPH_C32(0xac8e6c88) },
+	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
+	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
+	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
+	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
+	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
+	  SPH_C32(0x7b1bd6b9) },
+	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
+	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
+	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
+	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
+	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
+	  SPH_C32(0xf746c320) },
+	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
+	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
+	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
+	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
+	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
+	  SPH_C32(0x69505b3a) },
+	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
+	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
+	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
+	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
+	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
+	  SPH_C32(0x8a341574) },
+	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
+	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
+	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
+	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
+	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
+	  SPH_C32(0x450360bf) },
+	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
+	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
+	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
+	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
+	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
+	  SPH_C32(0xf3d45758) },
+	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
+	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
+	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
+	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
+	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
+	  SPH_C32(0x925c44e9) },
+	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
+	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
+	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
+	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
+	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
+	  SPH_C32(0xa123ff9f) },
+	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
+	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
+	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
+	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
+	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
+	  SPH_C32(0x1568ff0f) },
+	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
+	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
+	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
+	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
+	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
+	  SPH_C32(0xc5c1eb3e) },
+	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
+	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
+	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
+	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
+	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
+	  SPH_C32(0x1af21fe1) },
+	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
+	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
+	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
+	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
+	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
+	  SPH_C32(0x857f3c2b) },
+	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
+	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
+	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
+	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
+	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
+	  SPH_C32(0x2ba05a55) },
+	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
+	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
+	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
+	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
+	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
+	  SPH_C32(0xfeabf254) },
+	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
+	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
+	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
+	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
+	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
+	  SPH_C32(0xfe1cdc7f) },
+	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
+	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
+	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
+	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
+	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
+	  SPH_C32(0xb0a51834) },
+	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
+	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
+	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
+	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
+	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
+	  SPH_C32(0xa6b8c28d) },
+	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
+	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
+	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
+	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
+	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
+	  SPH_C32(0x3a4e99d7) },
+	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
+	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
+	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
+	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
+	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
+	  SPH_C32(0xe1844257) },
+	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
+	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
+	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
+	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
+	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
+	  SPH_C32(0x2c3b504e) },
+	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
+	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
+	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
+	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
+	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
+	  SPH_C32(0x524a0d59) },
+	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
+	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
+	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
+	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
+	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
+	  SPH_C32(0x378dd173) },
+	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
+	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
+	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
+	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
+	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
+	  SPH_C32(0x8b6c72bd) },
+	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
+	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
+	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
+	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
+	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
+	  SPH_C32(0x8e67b7fa) },
+	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
+	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
+	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
+	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
+	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
+	  SPH_C32(0x443d3004) },
+	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
+	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
+	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
+	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
+	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
+	  SPH_C32(0xf4f6ea7b) },
+	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
+	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
+	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
+	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
+	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
+	  SPH_C32(0x979961d0) },
+	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
+	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
+	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
+	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
+	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
+	  SPH_C32(0x98aa496e) },
+	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
+	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
+	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
+	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
+	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
+	  SPH_C32(0x094e3198) },
+	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
+	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
+	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
+	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
+	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
+	  SPH_C32(0xe86cba2e) },
+	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
+	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
+	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
+	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
+	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
+	  SPH_C32(0x4b7eec55) },
+	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
+	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
+	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
+	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
+	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
+	  SPH_C32(0x1e7536a6) },
+	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
+	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
+	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
+	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
+	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
+	  SPH_C32(0x24314f17) },
+	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
+	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
+	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
+	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
+	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
+	  SPH_C32(0x9075b1ce) },
+	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
+	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
+	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
+	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
+	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
+	  SPH_C32(0x9b6ef888) },
+	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
+	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
+	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
+	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
+	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
+	  SPH_C32(0xd8b61463) },
+	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
+	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
+	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
+	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
+	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
+	  SPH_C32(0x3ea660f7) },
+	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
+	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
+	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
+	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
+	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
+	  SPH_C32(0x7f975691) },
+	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
+	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
+	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
+	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
+	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
+	  SPH_C32(0x2c94459e) },
+	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
+	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
+	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
+	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
+	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
+	  SPH_C32(0x56a7b19f) },
+	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
+	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
+	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
+	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
+	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
+	  SPH_C32(0x81fdf908) },
+	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
+	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
+	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
+	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
+	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
+	  SPH_C32(0x5bd61539) },
+	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
+	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
+	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
+	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
+	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
+	  SPH_C32(0x15b961e7) },
+	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
+	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
+	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
+	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
+	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
+	  SPH_C32(0x2a2c18f0) },
+	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
+	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
+	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
+	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
+	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
+	  SPH_C32(0x551e3d6e) },
+	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
+	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
+	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
+	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
+	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
+	  SPH_C32(0x33c5244f) },
+	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
+	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
+	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
+	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
+	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
+	  SPH_C32(0x8a58e6a4) },
+	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
+	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
+	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
+	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
+	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
+	  SPH_C32(0xda878000) },
+	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
+	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
+	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
+	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
+	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
+	  SPH_C32(0x3c5dfffe) },
+	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
+	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
+	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
+	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
+	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
+	  SPH_C32(0x7b1675d7) },
+	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
+	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
+	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
+	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
+	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
+	  SPH_C32(0x2879ebac) },
+	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
+	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
+	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
+	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
+	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
+	  SPH_C32(0xbe0a679e) },
+	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
+	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
+	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
+	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
+	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
+	  SPH_C32(0x30aebcf7) },
+	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
+	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
+	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
+	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
+	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
+	  SPH_C32(0xc7ff60f0) },
+	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
+	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
+	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
+	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
+	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
+	  SPH_C32(0xe7e00a94) }
+};
+
+#define INPUT_BIG /* Message expansion: fills m0..m7 from the 64-bit block at *buf via T512; buf and m0..m7 must be in scope. */ \
+do { \
+  __m256i db = *buf; /* message bits still to be consumed */ \
+  const sph_u32 *tp = &T512[0][0]; /* current 16-word row of T512 */ \
+  m0 = m256_zero; /* m256_zero is defined elsewhere; presumably the all-zero vector */ \
+  m1 = m256_zero; \
+  m2 = m256_zero; \
+  m3 = m256_zero; \
+  m4 = m256_zero; \
+  m5 = m256_zero; \
+  m6 = m256_zero; \
+  m7 = m256_zero; \
+  for ( int u = 0; u < 64; u++ ) /* one pass per message bit, lowest bit of each 64-bit element first */ \
+  { \
+     __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; /* presumably isolates bit 0 of every 64-bit element - TODO confirm m256_one_64 */ \
+     dm = mm256_negate_32( _mm256_or_si256( dm, /* replicate the bit into the high dword; negate presumably turns 1 into an all-ones mask */ \
+                         _mm256_slli_epi64( dm, 32 ) ) ); \
+     m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, /* masked XOR of row words: tp[0] into the low dword, tp[1] into the high dword */ \
+                  _mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \
+                                    tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \
+     m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, /* same pattern for m1..m7 with the next word pairs */ \
+                  _mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \
+                                    tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \
+     m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \
+                                    tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \
+     m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \
+                                    tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \
+     m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \
+                                    tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \
+     m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \
+                                    tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \
+     m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \
+                                    tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \
+     m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \
+                                    tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \
+     tp += 0x10; /* advance to the next 16-word row */ \
+     db = _mm256_srli_epi64( db, 1 ); /* bring the next message bit down to bit 0 */ \
+  } \
+} while (0)
+
+#define SBOX( a, b, c, d ) /* Boolean circuit applied to every bit position of four vectors; a, b, c, d are rewritten in place and must be lvalues. */ \
+do { \
+  __m256i t; \
+  t = a; /* keep the incoming a, which is overwritten on the next line */ \
+  a = _mm256_and_si256( a, c ); \
+  a = _mm256_xor_si256( a, d ); \
+  c = _mm256_xor_si256( c, b ); \
+  c = _mm256_xor_si256( c, a ); \
+  d = _mm256_or_si256( d, t ); \
+  d = _mm256_xor_si256( d, b ); \
+  t = _mm256_xor_si256( t, c ); \
+  b = d; \
+  d = _mm256_or_si256( d, t ); \
+  d = _mm256_xor_si256( d, a ); \
+  a = _mm256_and_si256( a, b ); \
+  t = _mm256_xor_si256( t, a ); \
+  b = _mm256_xor_si256( b, d ); \
+  b = _mm256_xor_si256( b, t ); \
+  a = c; /* from here on the intermediate values are moved to their output slots */ \
+  c = b; \
+  b = d; \
+  d = mm256_not( t ); /* mm256_not is defined elsewhere; presumably a bitwise complement */ \
+} while (0)
+
+#define L( a, b, c, d ) /* Linear mixing of four vectors using rotates, shifts and XORs on 32-bit elements; all four arguments are updated in place. */ \
+do { \
+   a = mm256_rotl_32( a, 13 ); /* mm256_rotl_32 is defined elsewhere; presumably rotates every 32-bit element left */ \
+   c = mm256_rotl_32( c,  3 ); \
+   b = _mm256_xor_si256( b, _mm256_xor_si256( a, c ) ); \
+   d = _mm256_xor_si256( d, _mm256_xor_si256( c, /* d ^= c ^ (a << 3), shift per 32-bit element */ \
+                                              _mm256_slli_epi32( a, 3 ) ) ); \
+   b = mm256_rotl_32( b, 1 ); \
+   d = mm256_rotl_32( d, 7 ); \
+   a = _mm256_xor_si256( a, _mm256_xor_si256( b, d ) ); \
+   c = _mm256_xor_si256( c, _mm256_xor_si256( d, /* c ^= d ^ (b << 7), shift per 32-bit element */ \
+                                              _mm256_slli_epi32( b, 7 ) ) ); \
+   a = mm256_rotl_32( a,  5 ); \
+   c = mm256_rotl_32( c, 22 ); \
+} while (0)
+
+#define DECL_STATE_BIG /* Declares the working chaining vectors c0..c7; the expansion ends with its own ';', so call sites use it without one. */ \
+   __m256i c0, c1, c2, c3, c4, c5, c6, c7; \
+
+#define READ_STATE_BIG(sc) /* Loads sc->h[0..7] into the locals c0..c7; sc is pasted unparenthesized, so pass a plain pointer name. */ \
+do { \
+   c0 = sc->h[0x0]; \
+   c1 = sc->h[0x1]; \
+   c2 = sc->h[0x2]; \
+   c3 = sc->h[0x3]; \
+   c4 = sc->h[0x4]; \
+   c5 = sc->h[0x5]; \
+   c6 = sc->h[0x6]; \
+   c7 = sc->h[0x7]; \
+} while (0)
+
+#define WRITE_STATE_BIG(sc) /* Stores the locals c0..c7 back into sc->h[0..7]; sc is pasted unparenthesized, so pass a plain pointer name. */ \
+do { \
+   sc->h[0x0] = c0; \
+   sc->h[0x1] = c1; \
+   sc->h[0x2] = c2; \
+   sc->h[0x3] = c3; \
+   sc->h[0x4] = c4; \
+   sc->h[0x5] = c5; \
+   sc->h[0x6] = c6; \
+   sc->h[0x7] = c7; \
+} while (0)
+
+#define s0   m0 /* s0..sF are the 16 state vectors used by ROUND_BIG and T_BIG. */
+#define s1   c0 /* They are aliases, not copies: m0..m7 hold the expanded message, */
+#define s2   m1 /* c0..c7 hold the chaining value, so writing an s-name writes the */
+#define s3   c1 /* underlying m/c variable. */
+#define s4   c2
+#define s5   m2
+#define s6   c3
+#define s7   m3
+#define s8   m4
+#define s9   c4
+#define sA   m5
+#define sB   c5
+#define sC   c6
+#define sD   m6
+#define sE   c7
+#define sF   m7
+
+#define ROUND_BIG(rc, alpha) /* One round on s0..sF: add constants from alpha[0..0x1F] (rc mixed into alpha[1]), apply SBOX, then apply L. */ \
+do { \
+  __m256i t0, t1, t2, t3; /* scratch vectors used to gather and scatter dwords for L */ \
+  s0 = _mm256_xor_si256( s0, _mm256_set_epi32( /* constants: alpha[2k] goes to the low dword and alpha[2k+1] to the high dword of the k-th s-vector */ \
+        alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \
+        alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \
+  s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \
+                     alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \
+                     alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \
+  s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \
+                     alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \
+                     alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \
+  s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \
+                     alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \
+                     alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \
+  s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \
+                     alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \
+                     alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \
+  s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \
+                     alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \
+                     alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \
+  s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \
+                     alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \
+                     alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \
+  s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \
+                     alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \
+                     alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \
+  s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \
+                     alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \
+                     alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \
+  s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \
+                     alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \
+                     alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \
+  sA = _mm256_xor_si256( sA, _mm256_set_epi32( \
+                     alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \
+                     alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \
+  sB = _mm256_xor_si256( sB, _mm256_set_epi32( \
+                     alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \
+                     alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \
+  sC = _mm256_xor_si256( sC, _mm256_set_epi32( \
+                     alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \
+                     alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \
+  sD = _mm256_xor_si256( sD, _mm256_set_epi32( \
+                     alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \
+                     alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \
+  sE = _mm256_xor_si256( sE, _mm256_set_epi32( \
+                     alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \
+                     alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \
+  sF = _mm256_xor_si256( sF, _mm256_set_epi32( \
+                     alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \
+                     alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \
+/* Substitution: SBOX on the four vector groups (s0,s4,s8,sC) .. (s3,s7,sB,sF). */ \
+  SBOX( s0, s4, s8, sC ); \
+  SBOX( s1, s5, s9, sD ); \
+  SBOX( s2, s6, sA, sE ); \
+  SBOX( s3, s7, sB, sF ); \
+/* First L pass. Blend mask 0xAA takes the odd dwords from the second operand, 0x55 the even ones; the 4-byte shifts move a dword into the neighbouring slot. */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), /* t1: odd dwords of s4 in the even slots, even dwords of s5 in the odd slots */ \
+                           _mm256_bslli_epi128( s5, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), /* t3: same gathering from sD and sE */ \
+                           _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
+  L( s0, t1, s9, t3 ); \
+  s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+\
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
+                           _mm256_bslli_epi128( s6, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
+                           _mm256_bslli_epi128( sF, 4 ), 0xAA ); \
+  L( s1, t1, sA, t3 ); \
+  s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+\
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
+                           _mm256_bslli_epi128( s7, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
+                           _mm256_bslli_epi128( sC, 4 ), 0xAA ); \
+  L( s2, t1, sB, t3 ); \
+  s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+\
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
+                           _mm256_bslli_epi128( s4, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
+                           _mm256_bslli_epi128( sD, 4 ), 0xAA ); \
+  L( s3, t1, s8, t3 ); \
+  s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+/* Second L pass, first half: each operand combines one dword from each of two vectors (s0/s8, s1/s9, s2/sA, s3/sB); results are scattered back afterwards. */ \
+  t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
+  t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
+  t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
+                           _mm256_bslli_epi128( sB, 4 ), 0xAA ); \
+  L( t0, t1, t2, t3 ); \
+  s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
+  s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
+  s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
+  s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
+  s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
+  sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
+  s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
+  sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
+/* Second L pass, second half: same gather / L / scatter scheme on the pairs s4/sC, s5/sD, s6/sE, s7/sF. */ \
+  t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
+                           _mm256_bslli_epi128( sD, 4 ), 0xAA ); \
+  t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
+  L( t0, t1, t2, t3 ); \
+  s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
+  sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
+  s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
+  sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
+  s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
+  sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
+  s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
+  sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
+} while (0)
+
+#define P_BIG /* Six ROUND_BIG rounds (round numbers 0..5) using the alpha_n constants, which are defined elsewhere in this file. */ \
+do { \
+   ROUND_BIG(0, alpha_n); \
+   ROUND_BIG(1, alpha_n); \
+   ROUND_BIG(2, alpha_n); \
+   ROUND_BIG(3, alpha_n); \
+   ROUND_BIG(4, alpha_n); \
+   ROUND_BIG(5, alpha_n); \
+} while (0)
+
+#define PF_BIG /* Twelve ROUND_BIG rounds (round numbers 0..11) using the alpha_f constants; used by hamsi_big_final. */ \
+do { \
+   ROUND_BIG( 0, alpha_f); \
+   ROUND_BIG( 1, alpha_f); \
+   ROUND_BIG( 2, alpha_f); \
+   ROUND_BIG( 3, alpha_f); \
+   ROUND_BIG( 4, alpha_f); \
+   ROUND_BIG( 5, alpha_f); \
+   ROUND_BIG( 6, alpha_f); \
+   ROUND_BIG( 7, alpha_f); \
+   ROUND_BIG( 8, alpha_f); \
+   ROUND_BIG( 9, alpha_f); \
+   ROUND_BIG(10, alpha_f); \
+   ROUND_BIG(11, alpha_f); \
+} while (0)
+
+#define T_BIG /* Feed-forward: XORs s0..s3 and s8..sB into sc->h[0..7] and mirrors the result into c0..c7; needs sc in scope. */ \
+do { /* order is important: sB, s9, s3 and s1 alias c5, c4, c1 and c0, so each of those is read before the line that overwrites it */ \
+   c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
+   c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
+   c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
+   c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
+   c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
+   c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
+   c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
+   c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
+} while (0)
+
+void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) // Absorbs num message blocks (one __m256i each) from buf into sc and advances the bit counter.
+{
+   DECL_STATE_BIG // declares c0..c7; the macro carries its own semicolon
+   sph_u32 tmp;
+
+   tmp = SPH_T32( (sph_u32)num << 6 ); // bits added to the low counter word: 64 per block (SPH_T32 presumably truncates to 32 bits)
+   sc->count_low = SPH_T32( sc->count_low + tmp );
+   sc->count_high += (sph_u32)( (num >> 13) >> 13 ); // num >> 26: the part of num * 64 that overflows the low word
+   if ( sc->count_low < tmp ) // the low-word addition wrapped, so carry into the high word
+      sc->count_high++;
+
+   READ_STATE_BIG( sc );
+   while ( num-- > 0 )
+   {
+      __m256i m0, m1, m2, m3, m4, m5, m6, m7; // expanded message; accessed through the s-aliases
+
+      INPUT_BIG; // m0..m7 <- expansion of *buf
+      P_BIG; // six rounds with alpha_n
+      T_BIG; // fold the result into sc->h and c0..c7
+      buf++;
+   }
+   WRITE_STATE_BIG( sc ); // T_BIG has already stored these values; this writes c0..c7 to sc->h once more
+}
+
+void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf ) // Processes the single block at buf with the twelve-round PF_BIG permutation; the bit counters are not touched.
+{
+   __m256i m0, m1, m2, m3, m4, m5, m6, m7; // expanded message; accessed through the s-aliases
+   DECL_STATE_BIG // declares c0..c7; the macro carries its own semicolon
+   READ_STATE_BIG( sc );
+   INPUT_BIG; // m0..m7 <- expansion of *buf
+   PF_BIG; // twelve rounds with alpha_f
+   T_BIG; // fold the result into sc->h and c0..c7
+   WRITE_STATE_BIG( sc ); // repeats the store already done by T_BIG
+}
+
+void hamsi512_4way_init( hamsi_4way_big_context *sc )
+{
+   // Fresh context: nothing buffered and a zero message bit counter.
+   sc->count_low = 0;
+   sc->count_high = 0;
+   sc->partial_len = 0;
+   // h[k/2] gets IV512[k] in the low dword and IV512[k+1] in the high dword
+   // of every 64-bit element, so all four elements start from the same IV.
+   for ( int k = 0; k < 16; k += 2 )
+      sc->h[ k >> 1 ] = _mm256_set_epi32(
+                            IV512[k+1], IV512[k], IV512[k+1], IV512[k],
+                            IV512[k+1], IV512[k], IV512[k+1], IV512[k] );
+}
+
+void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len ) // Absorbs the complete 8-byte blocks of data into sc; presumably len counts bytes per interleaved stream - TODO confirm with callers.
+{
+   __m256i *vdata = (__m256i*)data; // data is walked one __m256i per 8-byte block
+
+// It looks like the only way to get in here is if core was previously called
+// with a very small len
+// That's not likely even with 80 byte input so deprecate partial len
+/*
+   if ( sc->partial_len != 0 )
+   {
+      size_t mlen;
+
+      mlen = 8 - sc->partial_len;
+      if ( len < mlen )
+      {
+         memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 );
+         sc->partial_len += len;
+         return;
+      }
+      else
+      {
+         memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 );
+         len -= mlen;
+         vdata += mlen>>3;
+         hamsi_big( sc, sc->partial, 1 );
+         sc->partial_len = 0;
+      }
+   }
+*/
+
+   hamsi_big( sc, vdata, len>>3 ); // absorb len / 8 complete blocks
+   vdata += ( (len& ~(size_t)7) >> 3 ); // step past the blocks just absorbed
+   len &= (size_t)7; // leftover byte count, 0..7
+   memcpy_256( sc->buf, vdata, len>>3 ); // NOTE(review): len <= 7 here, so the count len>>3 is always 0 and leftover bytes are never saved in sc->buf
+   sc->partial_len = len; // only the leftover length is recorded; hamsi512_4way_close adds it to the bit count
+}
+
+void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
+{
+   // Finish the hash: absorb the padding block, run the final transform on
+   // the length block, then store the eight state vectors to dst.
+   __m256i *digest = (__m256i*)dst;
+   __m256i len_block;
+   int hi_be, lo_be;
+
+   // Encode the bit count first: hamsi_big() below advances the counters.
+   sph_enc32be( &lo_be, sc->count_low + ( sc->partial_len << 3 ) );
+   sph_enc32be( &hi_be, sc->count_high );
+   len_block = _mm256_set_epi32( lo_be, hi_be, lo_be, hi_be,
+                                 lo_be, hi_be, lo_be, hi_be );
+
+   // Padding block: 0x80 in the low dword of every 64-bit element, rest zero.
+   sc->buf[0] = _mm256_set_epi32( 0, 0x80, 0, 0x80, 0, 0x80, 0, 0x80 );
+   hamsi_big( sc, sc->buf, 1 );
+   hamsi_big_final( sc, &len_block );
+
+   for ( int i = 0; i < 8; i++ )
+      digest[i] = mm256_bswap_32( sc->h[i] );
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -0,0 +1,72 @@
+/* $Id: sph_hamsi.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Hamsi 4-way interface, derived from sph_hamsi.h. Only the 512-bit variant
+ * (hamsi512_4way_*) is declared here, and only when __AVX__ is defined.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     hamsi-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef HAMSI_4WAY_H__
+#define HAMSI_4WAY_H__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+
+#if defined (__AVX__)
+
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define SPH_SIZE_hamsi512   512
+
+// Context for 4-way Hamsi-512. buf is a one-element array so that it can be
+// passed by pointer to hamsi_big; partial_len is deprecated.
+typedef struct {
+   __m256i h[8];                    // chaining state; init fills each entry from an IV512 word pair
+   __m256i buf[1];                  // staging block; close stores the padding block here
+   size_t partial_len;              // leftover byte count (< 8) recorded by hamsi512_4way
+   sph_u32 count_high, count_low;   // zeroed in init; encoded into the final block by close
+} hamsi_4way_big_context;
+
+typedef hamsi_4way_big_context hamsi512_4way_context;
+// Streaming interface: initialise sc, absorb len bytes of data, finalise to dst.
+void hamsi512_4way_init( hamsi512_4way_context *sc );
+void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
+void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/hamsi/sph_hamsi.c.test
+++ b/algo/hamsi/sph_hamsi.c.test
@@ -0,0 +1,940 @@
+/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
+/*
+ * Hamsi implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_hamsi.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAMSI
+#define SPH_SMALL_FOOTPRINT_HAMSI   1
+#endif
+
+/*
+ * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
+ * table lookup during message expansion (1 to 8, inclusive). If we note
+ * w the number of bits per message word (w=32 for Hamsi-224/256, w=64
+ * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
+ * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
+ * then we will get t tables (where t=ceil(w/n)) of individual size
+ * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
+ * n=5, there are 7 tables, but the last one uses only two bits on
+ * input, not five).
+ *
+ * Also, we read t rows of r words from RAM. Words in a given row are
+ * concatenated in RAM in that order, so most of the cost is about
+ * reading the first row word; comparatively, cache misses are thus
+ * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
+ *
+ * When n=1, tables are "special" in that we omit the first entry of
+ * each table (which always contains 0), so that total table size is
+ * halved.
+ *
+ * We thus have the following (size1 is the cumulative table size of
+ * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
+ * are for Hamsi-224/256 and Hamsi-384/512, respectively).
+ *
+ *   n      size1      size2    t1    t2
+ * ---------------------------------------
+ *   1       1024       4096    32    64
+ *   2       2048       8192    16    32
+ *   3       2688      10880    11    22
+ *   4       4096      16384     8    16
+ *   5       6272      25600     7    13
+ *   6      10368      41984     6    11
+ *   7      16896      73856     5    10
+ *   8      32768     131072     4     8
+ *
+ * So there is a trade-off: a lower n makes the tables fit better in
+ * L1 cache, but increases the number of memory accesses. The optimal
+ * value depends on the amount of available L1 cache and the relative
+ * impact of a cache miss.
+ *
+ * Experimentally, in ideal benchmark conditions (which are not necessarily
+ * realistic with regards to L1 cache contention), it seems that n=8 is
+ * the best value on "big" architectures (those with 32 kB or more of L1
+ * cache), while n=4 is better on "small" architectures. This was tested
+ * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
+ * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
+ * (8 kB L1 cache).
+ *
+ * Note: with n=1, the 32 tables (actually implemented as one big table)
+ * are read entirely and sequentially, regardless of the input data,
+ * thus avoiding any data-dependent table access pattern.
+ */
+
+#if !defined SPH_HAMSI_EXPAND_SMALL
+#if SPH_SMALL_FOOTPRINT_HAMSI
+#define SPH_HAMSI_EXPAND_SMALL  4
+#else
+#define SPH_HAMSI_EXPAND_SMALL  8
+#endif
+#endif
+
+#if !defined SPH_HAMSI_EXPAND_BIG
+#define SPH_HAMSI_EXPAND_BIG    8
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#include "sph_hamsi_helper.c"
+
+static const sph_u32 IV224[] = {	/* initial state loaded by sph_hamsi224_init (8 words) */
+	SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3),
+	SPH_C32(0xa7c3bc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
+	SPH_C32(0x69656b65), SPH_C32(0x20556e69)
+};
+
+/*
+ * This version is the one used in the Hamsi submission package for
+ * round 2 of the SHA-3 competition; the UTF-8 encoding is wrong and
+ * shall soon be corrected in the official Hamsi specification.
+ *
+static const sph_u32 IV224[] = {
+	SPH_C32(0x3c967a67), SPH_C32(0x3cbc6c20), SPH_C32(0xb4c343c3),
+	SPH_C32(0xa73cbc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c),
+	SPH_C32(0x69656b65), SPH_C32(0x20556e69)
+};
+ */
+
+static const sph_u32 IV256[] = {	/* initial state loaded by sph_hamsi256_init (8 words) */
+	SPH_C32(0x76657273), SPH_C32(0x69746569), SPH_C32(0x74204c65),
+	SPH_C32(0x7576656e), SPH_C32(0x2c204465), SPH_C32(0x70617274),
+	SPH_C32(0x656d656e), SPH_C32(0x7420456c)
+};
+
+static const sph_u32 IV384[] = {	/* initial state loaded by sph_hamsi384_init (16 words) */
+	SPH_C32(0x656b7472), SPH_C32(0x6f746563), SPH_C32(0x686e6965),
+	SPH_C32(0x6b2c2043), SPH_C32(0x6f6d7075), SPH_C32(0x74657220),
+	SPH_C32(0x53656375), SPH_C32(0x72697479), SPH_C32(0x20616e64),
+	SPH_C32(0x20496e64), SPH_C32(0x75737472), SPH_C32(0x69616c20),
+	SPH_C32(0x43727970), SPH_C32(0x746f6772), SPH_C32(0x61706879),
+	SPH_C32(0x2c204b61)
+};
+
+static const sph_u32 IV512[] = {	/* initial state loaded by sph_hamsi512_init (16 words) */
+	SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
+	SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
+	SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
+	SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
+	SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
+	SPH_C32(0x6769756d)
+};
+
+static const sph_u32 alpha_n[] = {	/* 32 round constants used by P_SMALL and P_BIG */
+	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
+};
+
+static const sph_u32 alpha_f[] = {	/* 32 round constants used by PF_SMALL and PF_BIG (final block) */
+	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
+	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
+	SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
+	SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
+};
+
+#define DECL_STATE_SMALL /* declares local working copies c0..c7 of the state words */ \
+	sph_u32 c0, c1, c2, c3, c4, c5, c6, c7;
+
+#define READ_STATE_SMALL(sc)   do { /* load sc->h[0..7] into c0..c7 */ \
+		c0 = sc->h[0x0]; \
+		c1 = sc->h[0x1]; \
+		c2 = sc->h[0x2]; \
+		c3 = sc->h[0x3]; \
+		c4 = sc->h[0x4]; \
+		c5 = sc->h[0x5]; \
+		c6 = sc->h[0x6]; \
+		c7 = sc->h[0x7]; \
+	} while (0)
+
+#define WRITE_STATE_SMALL(sc)   do { /* store c0..c7 back into sc->h[0..7] */ \
+		sc->h[0x0] = c0; \
+		sc->h[0x1] = c1; \
+		sc->h[0x2] = c2; \
+		sc->h[0x3] = c3; \
+		sc->h[0x4] = c4; \
+		sc->h[0x5] = c5; \
+		sc->h[0x6] = c6; \
+		sc->h[0x7] = c7; \
+	} while (0)
+
+#define s0   m0	/* s0..sF name the 16 round-state words; each aliases an m* or a c* local */
+#define s1   m1
+#define s2   c0
+#define s3   c1
+#define s4   c2
+#define s5   c3
+#define s6   m2
+#define s7   m3
+#define s8   m4
+#define s9   m5
+#define sA   c4
+#define sB   c5
+#define sC   c6
+#define sD   c7
+#define sE   m6
+#define sF   m7	/* m0..m7 are locals of the callers; presumably filled by INPUT_SMALL (not defined in this file) */
+
+#define SBOX(a, b, c, d)   do { /* substitution on four words using only AND/OR/XOR/NOT; modifies all four arguments */ \
+		sph_u32 t; \
+		t = (a); \
+		(a) &= (c); \
+		(a) ^= (d); \
+		(c) ^= (b); \
+		(c) ^= (a); \
+		(d) |= t; \
+		(d) ^= (b); \
+		t ^= (c); \
+		(b) = (d); \
+		(d) |= t; \
+		(d) ^= (a); \
+		(a) &= (b); \
+		t ^= (a); \
+		(b) ^= (d); \
+		(b) ^= t; \
+		(a) = (c); /* final shuffle: a <- c, c <- b, b <- d, d <- ~t */ \
+		(c) = (b); \
+		(b) = (d); \
+		(d) = SPH_T32(~t); \
+	} while (0)
+
+#define L(a, b, c, d)   do { /* mixes four words in place with 32-bit rotations, XORs and left shifts */ \
+		(a) = SPH_ROTL32(a, 13); \
+		(c) = SPH_ROTL32(c, 3); \
+		(b) ^= (a) ^ (c); \
+		(d) ^= (c) ^ SPH_T32((a) << 3); \
+		(b) = SPH_ROTL32(b, 1); \
+		(d) = SPH_ROTL32(d, 7); \
+		(a) ^= (b) ^ (d); \
+		(c) ^= (d) ^ SPH_T32((b) << 7); \
+		(a) = SPH_ROTL32(a, 5); \
+		(c) = SPH_ROTL32(c, 22); \
+	} while (0)
+
+#define ROUND_SMALL(rc, alpha)   do { /* one round on s0..sF: XOR 16 of the 32 alpha words (rc folded into s1), 4 SBOX calls, 4 L calls */ \
+		s0 ^= alpha[0x00]; \
+		s1 ^= alpha[0x01] ^ (sph_u32)(rc); \
+		s2 ^= alpha[0x02]; \
+		s3 ^= alpha[0x03]; \
+		s4 ^= alpha[0x08]; \
+		s5 ^= alpha[0x09]; \
+		s6 ^= alpha[0x0A]; \
+		s7 ^= alpha[0x0B]; \
+		s8 ^= alpha[0x10]; \
+		s9 ^= alpha[0x11]; \
+		sA ^= alpha[0x12]; \
+		sB ^= alpha[0x13]; \
+		sC ^= alpha[0x18]; \
+		sD ^= alpha[0x19]; \
+		sE ^= alpha[0x1A]; \
+		sF ^= alpha[0x1B]; \
+		SBOX(s0, s4, s8, sC); \
+		SBOX(s1, s5, s9, sD); \
+		SBOX(s2, s6, sA, sE); \
+		SBOX(s3, s7, sB, sF); \
+		L(s0, s5, sA, sF); \
+		L(s1, s6, sB, sC); \
+		L(s2, s7, s8, sD); \
+		L(s3, s4, s9, sE); \
+	} while (0)
+
+#define P_SMALL   do { /* per-block permutation: 3 rounds with alpha_n */ \
+		ROUND_SMALL(0, alpha_n); \
+		ROUND_SMALL(1, alpha_n); \
+		ROUND_SMALL(2, alpha_n); \
+	} while (0)
+
+#define PF_SMALL   do { /* final-block permutation: 6 rounds with alpha_f */ \
+		ROUND_SMALL(0, alpha_f); \
+		ROUND_SMALL(1, alpha_f); \
+		ROUND_SMALL(2, alpha_f); \
+		ROUND_SMALL(3, alpha_f); \
+		ROUND_SMALL(4, alpha_f); \
+		ROUND_SMALL(5, alpha_f); \
+	} while (0)
+
+#define T_SMALL   do { /* XOR selected round-state words into sc->h and refresh c0..c7 */ \
+		/* order is important: sB, sA, s3, s2 alias c5, c4, c1, c0, so each must be read before that c* is reassigned */ \
+		c7 = (sc->h[7] ^= sB); \
+		c6 = (sc->h[6] ^= sA); \
+		c5 = (sc->h[5] ^= s9); \
+		c4 = (sc->h[4] ^= s8); \
+		c3 = (sc->h[3] ^= s3); \
+		c2 = (sc->h[2] ^= s2); \
+		c1 = (sc->h[1] ^= s1); \
+		c0 = (sc->h[0] ^= s0); \
+	} while (0)
+
+static void
+hamsi_small(sph_hamsi_small_context *sc, const unsigned char *buf, size_t num)
+{
+	DECL_STATE_SMALL
+#if !SPH_64
+	sph_u32 tmp;
+#endif
+	/* Compress num consecutive 4-byte blocks from buf; the count grows by 32 per block. */
+#if SPH_64
+	sc->count += (sph_u64)num << 5;
+#else
+	tmp = SPH_T32((sph_u32)num << 5);	/* low 32 bits of num * 32 */
+	sc->count_low = SPH_T32(sc->count_low + tmp);
+	sc->count_high += (sph_u32)((num >> 13) >> 14);	/* num >> 27: the part of num * 32 above bit 31 */
+	if (sc->count_low < tmp)	/* low word wrapped around: propagate the carry */
+		sc->count_high ++;
+#endif
+	READ_STATE_SMALL(sc);
+	while (num -- > 0) {
+		sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
+		/* INPUT_SMALL is not defined in this file; presumably it fills m0..m7 from buf */
+		INPUT_SMALL;
+		P_SMALL;
+		T_SMALL;
+		buf += 4;
+	}
+	WRITE_STATE_SMALL(sc);
+}
+
+static void
+hamsi_small_final(sph_hamsi_small_context *sc, const unsigned char *buf)
+{
+	sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
+	DECL_STATE_SMALL
+	/* Like one hamsi_small step, but with PF_SMALL and no change to the count. */
+	READ_STATE_SMALL(sc);
+	INPUT_SMALL;
+	PF_SMALL;
+	T_SMALL;
+	WRITE_STATE_SMALL(sc);
+}
+
+static void
+hamsi_small_init(sph_hamsi_small_context *sc, const sph_u32 *iv)
+{
+	sc->partial_len = 0;	/* no buffered input bytes */
+	memcpy(sc->h, iv, sizeof sc->h);	/* iv must supply at least sizeof sc->h bytes */
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = sc->count_low = 0;
+#endif
+}
+
+static void
+hamsi_small_core(sph_hamsi_small_context *sc, const void *data, size_t len)
+{
+	if (sc->partial_len != 0) {	/* bytes are pending from an earlier call */
+		size_t mlen;
+		/* mlen = bytes still missing to complete the pending 4-byte block */
+		mlen = 4 - sc->partial_len;
+		if (len < mlen) {
+			memcpy(sc->partial + sc->partial_len, data, len);
+			sc->partial_len += len;
+			return;
+		} else {
+			memcpy(sc->partial + sc->partial_len, data, mlen);
+			len -= mlen;
+			data = (const unsigned char *)data + mlen;
+			hamsi_small(sc, sc->partial, 1);
+			sc->partial_len = 0;
+		}
+	}
+	/* Process the whole 4-byte blocks, then keep the 0..3 trailing bytes in sc->partial. */
+	hamsi_small(sc, data, (len >> 2));
+	data = (const unsigned char *)data + (len & ~(size_t)3);
+	len &= (size_t)3;
+	memcpy(sc->partial, data, len);
+	sc->partial_len = len;
+}
+
+static void
+hamsi_small_close(sph_hamsi_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	unsigned char pad[12];	/* [0..3] last partial block plus padding, [4..11] 64-bit big-endian bit count */
+	size_t ptr, u;
+	unsigned z;
+	unsigned char *out;
+	/* Finalize: n extra bits come from ub; out_size_w32 state words are written big-endian to dst. */
+	ptr = sc->partial_len;
+	memcpy(pad, sc->partial, ptr);
+#if SPH_64
+	sph_enc64be(pad + 4, sc->count + (ptr << 3) + n);
+#else
+	sph_enc32be(pad + 4, sc->count_high);
+	sph_enc32be(pad + 8, sc->count_low + (ptr << 3) + n);
+#endif
+	z = 0x80 >> n;
+	pad[ptr ++] = ((ub & -z) | z) & 0xFF;	/* keep the top n bits of ub, then set the next bit as padding marker */
+	while (ptr < 4)
+		pad[ptr ++] = 0;
+	hamsi_small(sc, pad, 2);	/* blocks pad[0..3] and pad[4..7] */
+	hamsi_small_final(sc, pad + 8);	/* last block pad[8..11] goes through the final permutation */
+	out = dst;
+	for (u = 0; u < out_size_w32; u ++)
+		sph_enc32be(out + (u << 2), sc->h[u]);
+}
+
+#define DECL_STATE_BIG /* declares local working copies c0..cF of the state words */ \
+	sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; \
+	sph_u32 c8, c9, cA, cB, cC, cD, cE, cF;
+
+#define READ_STATE_BIG(sc)   do { /* load sc->h[0..15] into c0..cF */ \
+		c0 = sc->h[0x0]; \
+		c1 = sc->h[0x1]; \
+		c2 = sc->h[0x2]; \
+		c3 = sc->h[0x3]; \
+		c4 = sc->h[0x4]; \
+		c5 = sc->h[0x5]; \
+		c6 = sc->h[0x6]; \
+		c7 = sc->h[0x7]; \
+		c8 = sc->h[0x8]; \
+		c9 = sc->h[0x9]; \
+		cA = sc->h[0xA]; \
+		cB = sc->h[0xB]; \
+		cC = sc->h[0xC]; \
+		cD = sc->h[0xD]; \
+		cE = sc->h[0xE]; \
+		cF = sc->h[0xF]; \
+	} while (0)
+
+#define WRITE_STATE_BIG(sc)   do { /* store c0..cF back into sc->h[0..15] */ \
+		sc->h[0x0] = c0; \
+		sc->h[0x1] = c1; \
+		sc->h[0x2] = c2; \
+		sc->h[0x3] = c3; \
+		sc->h[0x4] = c4; \
+		sc->h[0x5] = c5; \
+		sc->h[0x6] = c6; \
+		sc->h[0x7] = c7; \
+		sc->h[0x8] = c8; \
+		sc->h[0x9] = c9; \
+		sc->h[0xA] = cA; \
+		sc->h[0xB] = cB; \
+		sc->h[0xC] = cC; \
+		sc->h[0xD] = cD; \
+		sc->h[0xE] = cE; \
+		sc->h[0xF] = cF; \
+	} while (0)
+
+#define s00   m0	/* s00..s1F name the 32 round-state words; each aliases an m* or a c* local */
+#define s01   m1
+#define s02   c0
+#define s03   c1
+#define s04   m2
+#define s05   m3
+#define s06   c2
+#define s07   c3
+#define s08   c4
+#define s09   c5
+#define s0A   m4
+#define s0B   m5
+#define s0C   c6
+#define s0D   c7
+#define s0E   m6
+#define s0F   m7
+#define s10   m8
+#define s11   m9
+#define s12   c8
+#define s13   c9
+#define s14   mA
+#define s15   mB
+#define s16   cA
+#define s17   cB
+#define s18   cC
+#define s19   cD
+#define s1A   mC
+#define s1B   mD
+#define s1C   cE
+#define s1D   cF
+#define s1E   mE
+#define s1F   mF	/* m0..mF are locals of the callers; presumably filled by INPUT_BIG (not defined in this file) */
+
+#define ROUND_BIG(rc, alpha)   do { /* one round on s00..s1F: XOR all 32 alpha words (rc folded into s01), 8 SBOX calls, then 8 + 4 L calls */ \
+		s00 ^= alpha[0x00]; \
+		s01 ^= alpha[0x01] ^ (sph_u32)(rc); \
+		s02 ^= alpha[0x02]; \
+		s03 ^= alpha[0x03]; \
+		s04 ^= alpha[0x04]; \
+		s05 ^= alpha[0x05]; \
+		s06 ^= alpha[0x06]; \
+		s07 ^= alpha[0x07]; \
+		s08 ^= alpha[0x08]; \
+		s09 ^= alpha[0x09]; \
+		s0A ^= alpha[0x0A]; \
+		s0B ^= alpha[0x0B]; \
+		s0C ^= alpha[0x0C]; \
+		s0D ^= alpha[0x0D]; \
+		s0E ^= alpha[0x0E]; \
+		s0F ^= alpha[0x0F]; \
+		s10 ^= alpha[0x10]; \
+		s11 ^= alpha[0x11]; \
+		s12 ^= alpha[0x12]; \
+		s13 ^= alpha[0x13]; \
+		s14 ^= alpha[0x14]; \
+		s15 ^= alpha[0x15]; \
+		s16 ^= alpha[0x16]; \
+		s17 ^= alpha[0x17]; \
+		s18 ^= alpha[0x18]; \
+		s19 ^= alpha[0x19]; \
+		s1A ^= alpha[0x1A]; \
+		s1B ^= alpha[0x1B]; \
+		s1C ^= alpha[0x1C]; \
+		s1D ^= alpha[0x1D]; \
+		s1E ^= alpha[0x1E]; \
+		s1F ^= alpha[0x1F]; \
+		SBOX(s00, s08, s10, s18); \
+		SBOX(s01, s09, s11, s19); \
+		SBOX(s02, s0A, s12, s1A); \
+		SBOX(s03, s0B, s13, s1B); \
+		SBOX(s04, s0C, s14, s1C); \
+		SBOX(s05, s0D, s15, s1D); \
+		SBOX(s06, s0E, s16, s1E); \
+		SBOX(s07, s0F, s17, s1F); \
+		L(s00, s09, s12, s1B); \
+		L(s01, s0A, s13, s1C); \
+		L(s02, s0B, s14, s1D); \
+		L(s03, s0C, s15, s1E); \
+		L(s04, s0D, s16, s1F); \
+		L(s05, s0E, s17, s18); \
+		L(s06, s0F, s10, s19); \
+		L(s07, s08, s11, s1A); \
+/*if (rc == 0 ) { \
+printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
+}*/ \
+		L(s00, s02, s05, s07); \
+		L(s10, s13, s15, s16); \
+/*if (rc == 0 ) { \
+printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
+}*/ \
+		L(s09, s0B, s0C, s0E); \
+		L(s19, s1A, s1C, s1F); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_HAMSI
+/* Small-footprint build: the rounds are written as loops. */
+#define P_BIG   do { /* per-block permutation: 6 rounds with alpha_n */ \
+		unsigned r; \
+		for (r = 0; r < 6; r ++) \
+			ROUND_BIG(r, alpha_n); \
+	} while (0)
+
+#define PF_BIG   do { /* final-block permutation: 12 rounds with alpha_f */ \
+		unsigned r; \
+		for (r = 0; r < 12; r ++) \
+			ROUND_BIG(r, alpha_f); \
+	} while (0)
+
+#else
+/* Default build: the same rounds, unrolled. */
+#define P_BIG   do { /* per-block permutation: 6 rounds with alpha_n */ \
+		ROUND_BIG(0, alpha_n); \
+/*printf("S R0 s00 %08lx s01 %08lx s02 %08lx s03 %08lx\n",s00,s01,s02,s03); \
+printf("S R0 s04 %08lx s05 %08lx s06 %08lx s07 %08lx\n",s04,s05,s06,s07); \
+printf("S R0 s08 %08lx s09 %08lx s0A %08lx s0B %08lx\n",s08,s09,s0A,s0B); \
+printf("S R0 s0C %08lx s0D %08lx s0E %08lx s0F %08lx\n",s0C,s0D,s0E,s0F); \
+printf("S R0 s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \
+printf("S R0 s14 %08lx s15 %08lx s16 %08lx s17 %08lx\n",s14,s15,s16,s17); \
+printf("S R0 s18 %08lx s19 %08lx s1A %08lx s1B %08lx\n",s18,s19,s1A,s1B); \
+printf("S R0 s1C %08lx s1D %08lx s1E %08lx s1F %08lx\n",s1C,s1D,s1E,s1F); \
+*/\
+		ROUND_BIG(1, alpha_n); \
+		ROUND_BIG(2, alpha_n); \
+		ROUND_BIG(3, alpha_n); \
+		ROUND_BIG(4, alpha_n); \
+		ROUND_BIG(5, alpha_n); \
+	} while (0)
+
+#define PF_BIG   do { /* final-block permutation: 12 rounds with alpha_f */ \
+		ROUND_BIG(0, alpha_f); \
+		ROUND_BIG(1, alpha_f); \
+		ROUND_BIG(2, alpha_f); \
+		ROUND_BIG(3, alpha_f); \
+		ROUND_BIG(4, alpha_f); \
+		ROUND_BIG(5, alpha_f); \
+		ROUND_BIG(6, alpha_f); \
+		ROUND_BIG(7, alpha_f); \
+		ROUND_BIG(8, alpha_f); \
+		ROUND_BIG(9, alpha_f); \
+		ROUND_BIG(10, alpha_f); \
+		ROUND_BIG(11, alpha_f); \
+	} while (0)
+
+#endif
+
+#define T_BIG   do { /* XOR selected round-state words into sc->h and refresh c0..cF */ \
+		/* order is important: s17, s16, s13, s12, s07, s06, s03, s02 alias cB, cA, c9, c8, c3, c2, c1, c0, so each is read before that c* is reassigned */ \
+		cF = (sc->h[0xF] ^= s17); \
+		cE = (sc->h[0xE] ^= s16); \
+		cD = (sc->h[0xD] ^= s15); \
+		cC = (sc->h[0xC] ^= s14); \
+		cB = (sc->h[0xB] ^= s13); \
+		cA = (sc->h[0xA] ^= s12); \
+		c9 = (sc->h[0x9] ^= s11); \
+		c8 = (sc->h[0x8] ^= s10); \
+		c7 = (sc->h[0x7] ^= s07); \
+		c6 = (sc->h[0x6] ^= s06); \
+		c5 = (sc->h[0x5] ^= s05); \
+		c4 = (sc->h[0x4] ^= s04); \
+		c3 = (sc->h[0x3] ^= s03); \
+		c2 = (sc->h[0x2] ^= s02); \
+		c1 = (sc->h[0x1] ^= s01); \
+		c0 = (sc->h[0x0] ^= s00); \
+	} while (0)
+
+static void
+hamsi_big(sph_hamsi_big_context *sc, const unsigned char *buf, size_t num)
+{
+	DECL_STATE_BIG
+#if !SPH_64
+	sph_u32 tmp;
+#endif
+	/* Compress num consecutive 8-byte blocks from buf; the count grows by 64 per block. */
+#if SPH_64
+	sc->count += (sph_u64)num << 6;
+#else
+	tmp = SPH_T32((sph_u32)num << 6);	/* low 32 bits of num * 64 */
+	sc->count_low = SPH_T32(sc->count_low + tmp);
+	sc->count_high += (sph_u32)((num >> 13) >> 13);	/* num >> 26: the part of num * 64 above bit 31 */
+	if (sc->count_low < tmp)	/* low word wrapped around: propagate the carry */
+		sc->count_high ++;
+#endif
+	READ_STATE_BIG(sc);
+/*
+uint32_t* b = (uint32_t*)buf;
+//printf("S s64: %016llx\n",*ss);
+//printf("S buf: %08lx %08lx\n",b[0], b[1]);
+
+int n1 = 1;
+int n2 = 1;
+*/
+	while (num -- > 0) {
+		sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
+		sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
+		/* INPUT_BIG is not defined in this file; presumably it fills m0..mF from buf */
+		INPUT_BIG;
+/*if ( n1 ) 
+{
+n1 = 0;
+printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m0,m1,m2,m3 );
+printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m4,m5,m6,m7);
+printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m8,m9,mA,mB );
+printf("S INPUT m: %08lx %08lx %08lx %08lx\n",mC,mD,mE,mF);
+}
+*/
+
+		P_BIG;
+
+/*if ( n2 )        
+{
+n2 = 0;
+printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s00,s01,s02,s03 );
+printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s04,s05,s06,s07);
+printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s08,s09,s0A,s0B );
+printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s0C,s0D,s0E,s0F);
+}
+*/
+
+		T_BIG;
+		buf += 8;
+	}
+	WRITE_STATE_BIG(sc);
+}
+
+static void
+hamsi_big_final(sph_hamsi_big_context *sc, const unsigned char *buf)
+{
+	sph_u32 m0, m1, m2, m3, m4, m5, m6, m7;
+	sph_u32 m8, m9, mA, mB, mC, mD, mE, mF;
+	DECL_STATE_BIG
+	/* Like one hamsi_big step, but with PF_BIG and no change to the count. */
+	READ_STATE_BIG(sc);
+	INPUT_BIG;
+	PF_BIG;
+	T_BIG;
+	WRITE_STATE_BIG(sc);
+}
+
+static void
+hamsi_big_init(sph_hamsi_big_context *sc, const sph_u32 *iv)
+{
+	sc->partial_len = 0;	/* no buffered input bytes */
+	memcpy(sc->h, iv, sizeof sc->h);	/* iv must supply at least sizeof sc->h bytes */
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = sc->count_low = 0;
+#endif
+}
+
+static void
+hamsi_big_core(sph_hamsi_big_context *sc, const void *data, size_t len)
+{
+uint64_t* d = (uint64_t*)data;	/* debug only: referenced solely by the commented-out printf calls below; the cast drops const */
+uint64_t* h = (uint64_t*)sc->h;	/* debug only: referenced solely by the commented-out printf calls below */
+/*
+printf("S core1 len = %d\n",len);
+printf("S data: %016llx %016llx %016llx %016llx\n",d[0],d[1],d[2],d[3]);
+printf("S data: %016llx %016llx %016llx %016llx\n",d[4],d[5],d[6],d[7]);
+printf("S H:    %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
+*/
+	if (sc->partial_len != 0) {	/* bytes are pending from an earlier call */
+//printf("WARNING partial_len != 0\n");
+
+		size_t mlen;
+		/* mlen = bytes still missing to complete the pending 8-byte block */
+		mlen = 8 - sc->partial_len;
+		if (len < mlen) {
+			memcpy(sc->partial + sc->partial_len, data, len);
+			sc->partial_len += len;
+			return;
+		} else {
+			memcpy(sc->partial + sc->partial_len, data, mlen);
+			len -= mlen;
+			data = (const unsigned char *)data + mlen;
+			hamsi_big(sc, sc->partial, 1);
+			sc->partial_len = 0;
+		}
+	}
+	/* Process the whole 8-byte blocks, then keep the 0..7 trailing bytes in sc->partial. */
+	hamsi_big(sc, data, (len >> 3));
+/*
+printf("S core2\n");
+printf("S H:    %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
+*/
+	data = (const unsigned char *)data + (len & ~(size_t)7);
+	len &= (size_t)7;
+	memcpy(sc->partial, data, len);
+	sc->partial_len = len;
+}
+
+static void
+hamsi_big_close(sph_hamsi_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	unsigned char pad[8];	/* 64-bit big-endian bit count, fed to hamsi_big_final */
+	size_t ptr, u;
+	unsigned z;
+	unsigned char *out;
+//uint64_t* h = (uint64_t*)sc->h;
+	/* Finalize: n extra bits come from ub; the digest is written big-endian to dst. */
+	ptr = sc->partial_len;
+#if SPH_64
+	sph_enc64be(pad, sc->count + (ptr << 3) + n);
+#else
+	sph_enc32be(pad, sc->count_high);
+	sph_enc32be(pad + 4, sc->count_low + (ptr << 3) + n);
+#endif
+	z = 0x80 >> n;
+	sc->partial[ptr ++] = ((ub & -z) | z) & 0xFF;	/* keep the top n bits of ub, then set the next bit as padding marker */
+	while (ptr < 8)
+		sc->partial[ptr ++] = 0;
+
+//printf("S close1\n");
+//printf("S H:    %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
+
+	hamsi_big(sc, sc->partial, 1);	/* last data block, zero-padded to 8 bytes */
+
+//printf("S close2\n");
+//printf("S H:    %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
+
+
+	hamsi_big_final(sc, pad);
+
+//printf("S close3\n");
+//printf("S H:    %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]);
+
+
+	out = dst;
+	if (out_size_w32 == 12) {	/* 12-word output: state words 2, 7, 11 and 14 are left out */
+		sph_enc32be(out +  0, sc->h[ 0]);
+		sph_enc32be(out +  4, sc->h[ 1]);
+		sph_enc32be(out +  8, sc->h[ 3]);
+		sph_enc32be(out + 12, sc->h[ 4]);
+		sph_enc32be(out + 16, sc->h[ 5]);
+		sph_enc32be(out + 20, sc->h[ 6]);
+		sph_enc32be(out + 24, sc->h[ 8]);
+		sph_enc32be(out + 28, sc->h[ 9]);
+		sph_enc32be(out + 32, sc->h[10]);
+		sph_enc32be(out + 36, sc->h[12]);
+		sph_enc32be(out + 40, sc->h[13]);
+		sph_enc32be(out + 44, sc->h[15]);
+	} else {	/* any other out_size_w32: all 16 state words are written */
+		for (u = 0; u < 16; u ++)
+			sph_enc32be(out + (u << 2), sc->h[u]);
+	}
+}
+
+/* Hamsi-224 init: loads IV224 into the context cc (see sph_hamsi.h). */
+void
+sph_hamsi224_init(void *cc)
+{
+	hamsi_small_init(cc, IV224);
+}
+
+/* Hamsi-224 update: absorbs len bytes from data (see sph_hamsi.h). */
+void
+sph_hamsi224(void *cc, const void *data, size_t len)
+{
+	hamsi_small_core(cc, data, len);
+}
+
+/* Hamsi-224 close: writes 7 32-bit words to dst (see sph_hamsi.h). */
+void
+sph_hamsi224_close(void *cc, void *dst)
+{
+	hamsi_small_close(cc, 0, 0, dst, 7);
+//	hamsi_small_init(cc, IV224);	(disabled: the context is not re-initialised after close)
+}
+
+/* Hamsi-224 close with the top n bits of ub appended first (see sph_hamsi.h). */
+void
+sph_hamsi224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	hamsi_small_close(cc, ub, n, dst, 7);
+//	hamsi_small_init(cc, IV224);	(disabled: the context is not re-initialised after close)
+}
+
+/* Hamsi-256 init: loads IV256 into the context cc (see sph_hamsi.h). */
+void
+sph_hamsi256_init(void *cc)
+{
+	hamsi_small_init(cc, IV256);
+}
+
+/* Hamsi-256 update: absorbs len bytes from data (see sph_hamsi.h). */
+void
+sph_hamsi256(void *cc, const void *data, size_t len)
+{
+	hamsi_small_core(cc, data, len);
+}
+
+/* Hamsi-256 close: writes 8 32-bit words to dst (see sph_hamsi.h). */
+void
+sph_hamsi256_close(void *cc, void *dst)
+{
+	hamsi_small_close(cc, 0, 0, dst, 8);
+//	hamsi_small_init(cc, IV256);	(disabled: the context is not re-initialised after close)
+}
+
+/* Hamsi-256 close with the top n bits of ub appended first (see sph_hamsi.h). */
+void
+sph_hamsi256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	hamsi_small_close(cc, ub, n, dst, 8);
+//	hamsi_small_init(cc, IV256);	(disabled: the context is not re-initialised after close)
+}
+
+/* Hamsi-384 init: loads IV384 into the context cc (see sph_hamsi.h). */
+void
+sph_hamsi384_init(void *cc)
+{
+	hamsi_big_init(cc, IV384);
+}
+
+/* Hamsi-384 update: absorbs len bytes from data (see sph_hamsi.h). */
+void
+sph_hamsi384(void *cc, const void *data, size_t len)
+{
+	hamsi_big_core(cc, data, len);
+}
+
+/* Hamsi-384 close: writes 12 selected 32-bit state words to dst (see sph_hamsi.h). */
+void
+sph_hamsi384_close(void *cc, void *dst)
+{
+	hamsi_big_close(cc, 0, 0, dst, 12);
+//	hamsi_big_init(cc, IV384);	(disabled: the context is not re-initialised after close)
+}
+
+/* Hamsi-384 close with the top n bits of ub appended first (see sph_hamsi.h). */
+void
+sph_hamsi384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	hamsi_big_close(cc, ub, n, dst, 12);
+//	hamsi_big_init(cc, IV384);	(disabled: the context is not re-initialised after close)
+}
+
+/* Hamsi-512 init: loads IV512 into the context cc (see sph_hamsi.h). */
+void
+sph_hamsi512_init(void *cc)
+{
+	hamsi_big_init(cc, IV512);
+}
+
+/* Hamsi-512 update: absorbs len bytes from data (see sph_hamsi.h). */
+void
+sph_hamsi512(void *cc, const void *data, size_t len)
+{
+	hamsi_big_core(cc, data, len);
+}
+
+/* Hamsi-512 close: writes all 16 32-bit state words to dst (see sph_hamsi.h). */
+void
+sph_hamsi512_close(void *cc, void *dst)
+{
+	hamsi_big_close(cc, 0, 0, dst, 16);
+//	hamsi_big_init(cc, IV512);	(disabled: the context is not re-initialised after close)
+}
+
+/* Hamsi-512 close with the top n bits of ub appended first (see sph_hamsi.h). */
+void
+sph_hamsi512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	hamsi_big_close(cc, ub, n, dst, 16);
+//	hamsi_big_init(cc, IV512);	(disabled: the context is not re-initialised after close)
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/algo/haval/haval-4way-helper.c
+++ b/algo/haval/haval-4way-helper.c
@@ -0,0 +1,115 @@
+/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
+/*
+ * Helper code, included (three times !) by HAVAL implementation.
+ *
+ * TODO: try to merge this with md_helper.c.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)   /* extra level so macro arguments such as PASSES expand before pasting */
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)   a ## b            /* the actual token paste */
+
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
+( haval_4way_context *sc, const void *data, size_t len )
+{
+   __m128i *vdata = (__m128i*)data;
+   unsigned current;
+// Buffer input in sc->buf; run the core each time 128 bytes are buffered.
+   current = (unsigned)sc->count_low & 127U;   // byte offset inside the current block
+   while ( len > 0 )
+   {
+      unsigned clen;
+      sph_u32 clow, clow2;
+// clen = bytes taken this pass: up to the end of the block, capped by len
+      clen = 128U - current;
+      if ( clen > len )
+         clen = len;
+      memcpy_128( sc->buf + (current>>2), vdata, clen>>2 );   // index and count in 4-byte units; assumes clen is a multiple of 4 - TODO confirm
+      vdata += clen>>2;
+      current += clen;
+      len -= clen;
+      if ( current == 128U )
+      {
+         DSTATE;
+         IN_PREPARE(sc->buf);
+         RSTATE;
+         SPH_XCAT(CORE, PASSES)(INW);
+         WSTATE;
+         current = 0;
+      }
+      clow = sc->count_low;
+      clow2 = SPH_T32(clow + clen);
+      sc->count_low = clow2;
+      if ( clow2 < clow )    // 32-bit byte counter wrapped: carry into the high word
+         sc->count_high ++;
+   }
+}
+
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
+                                                void *dst)
+{
+   unsigned current;
+   DSTATE;
+// Finalize: pad the buffer, append the trailer words, run the core, emit dst.
+   current = (unsigned)sc->count_low & 127UL;   // byte offset inside the current block
+// NOTE(review): m128_one_32 is defined elsewhere; presumably 1 in each 32-bit lane
+   sc->buf[ current>>2 ] = m128_one_32;
+   current += 4;   
+   RSTATE;
+   if ( current > 116UL )   // no room left for the 12 trailer bytes: flush a zero-filled block first
+   {
+      memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
+      do
+      {
+         IN_PREPARE(sc->buf);
+         SPH_XCAT(CORE, PASSES)(INW);
+      } while (0);
+      current = 0;
+   }
+// Trailer: words at byte offsets 116, 120 and 124 of the block.
+   uint32_t t1, t2;
+   memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
+   t1 = 0x01 | (PASSES << 3);
+   t2 = sc->olen << 3;
+   sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );   // t1 in bits 16-23, t2 from bit 24 up
+   sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 );            // low word of count * 8
+   sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3)            // high word of count * 8
+                                     | (sc->count_low >> 29) );
+   do
+   {
+      IN_PREPARE(sc->buf);
+      SPH_XCAT(CORE, PASSES)(INW);
+   } while (0);
+   WSTATE;
+   haval_4way_out( sc, dst );
+}
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -0,0 +1,522 @@
+/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * HAVAL implementation.
+ *
+ * The HAVAL reference paper is of questionable clarity with regards to
+ * some details such as endianness of bits within a byte, bytes within
+ * a 32-bit word, or the actual ordering of words within a stream of
+ * words. This implementation has been made compatible with the reference
+ * implementation available on: http://labs.calyptix.com/haval.php
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include "haval-hash-4way.h"
+
+#if defined (__AVX__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL
+#define SPH_SMALL_FOOTPRINT_HAVAL   1
+//#endif
+
+/*
+ * F1: boolean function used by the pass-1 wrappers (FPn_1), evaluated
+ * bitwise on whole __m128i values:
+ *    x0 ^ ((x0 ^ x4) & x1) ^ (x2 & x5) ^ (x3 & x6)
+ * Arguments are only ever passed on to intrinsics, so they need no extra
+ * parentheses. The definition must not end with a line continuation,
+ * otherwise it would swallow whatever line follows it.
+ */
+#define F1(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( x0, \
+       _mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
+                      _mm_xor_si128( _mm_and_si128( x2, x5 ), \
+                                     _mm_and_si128( x3, x6 ) ) ) )
+
+/*
+ * F2: boolean function used by the pass-2 wrappers (FPn_2), evaluated
+ * bitwise on whole __m128i values:
+ *    (x2 & ((~x3 & x1) ^ (x4 & x5) ^ x6 ^ x0))
+ *       ^ (x4 & (x1 ^ x5)) ^ (x3 & x5) ^ x0
+ * _mm_andnot_si128( a, b ) computes (~a) & b. The definition must not end
+ * with a line continuation, otherwise it would swallow the next line.
+ */
+#define F2(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( \
+      _mm_and_si128( x2, \
+         _mm_xor_si128( _mm_andnot_si128( x3, x1 ), \
+                        _mm_xor_si128( _mm_and_si128( x4, x5 ), \
+                                       _mm_xor_si128( x6, x0 ) ) ) ), \
+         _mm_xor_si128( \
+             _mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \
+             _mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) )
+
+/*
+ * F3: boolean function used by the pass-3 wrappers (FPn_3), evaluated
+ * bitwise on whole __m128i values:
+ *    (x3 & ((x1 & x2) ^ x6 ^ x0)) ^ (x1 & x4) ^ (x2 & x5) ^ x0
+ */
+#define F3(x6, x5, x4, x3, x2, x1, x0) \
+  _mm_xor_si128( \
+    _mm_and_si128( x3, \
+      _mm_xor_si128( _mm_and_si128( x1, x2 ), \
+                     _mm_xor_si128( x6, x0 ) ) ), \
+      _mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \
+                                   _mm_and_si128( x2, x5 ) ), x0 ) )
+
+/*
+ * F4: boolean function used by the pass-4 wrappers (FPn_4), evaluated
+ * bitwise on whole __m128i values:
+ *    (x3 & ((x1 & x2) ^ (x4 | x6) ^ x5))
+ *       ^ (x4 & ((~x2 & x5) ^ x1 ^ x6 ^ x0)) ^ (x2 & x6) ^ x0
+ * The (~x2 & x5) term uses _mm_andnot_si128( x2, x5 ), which computes
+ * (~a) & b in a single operation, as F2 already does.
+ */
+#define F4(x6, x5, x4, x3, x2, x1, x0) \
+  _mm_xor_si128( \
+     _mm_xor_si128( \
+        _mm_and_si128( x3, \
+           _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
+                                         _mm_or_si128( x4, x6 ) ), x5 ) ), \
+        _mm_and_si128( x4, \
+           _mm_xor_si128( _mm_xor_si128( _mm_andnot_si128( x2, x5 ), \
+                          _mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
+     _mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )
+
+
+/*
+ * F5: boolean function used by the pass-5 wrapper (FP5_5), evaluated
+ * bitwise on whole __m128i values:
+ *    (x0 & ~((x1 & x2 & x3) ^ x5)) ^ (x1 & x4) ^ (x2 & x5) ^ (x3 & x6)
+ * The "x0 & ~(...)" term uses _mm_andnot_si128( a, b ) == (~a) & b, so no
+ * separate complement is needed.
+ */
+#define F5(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( \
+       _mm_andnot_si128( \
+            _mm_xor_si128( \
+                    _mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ), \
+            x0 ), \
+      _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
+                                    _mm_and_si128( x2, x5 ) ), \
+                                    _mm_and_si128( x3, x6 ) ) )
+
+/*
+ * The macros below integrate the phi() permutations, depending on the
+ * pass and the total number of passes.
+ */
+
+/*
+ * FPn_p(x6..x0) applies Fp to a fixed reordering of its seven arguments;
+ * "n" is the total number of passes and "p" the current pass. STEP picks
+ * the wrapper by pasting its "n" and "p" arguments into the name.
+ */
+
+/* Wrappers for a 3-pass computation. */
+#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x1, x0, x3, x5, x6, x2, x4)
+#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x4, x2, x1, x0, x5, x3, x6)
+#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x6, x1, x2, x3, x4, x5, x0)
+
+/* Wrappers for a 4-pass computation. */
+#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x2, x6, x1, x4, x5, x3, x0)
+#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x3, x5, x2, x0, x1, x6, x4)
+#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x1, x4, x3, x6, x0, x2, x5)
+#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x6, x4, x0, x5, x2, x1, x3)
+
+/* Wrappers for a 5-pass computation (the only ones reached through
+   CORE5, which is the only core macro not commented out in this file). */
+#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x3, x4, x1, x0, x5, x2, x6)
+#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x6, x2, x1, x0, x3, x4, x5)
+#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x2, x6, x0, x4, x3, x1, x5)
+#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x1, x5, x3, x2, x0, x4, x6)
+#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \
+	F5(x2, x5, x0, x6, x4, x3, x1)
+
+/*
+ * One step, for "n" passes, pass number "p" (1 <= p <= n), using
+ * input word number "w" and step constant "c".
+ *
+ * Computes, independently in every 32-bit element:
+ *    x7 = rotr(FPn_p(x6..x0), 7) + rotr(x7, 11) + w + c
+ * "x7" is both read and overwritten; "w" is a __m128i and "c" a scalar
+ * that is broadcast with _mm_set1_epi32. mm_rotr_32 is defined outside
+ * this file; presumably a per-element 32-bit rotate right -- TODO confirm.
+ */
+#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
+do { \
+   __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
+   x7 = _mm_add_epi32( _mm_add_epi32( mm_rotr_32( t, 7 ), \
+                                      mm_rotr_32( x7, 11 ) ), \
+                       _mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
+} while (0)
+
+/*
+ * PASSy(n, in) computes pass number "y", for a total of "n", using the
+ * one-argument macro "in" to access input words. Current state is assumed
+ * to be held in variables "s0" to "s7".
+ */
+
+//#if SPH_SMALL_FOOTPRINT_HAVAL
+
+/*
+ * PASS1(n, in): first pass of an n-pass computation. Runs 32 steps as
+ * four loop iterations of eight unrolled STEPs. The state variables are
+ * rotated by one position from step to step, so each of s7..s0 is
+ * rewritten exactly once per iteration. Input words are consumed in
+ * natural order, in(0) .. in(31), and the step constant is always zero.
+ */
+#define PASS1(n, in)   do { \
+		unsigned pass_count; \
+		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+			STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(pass_count + 0), SPH_C32(0x00000000)); \
+			STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(pass_count + 1), SPH_C32(0x00000000)); \
+			STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(pass_count + 2), SPH_C32(0x00000000)); \
+			STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(pass_count + 3), SPH_C32(0x00000000)); \
+			STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(pass_count + 4), SPH_C32(0x00000000)); \
+			STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(pass_count + 5), SPH_C32(0x00000000)); \
+			STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(pass_count + 6), SPH_C32(0x00000000)); \
+			STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(pass_count + 7), SPH_C32(0x00000000)); \
+   		} \
+	} while (0)
+
+/*
+ * PASSG(p, n, in): generic pass number "p" (2..5) of an n-pass
+ * computation. Same step/rotation layout as PASS1, but step i reads
+ * input word in(MPp[i]) and adds the constant RKp[i]; the table names
+ * are formed by pasting "p" onto MP and RK.
+ */
+#define PASSG(p, n, in)   do { \
+		unsigned pass_count; \
+		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+			STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(MP ## p[pass_count + 0]), \
+				RK ## p[pass_count + 0]); \
+			STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(MP ## p[pass_count + 1]), \
+				RK ## p[pass_count + 1]); \
+			STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(MP ## p[pass_count + 2]), \
+				RK ## p[pass_count + 2]); \
+			STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(MP ## p[pass_count + 3]), \
+				RK ## p[pass_count + 3]); \
+			STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(MP ## p[pass_count + 4]), \
+				RK ## p[pass_count + 4]); \
+			STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(MP ## p[pass_count + 5]), \
+				RK ## p[pass_count + 5]); \
+			STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(MP ## p[pass_count + 6]), \
+				RK ## p[pass_count + 6]); \
+			STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(MP ## p[pass_count + 7]), \
+				RK ## p[pass_count + 7]); \
+   		} \
+	} while (0)
+
+/* Passes 2 to 5 are PASSG bound to their pass number. */
+#define PASS2(n, in)    PASSG(2, n, in)
+#define PASS3(n, in)    PASSG(3, n, in)
+#define PASS4(n, in)    PASSG(4, n, in)
+#define PASS5(n, in)    PASSG(5, n, in)
+
+/*
+ * Message word order tables: step i of pass p reads input word MPp[i]
+ * (see PASSG). Each table lists every index 0..31 exactly once.
+ */
+
+/* Word order for pass 2. */
+static const unsigned MP2[32] = {
+	 5, 14, 26, 18, 11, 28,  7, 16,
+	 0, 23, 20, 22,  1, 10,  4,  8,
+	30,  3, 21,  9, 17, 24, 29,  6,
+	19, 12, 15, 13,  2, 25, 31, 27
+};
+
+/* Word order for pass 3. */
+static const unsigned MP3[32] = {
+	19,  9,  4, 20, 28, 17,  8, 22,
+	29, 14, 25, 12, 24, 30, 16, 26,
+	31, 15,  7,  3,  1,  0, 18, 27,
+	13,  6, 21, 10, 23, 11,  5,  2
+};
+
+/* Word order for pass 4. */
+static const unsigned MP4[32] = {
+	24,  4,  0, 14,  2,  7, 28, 23,
+	26,  6, 30, 20, 18, 25, 19,  3,
+	22, 11, 31, 21,  8, 27, 12,  9,
+	 1, 29,  5, 15, 17, 10, 16, 13
+};
+
+/* Word order for pass 5. */
+static const unsigned MP5[32] = {
+	27,  3, 21, 26, 17, 11, 20, 29,
+	19,  0, 12,  7, 13,  8, 31, 10,
+	 5,  9, 14, 30, 18,  6, 28, 24,
+	 2, 23, 16, 22,  4,  1, 25, 15
+};
+
+/*
+ * Step constant tables: step i of pass p adds RKp[i] to the new state
+ * word (see PASSG and STEP). Pass 1 has no table; it uses zero.
+ */
+
+/* Step constants for pass 2. */
+static const sph_u32 RK2[32] = {
+	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
+	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
+	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
+	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917),
+	SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B),
+	SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC),
+	SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7),
+	SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96),
+	SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99),
+	SPH_C32(0x24A19947), SPH_C32(0xB3916CF7),
+	SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16),
+	SPH_C32(0x636920D8), SPH_C32(0x71574E69),
+	SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E),
+	SPH_C32(0x0D95748F), SPH_C32(0x728EB658),
+	SPH_C32(0x718BCD58), SPH_C32(0x82154AEE),
+	SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
+};
+
+/* Step constants for pass 3. */
+static const sph_u32 RK3[32] = {
+	SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
+	SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
+	SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
+	SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E),
+	SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E),
+	SPH_C32(0xD71577C1), SPH_C32(0xBD314B27),
+	SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60),
+	SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94),
+	SPH_C32(0x57489862), SPH_C32(0x63E81440),
+	SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6),
+	SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE),
+	SPH_C32(0xA15486AF), SPH_C32(0x7C72E993),
+	SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A),
+	SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6),
+	SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E),
+	SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
+};
+
+/* Step constants for pass 4. */
+static const sph_u32 RK4[32] = {
+	SPH_C32(0x7A325381), SPH_C32(0x28958677),
+	SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
+	SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
+	SPH_C32(0x61D809CC), SPH_C32(0xFB21A991),
+	SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032),
+	SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1),
+	SPH_C32(0xDC262302), SPH_C32(0xEB651B88),
+	SPH_C32(0x23893E81), SPH_C32(0xD396ACC5),
+	SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239),
+	SPH_C32(0x2E0B4482), SPH_C32(0xA4842004),
+	SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E),
+	SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A),
+	SPH_C32(0x670C9C61), SPH_C32(0xABD388F0),
+	SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68),
+	SPH_C32(0x960FA728), SPH_C32(0xAB5133A3),
+	SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
+};
+
+/* Step constants for pass 5. */
+static const sph_u32 RK5[32] = {
+	SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
+	SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
+	SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
+	SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4),
+	SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE),
+	SPH_C32(0xE06F75D8), SPH_C32(0x85C12073),
+	SPH_C32(0x401A449F), SPH_C32(0x56C16AA6),
+	SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706),
+	SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D),
+	SPH_C32(0x37D0D724), SPH_C32(0xD00A1248),
+	SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B),
+	SPH_C32(0x075372C9), SPH_C32(0x80991B7B),
+	SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7),
+	SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B),
+	SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA),
+	SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4)
+};
+
+/*
+ * SAVE_STATE declares u0..u7 in the enclosing scope and snapshots the
+ * running state s0..s7 into them. It expands to a declaration, so it
+ * must be used where declarations are allowed and be followed by ";".
+ */
+#define SAVE_STATE \
+   __m128i u0 = s0, u1 = s1, u2 = s2, u3 = s3, \
+           u4 = s4, u5 = s5, u6 = s6, u7 = s7
+
+/*
+ * UPDATE_STATE folds the snapshot u0..u7 back into the running state:
+ * every 32-bit element of sN becomes uN + sN (wrapping addition).
+ */
+#define UPDATE_STATE \
+do { \
+   s0 = _mm_add_epi32( u0, s0 ); \
+   s1 = _mm_add_epi32( u1, s1 ); \
+   s2 = _mm_add_epi32( u2, s2 ); \
+   s3 = _mm_add_epi32( u3, s3 ); \
+   s4 = _mm_add_epi32( u4, s4 ); \
+   s5 = _mm_add_epi32( u5, s5 ); \
+   s6 = _mm_add_epi32( u6, s6 ); \
+   s7 = _mm_add_epi32( u7, s7 ); \
+} while (0)
+
+/*
+ * COREn(in) performs the core HAVAL computation for "n" passes, using
+ * the one-argument macro "in" to access the input words. Running state
+ * is held in variable "s0" to "s7".
+ */
+/*
+#define CORE3(in)  do { \
+		SAVE_STATE; \
+		PASS1(3, in); \
+		PASS2(3, in); \
+		PASS3(3, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+#define CORE4(in)  do { \
+		SAVE_STATE; \
+		PASS1(4, in); \
+		PASS2(4, in); \
+		PASS3(4, in); \
+		PASS4(4, in); \
+		UPDATE_STATE; \
+	} while (0)
+*/
+/*
+ * CORE5(in): one 5-pass compression of the block reachable through the
+ * accessor macro "in". Snapshots the state, runs passes 1 to 5 over it,
+ * then adds the snapshot back. Requires s0..s7 to be in scope.
+ */
+#define CORE5(in)  do { \
+		SAVE_STATE; \
+		PASS1(5, in); \
+		PASS2(5, in); \
+		PASS3(5, in); \
+		PASS4(5, in); \
+		PASS5(5, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+/*
+ * DSTATE declares the state variables "s0" to "s7". Each one is a
+ * __m128i, i.e. it carries the same state word for four lanes.
+ */
+#define DSTATE   __m128i s0, s1, s2, s3, s4, s5, s6, s7
+
+/*
+ * RSTATE fills the state variables from the context "sc". A pointer
+ * named "sc" and the DSTATE variables must be in scope at the point of
+ * use.
+ */
+#define RSTATE \
+do { \
+   s0 = sc->s0; \
+   s1 = sc->s1; \
+   s2 = sc->s2; \
+   s3 = sc->s3; \
+   s4 = sc->s4; \
+   s5 = sc->s5; \
+   s6 = sc->s6; \
+   s7 = sc->s7; \
+} while (0)
+
+/*
+ * WSTATE updates the context "sc" from the state variables; it is the
+ * exact inverse of RSTATE and has the same scope requirements.
+ */
+#define WSTATE \
+do { \
+   sc->s0 = s0; \
+   sc->s1 = s1; \
+   sc->s2 = s2; \
+   sc->s3 = s3; \
+   sc->s4 = s4; \
+   sc->s5 = s5; \
+   sc->s6 = s6; \
+   sc->s7 = s7; \
+} while (0)
+
+/*
+ * Reset a 4-way context for a new computation.
+ *
+ * sc      context to initialize
+ * olen    output length in 32-bit words (between 4 and 8, inclusive)
+ * passes  number of passes (3, 4 or 5); only recorded here
+ *
+ * Every lane starts from the same eight initial state words and the
+ * byte counters are cleared.
+ */
+static void
+haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
+{
+   /* parameters and length counters */
+   sc->count_low  = 0;
+   sc->count_high = 0;
+   sc->passes = passes;
+   sc->olen   = olen;
+
+   /* initial state, broadcast to all four lanes */
+   sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL );
+   sc->s6 = _mm_set1_epi32( 0x082EFA98UL );
+   sc->s5 = _mm_set1_epi32( 0x299F31D0UL );
+   sc->s4 = _mm_set1_epi32( 0xA4093822UL );
+   sc->s3 = _mm_set1_epi32( 0x03707344UL );
+   sc->s2 = _mm_set1_epi32( 0x13198A2EUL );
+   sc->s1 = _mm_set1_epi32( 0x85A308D3UL );
+   sc->s0 = _mm_set1_epi32( 0x243F6A88UL );
+}
+
+/*
+ * IN_PREPARE(indata) declares "load_ptr", a read-only view of the block
+ * to compress. Because it is a declaration, two uses in one function
+ * must sit in separate scopes.
+ */
+#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata)
+
+/* INW(i): input word number i of the block, for all four lanes. */
+#define INW(i)   load_ptr[ i ] 
+
+/*
+ * Write out HAVAL output. This 4-way version always stores the full
+ * state, s0 to s7, as eight consecutive __m128i values (128 bytes) at
+ * "dst"; sc->olen is not consulted and no shortening is applied.
+ * "dst" is accessed through a __m128i pointer.
+ */
+static void
+haval_4way_out( haval_4way_context *sc, void *dst )
+{
+   __m128i *out = (__m128i*)dst;
+
+   out[0] = sc->s0;
+   out[1] = sc->s1;
+   out[2] = sc->s2;
+   out[3] = sc->s3;
+   out[4] = sc->s4;
+   out[5] = sc->s5;
+   out[6] = sc->s6;
+   out[7] = sc->s7;
+}
+
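+/*
+ * The main core functions inline the code with the COREx() macros. The
+ * helper file is written to be included once per pass count, which avoids
+ * code copying; this 4-way port only includes it for PASSES = 5 (the
+ * 3- and 4-pass inclusions below are commented out).
+ */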
+/*
+#undef PASSES
+#define PASSES   3
+#include "haval-helper.c"
+
+#undef PASSES
+#define PASSES   4
+#include "haval-helper.c"
+*/
+
+#undef PASSES
+#define PASSES   5
+#include "haval-4way-helper.c"
+
+/* ====================================================================== */
+
+/*
+ * API(xxx, y) defines the three public entry points of the "xxx"-bit,
+ * "y"-pass variant: haval<xxx>_<y>_4way_init, haval<xxx>_<y>_4way and
+ * haval<xxx>_<y>_4way_close. They forward to haval_4way_init (with the
+ * output length converted to 32-bit words, xxx >> 5) and to the
+ * haval<y>_4way / haval<y>_4way_close functions generated by the helper
+ * file. The last line carries no continuation, so the definition ends
+ * at the closing brace regardless of what follows.
+ */
+#define API(xxx, y) \
+void \
+haval ## xxx ## _ ## y ## _4way_init(void *cc) \
+{ \
+	haval_4way_init(cc, xxx >> 5, y); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
+{ \
+	haval ## y ## _4way(cc, data, len); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _4way_close(void *cc, void *dst) \
+{ \
+	haval ## y ## _4way_close(cc, dst); \
+}
+
+API(256, 5)
+
+/*
+ * RVAL loads the state variables s0..s7 from an array named "val";
+ * WVAL stores them back; INMSG(i) reads word i of an array named "msg".
+ * NOTE(review): nothing after these definitions in this file uses them
+ * -- confirm whether they are still needed.
+ */
+#define RVAL \
+do { \
+   s0 = val[0]; \
+   s1 = val[1]; \
+   s2 = val[2]; \
+   s3 = val[3]; \
+   s4 = val[4]; \
+   s5 = val[5]; \
+   s6 = val[6]; \
+   s7 = val[7]; \
+} while (0)
+
+#define WVAL \
+do { \
+   val[0] = s0; \
+   val[1] = s1; \
+   val[2] = s2; \
+   val[3] = s3; \
+   val[4] = s4; \
+   val[5] = s5; \
+   val[6] = s6; \
+   val[7] = s7; \
+} while (0)
+
+#define INMSG(i)   msg[i]
+
+#ifdef __cplusplus
+}
+#endif	
+#endif
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -0,0 +1,95 @@
+/* $Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */
+/**
+ * HAVAL interface.
+ *
+ * HAVAL is actually a family of 15 hash functions, depending on whether
+ * the internal computation uses 3, 4 or 5 passes, and on the output
+ * length, which is 128, 160, 192, 224 or 256 bits. The original
+ * implementation provides interface functions for all 15, which
+ * internally map to three cores (depending on the number of passes);
+ * this 4-way header only declares the 5-pass, 256-bit variant
+ * (haval256_5_4way). Note that output
+ * lengths other than 256 bits are not obtained by a simple truncation
+ * of a longer result; the requested length is encoded within the
+ * padding data.
+ *
+ * HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer
+ * Seberry: "HAVAL -- a one-way hashing algorithm with variable length
+ * of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in
+ * Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993.
+ *
+ * This paper, and a reference implementation, are available on the
+ * Calyptix web site: http://labs.calyptix.com/haval.php
+ *
+ * The HAVAL reference paper is quite unclear on the data encoding
+ * details, i.e. endianness (both byte order within a 32-bit word, and
+ * word order within a message block). This implementation has been
+ * made compatible with the reference implementation referenced above.
+ *
+ * @warning   A collision for HAVAL-128/3 (HAVAL with three passes and
+ * 128-bit output) has been published; this function is thus considered
+ * as cryptographically broken. The status for other variants is unclear;
+ * use only with care.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     haval-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef HAVAL_HASH_4WAY_H__
+#define HAVAL_HASH_4WAY_H__
+
+#if defined(__AVX__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_haval256_5   256
+
+/*
+ * Context for four HAVAL computations run side by side, one per 32-bit
+ * element of each __m128i.
+ */
+typedef struct {
+   /* pending input: one 128-byte block per lane, as 32 words x 4 lanes */
+   __m128i buf[32];
+   /* running hash state, eight words per lane */
+   __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+   /* output length in 32-bit words, and number of passes */
+   unsigned olen, passes;
+   /* bytes absorbed so far per lane (high and low word), one shared
+      counter for all lanes */
+   sph_u32 count_high, count_low;
+} haval_4way_context;
+
+/* Context type for the 5-pass, 256-bit, 4-way variant. */
+typedef haval_4way_context haval256_5_4way_context;
+
+/*
+ * Initialize a context. "cc" is passed on as a haval_4way_context
+ * pointer, so it must point to a haval256_5_4way_context.
+ */
+void haval256_5_4way_init( void *cc );
+
+/*
+ * Absorb input. "data" is read as an array of __m128i, each holding one
+ * 32-bit word for each of the four lanes; "len" is the number of bytes
+ * per lane (the input pointer advances by len/4 vectors).
+ */
+void haval256_5_4way( void *cc, const void *data, size_t len );
+
+/*
+ * Pad, finish the computation and write eight __m128i values (128 bytes)
+ * to "dst", which is accessed through a __m128i pointer.
+ */
+void haval256_5_4way_close( void *cc, void *dst );
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+#endif
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -15,7 +15,7 @@
 #include "algo/shabal/sph_shabal.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/skein/sse2/skein.c"

 #ifndef NO_AES_NI
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -99,6 +99,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
  pthread_barrier_wait( &hodl_barrier );
  return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
 #endif
+  return false;
 }

 bool register_hodl_algo( algo_gate_t* gate )
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -96,34 +96,18 @@ extern "C"{
 do { \
   __m256i cc = _mm256_set_epi64x( c, c, c, c ); \
    x3 = mm256_not( x3 ); \
-    x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_not( x2 ) ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
    x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
-    x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_not( x1 ), x2 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
    x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
-    x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_not( x3 ) ) ); \
+    x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
    x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
    x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
    x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
    x2 = _mm256_xor_si256( x2, tmp ); \
 } while (0)

-/*
-#define Sb(x0, x1, x2, x3, c)   do { \
-		x3 = ~x3; \
-		x0 ^= (c) & ~x2; \
-		tmp = (c) ^ (x0 & x1); \
-		x0 ^= x2 & x3; \
-		x3 ^= ~x1 & x2; \
-		x1 ^= x0 & x2; \
-		x2 ^= x0 & ~x3; \
-		x0 ^= x1 | x3; \
-		x3 ^= x1 & x2; \
-		x1 ^= tmp & x0; \
-		x2 ^= tmp; \
-	} while (0)
-*/
-
 #define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
 do { \
    x4 = _mm256_xor_si256( x4, x1 ); \
@@ -136,20 +120,6 @@ do { \
    x3 = _mm256_xor_si256( x3, x4 ); \
 } while (0)

-
-/*
-#define Lb(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
-		x4 ^= x1; \
-		x5 ^= x2; \
-		x6 ^= x3 ^ x0; \
-		x7 ^= x0; \
-		x0 ^= x5; \
-		x1 ^= x6; \
-		x2 ^= x7 ^ x4; \
-		x3 ^= x4; \
-	} while (0)
-*/
-
 #if SPH_JH_64

 static const sph_u64 C[] = {
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -23,12 +23,12 @@ void jha_hash_4way( void *out, const void *input )
    uint64_t hash2[8] __attribute__ ((aligned (64)));
    uint64_t hash3[8] __attribute__ ((aligned (64)));
    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
-    uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
-    __m256i mask, mask0, mask1;
-    __m256i* vh = (__m256i*)vhash;
-    __m256i* vh0 = (__m256i*)vhash0;
-    __m256i* vh1 = (__m256i*)vhash1;
+    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
+    __m256i* vh  = (__m256i*)vhash;
+    __m256i* vhA = (__m256i*)vhashA;
+    __m256i* vhB = (__m256i*)vhashB;
+    __m256i vh_mask;

    blake512_4way_context  ctx_blake;
    hashState_groestl      ctx_groestl;
@@ -40,127 +40,69 @@ void jha_hash_4way( void *out, const void *input )
    keccak512_4way( &ctx_keccak, input, 80 );
    keccak512_4way_close( &ctx_keccak, vhash );

-//    memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
-//    keccak512_4way( &ctx_keccak, input + (64<<2), 16 );
-//    keccak512_4way_close( &ctx_keccak, vhash );
-
    // Heavy & Light Pair Loop
    for ( int round = 0; round < 3; round++ )
    {
-      // select next function based on bit 0 of previous hash.
-      // Specutively execute both functions and use mask to
-      // select results from correct function for each lane.
-      // hash = mask : vhash0 ? vhash1
-      mask = mm256_negate_64(
-                     _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
-
-// second version
-//      mask0 = mask
-//      mask1 = mm256_not( mask );
-
-// first version
-//       mask = _mm256_sub_epi64( _mm256_and_si256( vh[0],
-//                     _mm256_set1_epi64x( 0x1 ) ), _mm256_set1_epi64x( 0x1 ) );
-
-       // groestl (serial) vs skein
+       vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
+               vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );

       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
-
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash0,
-                                 (char*)hash0, 512 );
+                                               (char*)hash0, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash1,
-                                 (char*)hash1, 512 );
+                                               (char*)hash1, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash2,
-                                 (char*)hash2, 512 );
+                                               (char*)hash2, 512 );
       init_groestl( &ctx_groestl, 64 );
       update_and_final_groestl( &ctx_groestl, (char*)hash3,
-                                 (char*)hash3, 512 );
-
-       mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
-
-       // skein
+                                               (char*)hash3, 512 );
+       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );

       skein512_4way_init( &ctx_skein );
       skein512_4way( &ctx_skein, vhash, 64 );
-       skein512_4way_close( &ctx_skein, vhash1 );
+       skein512_4way_close( &ctx_skein, vhashB );

-       // merge vectored hash
       for ( int i = 0; i < 8; i++ )
-       {
-          // blend should be faster
-          vh[i] = _mm256_blendv_epi8( vh0[i], vh1[i], mask );
-
-// second version
-//          vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-//                                   _mm256_and_si256( vh1[i], mask1 ) );
-
-// first version
-/*
-          vh0[i] = _mm256_maskload_epi64( 
-                                      vhash0 + i*4, mm256_not( mask ) );
-          vh1[i] = _mm256_maskload_epi64(
-                                      vhash1 + i*4, mask );
-          vh[i]  = _mm256_or_si256( vh0[i], vh1[i] );
-*/
-       }
-
-       // blake v jh
+          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );

       blake512_4way_init( &ctx_blake );
       blake512_4way( &ctx_blake, vhash, 64 );
-       blake512_4way_close( &ctx_blake, vhash0 );
+       blake512_4way_close( &ctx_blake, vhashA );

       jh512_4way_init( &ctx_jh );
       jh512_4way( &ctx_jh, vhash, 64 );
-       jh512_4way_close( &ctx_jh, vhash1 );
+       jh512_4way_close( &ctx_jh, vhashB );

-       // merge hash
       for ( int i = 0; i < 8; i++ )
-       {
-          vh[i] = _mm256_or_si256( _mm256_and_si256( vh0[i], mask0 ),
-                                   _mm256_and_si256( vh1[i], mask1 ) );
-/*
-          vha256[i] = _mm256_maskload_epi64(
-                                      vhasha + i*4, mm256_not( mask ) );
-          vhb256[i] = _mm256_maskload_epi64(
-                                      vhashb + i*4, mask );
-          vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
-*/
-       }
+          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
    }

    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
-
-//    memcpy( output,       hash0, 32 );
-//    memcpy( output+32,    hash1, 32 );
-//    memcpy( output+64,    hash2, 32 );
-//    memcpy( output+96,    hash3, 32 );
-
 }

 int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done )
 {
-     uint32_t hash[8*4] __attribute__ ((aligned (64)));
-     uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-     uint32_t endiandata[20] __attribute__((aligned(64)));
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-	const uint32_t first_nonce = pdata[19];
-	const uint32_t Htarg = ptarget[7];
-	uint32_t n = pdata[19];
-     uint32_t *nonces = work->nonces;
-     bool *found = work->nfound;
-     int num_found = 0;
-     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-     uint32_t *noncep1 = vdata + 75;
-     uint32_t *noncep2 = vdata + 77;
-     uint32_t *noncep3 = vdata + 79;
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t n = pdata[19];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+   uint32_t *noncep1 = vdata + 75;
+   uint32_t *noncep2 = vdata + 77;
+   uint32_t *noncep3 = vdata + 79;

-	uint64_t htmax[] = {
+   uint64_t htmax[] = {
 		0,
 		0xF,
 		0xFF,
@@ -168,7 +110,7 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
 		0xFFFF,
 		0x10000000
 	};
-	uint32_t masks[] = {
+   uint32_t masks[] = {
 		0xFFFFFFFF,
 		0xFFFFFFF0,
 		0xFFFFFF00,
@@ -177,17 +119,12 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
 		0
 	};

-   // we need bigendian data...
   for ( int i=0; i < 19; i++ )
      be32enc( &endiandata[i], pdata[i] );

   uint64_t *edata = (uint64_t*)endiandata;
   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

-   // precalc midstate for keccak
-//   keccak512_4way_init( &jha_kec_mid );
-//   keccak512_4way( &jha_kec_mid, vdata, 64 );
-
   for ( int m = 0; m < 6; m++ )
   {
      if ( Htarg <= htmax[m] )
@@ -201,7 +138,6 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
              be32enc( noncep3, n+3 );

              jha_hash_4way( hash, vdata );
-
              pdata[19] = n;

              if ( ( !(hash[7] & mask) )
@@ -239,11 +175,9 @@ int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
              n += 4;
         } while ( ( num_found == 0 ) && ( n < max_nonce )
                     && !work_restart[thr_id].restart );
-
         break;
      }
   }
-
   *hashes_done = n - first_nonce + 1;
   return num_found;
 }
--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -5,14 +5,13 @@ bool register_jha_algo( algo_gate_t* gate )
 {
 #if defined (JHA_4WAY)
  four_way_not_tested();
-  gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
  gate->scanhash         = (void*)&scanhash_jha_4way;
  gate->hash             = (void*)&jha_hash_4way;
 #else
-  gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
  gate->scanhash         = (void*)&scanhash_jha;
  gate->hash             = (void*)&jha_hash;
 #endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
  gate->set_target       = (void*)&scrypt_set_target;
  return true;
 };
--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -5,7 +5,7 @@
 #include <stdint.h>


-#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+#if defined(__AVX2__) && defined(__AES__)
  #define JHA_4WAY
 #endif

--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -9,7 +9,7 @@ int64_t keccak_get_max64() { return 0x7ffffLL; }

 bool register_keccak_algo( algo_gate_t* gate )
 {
-  gate->optimizations = FOUR_WAY_OPT;
+  gate->optimizations = AVX2_OPT;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  gate->set_target      = (void*)&keccak_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
@@ -30,7 +30,7 @@ void keccakc_set_target( struct work* work, double job_diff )

 bool register_keccakc_algo( algo_gate_t* gate )
 {
-  gate->optimizations = FOUR_WAY_OPT;
+  gate->optimizations = AVX2_OPT;
  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
  gate->set_target      = (void*)&keccakc_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX2__)
+#if defined(__AVX2__)
  #define KECCAK_4WAY
 #endif

--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -59,7 +59,7 @@ static const sph_u64 RC[] = {
 #define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
 #define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
 #define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
-#define NOT64(d, s)      (d = _mm256_xor_si256(s,mm256_neg1))
+#define NOT64(d, s)      (d = _mm256_xor_si256(s,m256_neg1))
 #define ROL64(d, v, n)   (d = mm256_rotl_64(v, n))
 #define XOR64_IOTA       XOR64

@@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
          kc->w[i] = _mm256_setzero_si256();

   // Initialization for the "lane complement".
-   kc->w[ 1] = mm256_neg1;
-   kc->w[ 2] = mm256_neg1;
-   kc->w[ 8] = mm256_neg1;
-   kc->w[12] = mm256_neg1;
-   kc->w[17] = mm256_neg1;
-   kc->w[20] = mm256_neg1;
+   kc->w[ 1] = m256_neg1;
+   kc->w[ 2] = m256_neg1;
+   kc->w[ 8] = m256_neg1;
+   kc->w[12] = m256_neg1;
+   kc->w[17] = m256_neg1;
+   kc->w[20] = m256_neg1;
   kc->ptr = 0;
   kc->lim = 200 - (out_size >> 2);
 }
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -0,0 +1,568 @@
+/*
+ * luffa_for_sse2.c
+ * Version 2.0 (Sep 15th 2009)
+ *
+ * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
+ *
+ * Hitachi, Ltd. is the owner of this software and hereby grant
+ * the U.S. Government and any interested party the right to use
+ * this software for the purposes of the SHA-3 evaluation process,
+ * notwithstanding that this software is copyrighted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <string.h>
+#include <immintrin.h>
+#include "luffa-hash-2way.h"
+
+#if defined(__AVX2__)
+
+#include "avxdefs.h"
+
+#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
+                               0UL, 0UL, 0UL, 0xffffffffUL )
+
+#define ADD_CONSTANT(a,b,c0,c1)\
+    a = _mm256_xor_si256(a,c0);\
+    b = _mm256_xor_si256(b,c1);\
+
+#define MULT2(a0,a1) \
+do { \
+  __m256i b = _mm256_xor_si256( a0, \
+                   _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
+  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
+  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) );  \
+} while(0)
+
+// STEP_PART operates through pointers: x -> 8 state vectors, c -> 2 round
+// constants, t -> 2 temporaries. TODO: switch to array indexing for clarity.
+#define STEP_PART(x,c,t)\
+    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
+    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
+    MIXWORD(*x,*(x+4),*t,*(t+1));\
+    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
+    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
+    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
+    ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
+
+#define SUBCRUMB(a0,a1,a2,a3,t)\
+    t  = _mm256_load_si256(&a0);\
+    a0 = _mm256_or_si256(a0,a1);\
+    a2 = _mm256_xor_si256(a2,a3);\
+    a1 = _mm256_andnot_si256(a1, m256_neg1 );\
+    a0 = _mm256_xor_si256(a0,a3);\
+    a3 = _mm256_and_si256(a3,t);\
+    a1 = _mm256_xor_si256(a1,a3);\
+    a3 = _mm256_xor_si256(a3,a2);\
+    a2 = _mm256_and_si256(a2,a0);\
+    a0 = _mm256_andnot_si256(a0, m256_neg1 );\
+    a2 = _mm256_xor_si256(a2,a1);\
+    a1 = _mm256_or_si256(a1,a3);\
+    t  = _mm256_xor_si256(t,a1);\
+    a3 = _mm256_xor_si256(a3,a2);\
+    a2 = _mm256_and_si256(a2,a1);\
+    a1 = _mm256_xor_si256(a1,a0);\
+    a0 = _mm256_load_si256(&t);\
+
+#define MIXWORD(a,b,t1,t2)\
+    b  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(a,2);\
+    t2 = _mm256_srli_epi32(a,30);\
+     a = _mm256_or_si256(t1,t2);\
+    a  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(b,14);\
+    t2 = _mm256_srli_epi32(b,18);\
+    b  = _mm256_or_si256(t1,t2);\
+    b  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(a,10);\
+    t2 = _mm256_srli_epi32(a,22);\
+    a  = _mm256_or_si256(t1,t2);\
+    a  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(b,1);\
+    t2 = _mm256_srli_epi32(b,31);\
+    b  = _mm256_or_si256(t1,t2);
+
+#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
+    a1 = _mm256_shuffle_epi32(a1,147);\
+    t0 = _mm256_load_si256(&a1);\
+    a1 = _mm256_unpacklo_epi32(a1,a0);\
+    t0 = _mm256_unpackhi_epi32(t0,a0);\
+    t1 = _mm256_shuffle_epi32(t0,78);\
+    a0 = _mm256_shuffle_epi32(a1,78);\
+    SUBCRUMB(t1,t0,a0,a1,tmp0);\
+    t0 = _mm256_unpacklo_epi32(t0,t1);\
+    a1 = _mm256_unpacklo_epi32(a1,a0);\
+    a0 = _mm256_load_si256(&a1);\
+    a0 = _mm256_unpackhi_epi64(a0,t0);\
+    a1 = _mm256_unpacklo_epi64(a1,t0);\
+    a1 = _mm256_shuffle_epi32(a1,57);\
+    MIXWORD(a0,a1,tmp0,tmp1);\
+    ADD_CONSTANT(a0,a1,c0,c1);
+
+#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
+    s2 = _mm256_load_si256(&r1);\
+    q2 = _mm256_load_si256(&p1);\
+    r2 = _mm256_shuffle_epi32(r2,216);\
+    p2 = _mm256_shuffle_epi32(p2,216);\
+    r1 = _mm256_unpacklo_epi32(r1,r0);\
+    p1 = _mm256_unpacklo_epi32(p1,p0);\
+    s2 = _mm256_unpackhi_epi32(s2,r0);\
+    q2 = _mm256_unpackhi_epi32(q2,p0);\
+    s0 = _mm256_load_si256(&r2);\
+    q0 = _mm256_load_si256(&p2);\
+    r2 = _mm256_unpacklo_epi64(r2,r1);\
+    p2 = _mm256_unpacklo_epi64(p2,p1);\
+    s1 = _mm256_load_si256(&s0);\
+    q1 = _mm256_load_si256(&q0);\
+    s0 = _mm256_unpackhi_epi64(s0,r1);\
+    q0 = _mm256_unpackhi_epi64(q0,p1);\
+    r2 = _mm256_shuffle_epi32(r2,225);\
+    p2 = _mm256_shuffle_epi32(p2,225);\
+    r0 = _mm256_load_si256(&s1);\
+    p0 = _mm256_load_si256(&q1);\
+    s0 = _mm256_shuffle_epi32(s0,225);\
+    q0 = _mm256_shuffle_epi32(q0,225);\
+    s1 = _mm256_unpacklo_epi64(s1,s2);\
+    q1 = _mm256_unpacklo_epi64(q1,q2);\
+    r0 = _mm256_unpackhi_epi64(r0,s2);\
+    p0 = _mm256_unpackhi_epi64(p0,q2);\
+    s2 = _mm256_load_si256(&r0);\
+    q2 = _mm256_load_si256(&p0);\
+    s3 = _mm256_load_si256(&r2);\
+    q3 = _mm256_load_si256(&p2);\
+
+#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\
+    s0 = _mm256_load_si256(&r0);\
+    q0 = _mm256_load_si256(&p0);\
+    s1 = _mm256_load_si256(&r2);\
+    q1 = _mm256_load_si256(&p2);\
+    r0 = _mm256_unpackhi_epi32(r0,r1);\
+    p0 = _mm256_unpackhi_epi32(p0,p1);\
+    r2 = _mm256_unpackhi_epi32(r2,r3);\
+    p2 = _mm256_unpackhi_epi32(p2,p3);\
+    s0 = _mm256_unpacklo_epi32(s0,r1);\
+    q0 = _mm256_unpacklo_epi32(q0,p1);\
+    s1 = _mm256_unpacklo_epi32(s1,r3);\
+    q1 = _mm256_unpacklo_epi32(q1,p3);\
+    r1 = _mm256_load_si256(&r0);\
+    p1 = _mm256_load_si256(&p0);\
+    r0 = _mm256_unpackhi_epi64(r0,r2);\
+    p0 = _mm256_unpackhi_epi64(p0,p2);\
+    s0 = _mm256_unpackhi_epi64(s0,s1);\
+    q0 = _mm256_unpackhi_epi64(q0,q1);\
+    r1 = _mm256_unpacklo_epi64(r1,r2);\
+    p1 = _mm256_unpacklo_epi64(p1,p2);\
+    s2 = _mm256_load_si256(&r0);\
+    q2 = _mm256_load_si256(&p0);\
+    s1 = _mm256_load_si256(&r1);\
+    q1 = _mm256_load_si256(&p1);\
+
+#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
+    s1 = _mm256_load_si256(&r3);\
+    q1 = _mm256_load_si256(&p3);\
+    s3 = _mm256_load_si256(&r3);\
+    q3 = _mm256_load_si256(&p3);\
+    s1 = _mm256_unpackhi_epi32(s1,r2);\
+    q1 = _mm256_unpackhi_epi32(q1,p2);\
+    s3 = _mm256_unpacklo_epi32(s3,r2);\
+    q3 = _mm256_unpacklo_epi32(q3,p2);\
+    s0 = _mm256_load_si256(&s1);\
+    q0 = _mm256_load_si256(&q1);\
+    s2 = _mm256_load_si256(&s3);\
+    q2 = _mm256_load_si256(&q3);\
+    r3 = _mm256_load_si256(&r1);\
+    p3 = _mm256_load_si256(&p1);\
+    r1 = _mm256_unpacklo_epi32(r1,r0);\
+    p1 = _mm256_unpacklo_epi32(p1,p0);\
+    r3 = _mm256_unpackhi_epi32(r3,r0);\
+    p3 = _mm256_unpackhi_epi32(p3,p0);\
+    s0 = _mm256_unpackhi_epi64(s0,r3);\
+    q0 = _mm256_unpackhi_epi64(q0,p3);\
+    s1 = _mm256_unpacklo_epi64(s1,r3);\
+    q1 = _mm256_unpacklo_epi64(q1,p3);\
+    s2 = _mm256_unpackhi_epi64(s2,r1);\
+    q2 = _mm256_unpackhi_epi64(q2,p1);\
+    s3 = _mm256_unpacklo_epi64(s3,r1);\
+    q3 = _mm256_unpacklo_epi64(q3,p1);
+
+#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
+    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
+
+/* initial values of chaining variables */
+static const uint32 IV[40] __attribute((aligned(32))) = {
+    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
+    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
+    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
+    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
+    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
+    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
+    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
+    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
+    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
+    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
+};
+
+/* Round Constants */
+static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
+    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
+    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
+    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
+    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
+    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
+    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
+    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
+    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
+    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
+    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
+    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
+    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
+    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
+    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
+    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
+    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
+    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
+    0x00000000,0x00000000,0x00000000,0x5090d577,
+    0x00000000,0x00000000,0x00000000,0xac11d7fa,
+    0x00000000,0x00000000,0x00000000,0x2d1925ab,
+    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
+    0x00000000,0x00000000,0x00000000,0xb46496ac,
+    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
+    0x00000000,0x00000000,0x00000000,0xd1925ab0,
+    0x00000000,0x00000000,0x00000000,0x78602649,
+    0x00000000,0x00000000,0x00000000,0x29131ab6,
+    0x00000000,0x00000000,0x00000000,0x8edae952,
+    0x00000000,0x00000000,0x00000000,0x0fc053c3,
+    0x00000000,0x00000000,0x00000000,0x3b6ba548,
+    0x00000000,0x00000000,0x00000000,0x3f014f0c,
+    0x00000000,0x00000000,0x00000000,0xedae9520,
+    0x00000000,0x00000000,0x00000000,0xfc053c31
+};
+
+__m256i CNS[32];
+
+/***************************************************/
+/* Round function: absorbs one message block into state->chainv */
+/* state: hash context, chainv[0..9] is updated in place         */
+/* msg1, msg0: the two 256-bit vectors of the message block      */
+static void rnd512_2way( luffa_2way_context *state, __m256i msg1, __m256i msg0 )
+{
+    __m256i t[2];
+    __m256i *chainv = state->chainv;
+    __m256i tmp[2];
+    __m256i x[8];
+    // t = XOR of the five chaining-value pairs
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+
+    t[0] = _mm256_xor_si256( t[0], chainv[2] );
+    t[1] = _mm256_xor_si256( t[1], chainv[3] );
+    t[0] = _mm256_xor_si256( t[0], chainv[4] );
+    t[1] = _mm256_xor_si256( t[1], chainv[5] );
+    t[0] = _mm256_xor_si256( t[0], chainv[6] );
+    t[1] = _mm256_xor_si256( t[1], chainv[7] );
+    t[0] = _mm256_xor_si256( t[0], chainv[8] );
+    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+
+    MULT2( t[0], t[1] );
+
+    msg0 = _mm256_shuffle_epi32( msg0, 27 );
+    msg1 = _mm256_shuffle_epi32( msg1, 27 );
+    // fold t back into every chaining-value pair
+    chainv[0] = _mm256_xor_si256( chainv[0], t[0] );
+    chainv[1] = _mm256_xor_si256( chainv[1], t[1] );
+    chainv[2] = _mm256_xor_si256( chainv[2], t[0] );
+    chainv[3] = _mm256_xor_si256( chainv[3], t[1] );
+    chainv[4] = _mm256_xor_si256( chainv[4], t[0] );
+    chainv[5] = _mm256_xor_si256( chainv[5], t[1] );
+    chainv[6] = _mm256_xor_si256( chainv[6], t[0] );
+    chainv[7] = _mm256_xor_si256( chainv[7], t[1] );
+    chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
+    chainv[9] = _mm256_xor_si256( chainv[9], t[1] );
+
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+
+    MULT2( chainv[0], chainv[1]);
+    chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
+    chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
+
+    MULT2( chainv[2], chainv[3]);
+    chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
+    chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
+
+    MULT2( chainv[4], chainv[5]);
+    chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
+    chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
+
+    MULT2( chainv[6], chainv[7]);
+    chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
+    chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
+
+    MULT2( chainv[8], chainv[9]);
+    chainv[8] = _mm256_xor_si256( chainv[8], t[0] );
+    chainv[9] = _mm256_xor_si256( chainv[9], t[1] );
+
+    t[0] = chainv[8];
+    t[1] = chainv[9];
+
+    MULT2( chainv[8], chainv[9]);
+    chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
+    chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
+
+    MULT2( chainv[6], chainv[7]);
+    chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
+    chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
+
+    MULT2( chainv[4], chainv[5]);
+    chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
+    chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
+
+    MULT2( chainv[2], chainv[3] );
+    chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
+    chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
+
+    MULT2( chainv[0], chainv[1] );
+    chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t[0] ), msg0 );
+    chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t[1] ), msg1 );
+    // XOR the message into the remaining pairs, applying MULT2 before each
+    MULT2( msg0, msg1);
+    chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
+    chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
+    chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
+    chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
+    chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
+
+    MULT2( msg0, msg1);
+    // rotate each 32-bit word of chainv[3], [5], [7], [9] left by 1, 2, 3, 4 bits
+    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3],  1 ),
+                                 _mm256_srli_epi32( chainv[3], 31 ) );
+    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5],  2 ),
+                                 _mm256_srli_epi32( chainv[5], 30 ) );
+    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7],  3 ),
+                                 _mm256_srli_epi32( chainv[7], 29 ) );
+    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9],  4 ),
+                                 _mm256_srli_epi32( chainv[9], 28 ) );
+
+    // regroup chainv[0..7] into x[] (NMLTOM1024), then run 8 STEP_PART rounds
+    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
+                x[0], x[1], x[2], x[3],
+                chainv[1],chainv[3],chainv[5],chainv[7],
+                x[4], x[5], x[6], x[7] );
+
+    STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
+    STEP_PART( &x[0], &CNS[10], &tmp[0] );
+    STEP_PART( &x[0], &CNS[12], &tmp[0] );
+    STEP_PART( &x[0], &CNS[14], &tmp[0] );
+    // regroup x[] back into chainv[0..7] (MIXTON1024)
+    MIXTON1024( x[0], x[1], x[2], x[3],
+                chainv[0], chainv[2], chainv[4],chainv[6],
+                x[4], x[5], x[6], x[7],
+                chainv[1],chainv[3],chainv[5],chainv[7]);
+
+    /* Process last 256-bit block */
+    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[16], CNS[17],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[18], CNS[19],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[20], CNS[21],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[22], CNS[23],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[24], CNS[25],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[26], CNS[27],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[28], CNS[29],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[30], CNS[31],
+                tmp[0], tmp[1] );
+}
+
+
+/***************************************************/
+/* Finalization function: two blank rounds, each emitting 2 x 256 bits */
+/* state: hash context                                                 */
+/* b: output buffer, receives 4 x 256 bits (32 words) in total         */
+
+static void finalization512_2way( luffa_2way_context *state, uint32 *b )
+{
+    uint32 hash[8*2] __attribute((aligned(64)));
+    __m256i* chainv = state->chainv;
+    __m256i t[2];
+
+    /*---- blank round with m=0 ----*/
+    rnd512_2way( state, m256_zero, m256_zero );
+    // t = XOR of the five chaining-value pairs
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+
+    t[0] = _mm256_xor_si256( t[0], chainv[2] );
+    t[1] = _mm256_xor_si256( t[1], chainv[3] );
+    t[0] = _mm256_xor_si256( t[0], chainv[4] );
+    t[1] = _mm256_xor_si256( t[1], chainv[5] );
+    t[0] = _mm256_xor_si256( t[0], chainv[6] );
+    t[1] = _mm256_xor_si256( t[1], chainv[7] );
+    t[0] = _mm256_xor_si256( t[0], chainv[8] );
+    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+
+    t[0] = _mm256_shuffle_epi32( t[0], 27 );
+    t[1] = _mm256_shuffle_epi32( t[1], 27 );
+    // stage both vectors: t[0] in hash[0..7], t[1] in hash[8..15]
+    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
+    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
+
+    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
+    /*---- second blank round with m=0 ----*/
+    rnd512_2way( state, m256_zero, m256_zero );
+
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+    t[0] = _mm256_xor_si256( t[0], chainv[2] );
+    t[1] = _mm256_xor_si256( t[1], chainv[3] );
+    t[0] = _mm256_xor_si256( t[0], chainv[4] );
+    t[1] = _mm256_xor_si256( t[1], chainv[5] );
+    t[0] = _mm256_xor_si256( t[0], chainv[6] );
+    t[1] = _mm256_xor_si256( t[1], chainv[7] );
+    t[0] = _mm256_xor_si256( t[0], chainv[8] );
+    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+
+    t[0] = _mm256_shuffle_epi32( t[0], 27 );
+    t[1] = _mm256_shuffle_epi32( t[1], 27 );
+
+    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
+    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
+
+    casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
+}
+// Reset a 2-way Luffa context: load the IV, clear the buffer; always returns 0.
+int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
+{
+    int i;
+    state->hashbitlen = hashbitlen;
+    // rebuild the file-scope CNS table: each 128-bit constant fills both halves
+    for ( i=0; i<32; i++ ) CNS[i] =
+          _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
+                            CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2)    ],
+                            CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
+                            CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2)    ] );
+    // same duplication for the 10 initial chaining values
+    for ( i=0; i<10; i++ ) state->chainv[i] =
+          _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
+                            IV[ (i<<2) +1 ], IV[ (i<<2)    ],
+                            IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
+                            IV[ (i<<2) +1 ], IV[ (i<<2)    ] );
+
+    ((__m256i*)state->buffer)[0] = m256_zero;
+    ((__m256i*)state->buffer)[1] = m256_zero;
+
+    return 0;
+}
+
+// Absorb len bytes per lane. Do not call luffa_2way_update_close after this;
+// once called, only luffa_2way_update or luffa_2way_close may follow.
+int luffa_2way_update( luffa_2way_context *state, const void *data,
+                       size_t len )
+{
+    __m256i *vdata  = (__m256i*)data;
+    __m256i *buffer = (__m256i*)state->buffer;
+    int i;
+    int blocks = (int)len / 32;
+    state-> rembytes = (int)len % 32;
+
+    // full blocks: each consumes two __m256i (32 bytes for each of the 2 lanes)
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       rnd512_2way( state, mm256_bswap_32( vdata[1] ) ,
+                           mm256_bswap_32( vdata[0] ) );
+    }
+
+    // 16 byte partial block exists for 80 byte len
+    // store in buffer for transform in final for midstate to work
+    if ( state->rembytes  )
+    {
+      // remaining data bytes (a non-zero remainder is treated as exactly 16 bytes)
+      buffer[0] = mm256_bswap_32( vdata[0] );
+      buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                   0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+    }
+    return 0;
+}
+// Pad, finalize and write the digest of both lanes to hashval; always returns 0.
+int luffa_2way_close( luffa_2way_context *state, void *hashval )
+{
+    __m256i *buffer = (__m256i*)state->buffer;
+
+    // transform pad block
+    if ( state->rembytes )
+      // not empty, data is in buffer
+      rnd512_2way( state, buffer[1], buffer[0] );
+    else
+      // empty pad block, constant data
+      rnd512_2way( state, m256_zero,
+                   _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                    0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
+
+    finalization512_2way( state, (uint32*)hashval );
+    // byte offset taken through char*: arithmetic on void* is a GNU extension
+    if ( state->hashbitlen > 512 )
+        finalization512_2way( state, (uint32*)( (char*)hashval + 128 ) );
+    return 0;
+}
+// One-shot absorb + pad + finalize on a freshly initialised context; returns 0.
+int luffa_2way_update_close( luffa_2way_context *state,
+                 void *output, const void *data, size_t inlen )
+{
+// Optimized for multiples of 16 bytes per lane, good for 64 and 80 byte len
+    __m256i *vdata  = (__m256i*)data;
+    int i;
+    int blocks = (int)( inlen / 32 );
+    state->rembytes = inlen % 32;
+
+    // full blocks
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+       rnd512_2way( state, mm256_bswap_32( vdata[1] ),
+                           mm256_bswap_32( vdata[0] ) );
+
+    // 16 byte partial block exists for 80 byte len
+    if ( state->rembytes  )
+       // padding of partial block
+       rnd512_2way( state,
+                    _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
+                    mm256_bswap_32( vdata[0] ) );
+    else
+       // empty pad block
+       rnd512_2way( state, m256_zero,
+                    _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) );
+    // byte offset below goes through char*: void* arithmetic is a GNU extension
+    finalization512_2way( state, (uint32*)output );
+    if ( state->hashbitlen > 512 )
+        finalization512_2way( state, (uint32*)( (char*)output + 128 ) );
+
+    return 0;
+}
+
+#endif
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -0,0 +1,69 @@
+#if !defined(LUFFA_HASH_2WAY_H__)
+#define LUFFA_HASH_2WAY_H__ 1
+/*
+ * luffa_for_sse2.h
+ * Version 2.0 (Sep 15th 2009)
+ *
+ * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
+ *
+ * Hitachi, Ltd. is the owner of this software and hereby grant
+ * the U.S. Government and any interested party the right to use
+ * this software for the purposes of the SHA-3 evaluation process,
+ * notwithstanding that this software is copyrighted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#include "algo/sha/sha3-defs.h"
+#include "avxdefs.h"
+
+/* The length of digests*/
+#define DIGEST_BIT_LEN_224 224
+#define DIGEST_BIT_LEN_256 256
+#define DIGEST_BIT_LEN_384 384
+#define DIGEST_BIT_LEN_512 512
+
+/*********************************/
+/* The parameters of Luffa       */
+#define MSG_BLOCK_BIT_LEN 256  /*The bit length of a message block*/
+#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
+                                                     * of a message block*/
+
+/* The number of blocks in Luffa */
+#define WIDTH_224 3
+#define WIDTH_256 3
+#define WIDTH_384 4
+#define WIDTH_512 5
+
+/* The limit of the length of message */
+#define LIMIT_224 64
+#define LIMIT_256 64
+#define LIMIT_384 128
+#define LIMIT_512 128
+/*********************************/
+
+typedef struct {
+    uint32 buffer[8*2] __attribute((aligned(64)));   /* partial block + padding */
+    __m256i chainv[10] __attribute((aligned(32)));   /* Chaining values */
+    int hashbitlen;                   /* value passed to luffa_2way_init */
+    int rembytes;                     /* len % 32 from the last update call */
+} luffa_2way_context;
+
+int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
+int luffa_2way_update( luffa_2way_context *state, const void *data,
+                       size_t len );
+int luffa_2way_close( luffa_2way_context *state, void *hashval );
+int luffa_2way_update_close( luffa_2way_context *state, void *output,
+                                   const void *data, size_t inlen );
+
+#endif
+#endif
--- a/algo/luffa/sse2/luffa_for_sse2.c
+++ b/algo/luffa/sse2/luffa_for_sse2.c
@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
+                      mm_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    if ( state->rembytes  )
    {
      // remaining data bytes
-      casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
+      casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
      // padding of partial block
      casti_m128i( state->buffer, 1 ) =
            _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
+                      mm_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    {
      // padding of partial block
      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
-                      mm_byteswap_32( cast_m128i( data ) ) );
+                      mm_bswap_32( cast_m128i( data ) ) );
    }
    else
    {
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );

    rnd512( state, zero, zero );

@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
 }

 #else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );

    rnd512( state, zero, zero );

@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
 }
 #endif

--- a/algo/luffa/sse2/luffa_for_sse2.h
+++ b/algo/luffa/sse2/luffa_for_sse2.h
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -0,0 +1,128 @@
+#include "lyra2h-gate.h"
+
+#ifdef LYRA2H_4WAY
+
+#include <memory.h>
+#include <mm_malloc.h>
+#include "lyra2.h"
+#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake-hash-4way.h"
+
+__thread uint64_t* lyra2h_4way_matrix;
+// Allocate this thread's 64-byte aligned Lyra2 matrix; false if allocation failed.
+bool lyra2h_4way_thread_init()
+{
+ return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );
+}
+
+static __thread blake256_4way_context l2h_4way_blake_mid;
+// Run blake256_4way over the first 64 bytes of input into the per-thread midstate.
+void lyra2h_4way_midstate( const void* input )
+{
+       blake256_4way_init( &l2h_4way_blake_mid );
+       blake256_4way( &l2h_4way_blake_mid, input, 64 );
+}
+// Finish blake from the saved midstate, then LYRA2Z each lane; writes 4 x 32 bytes.
+void lyra2h_4way_hash( void *state, const void *input )
+{
+     uint32_t hash0[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
+     // resume after the 64*4 bytes already absorbed by lyra2h_4way_midstate
+     memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
+     blake256_4way( &ctx_blake, (const char*)input + (64*4), 16 );
+     blake256_4way_close( &ctx_blake, vhash );
+
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     // the lanes share one per-thread matrix, so they are processed one by one
+     LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
+     // byte offsets via char*: arithmetic on void* is a GNU extension
+     memcpy( state,               hash0, 32 );
+     memcpy( (char*)state + 32,   hash1, 32 );
+     memcpy( (char*)state + 64,   hash2, 32 );
+     memcpy( (char*)state + 96,   hash3, 32 );
+}
+// Scan nonces four per pass starting at pdata[19]; returns the number of hits.
+int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 76; // 19*4
+   uint32_t *noncep1 = vdata + 77;
+   uint32_t *noncep2 = vdata + 78;
+   uint32_t *noncep3 = vdata + 79;
+   // NOTE(review): Htarg above is read before the benchmark override below
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+   // encode all 20 words so edata[19] is initialized before it is interleaved
+   for ( int i=0; i < 20; i++ )
+      be32enc( &edata[i], pdata[i] );
+
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   lyra2h_4way_midstate( vdata );
+
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n   );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+
+      // nonces are written to vdata only; edata is not read after interleaving
+      lyra2h_4way_hash( hash, vdata );
+
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      {
+          found[0] = true;
+          num_found++;
+          nonces[0] = pdata[19] = n;
+          work_set_target_ratio( work, hash );
+      }
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      {
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
+      }
+      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+      {
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+          work_set_target_ratio( work, hash+16 );
+      }
+      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+      {
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+          work_set_target_ratio( work, hash+24 );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
--- a/algo/lyra2/lyra2h-gate.c
+++ b/algo/lyra2/lyra2h-gate.c
@@ -0,0 +1,25 @@
+#include "lyra2h-gate.h"
+#include "lyra2.h"
+
+void lyra2h_set_target( struct work* work, double job_diff )
+{  // Set the work target from job_diff divided by (256 * opt_diff_factor).
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool register_lyra2h_algo( algo_gate_t* gate )
+{  // Fill the gate with the lyra2h entry points; always returns true.
+#ifdef LYRA2H_4WAY   // defined by lyra2h-gate.h when __AVX2__ is available
+  gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2h_4way;
+  gate->hash       = (void*)&lyra2h_4way_hash;
+#else                // one-hash-at-a-time implementation
+  gate->miner_thread_init = (void*)&lyra2h_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2h;
+  gate->hash       = (void*)&lyra2h_hash;
+#endif
+  gate->optimizations = AVX_OPT | AVX2_OPT;     // same flags for both builds
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  gate->set_target = (void*)&lyra2h_set_target;
+  return true;
+};
+
--- a/algo/lyra2/lyra2h-gate.h
+++ b/algo/lyra2/lyra2h-gate.h
@@ -0,0 +1,32 @@
+#ifndef LYRA2H_GATE_H__
+#define LYRA2H_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__)
+  #define LYRA2H_4WAY
+#endif
+
+#define LYRA2H_MATRIX_SIZE  (BLOCK_LEN_INT64 * 16 * 16 * 8)  // parenthesized: safe inside larger expressions
+
+#if defined(LYRA2H_4WAY)
+
+void lyra2h_4way_hash( void *state, const void *input );
+
+int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+bool lyra2h_4way_thread_init();
+
+#endif
+
+void lyra2h_hash( void *state, const void *input );
+
+int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+bool lyra2h_thread_init();
+
+#endif
+
--- a/algo/lyra2/lyra2h.c
+++ b/algo/lyra2/lyra2h.c
@@ -1,6 +1,6 @@
+#include "lyra2h-gate.h"
 #include <memory.h>
 #include <mm_malloc.h>
-#include "algo-gate-api.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"

@@ -8,8 +8,7 @@ __thread uint64_t* lyra2h_matrix;

 bool lyra2h_thread_init()
 {
-   const int i = 16 * 16 * 96;
-   lyra2h_matrix = _mm_malloc( i, 64 );
+   lyra2h_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 );
   return lyra2h_matrix;
 }

@@ -74,20 +73,3 @@ int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
-
-void lyra2h_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool register_lyra2h_algo( algo_gate_t* gate )
-{
-  gate->optimizations = AVX_OPT | AVX2_OPT;
-  gate->miner_thread_init = (void*)&lyra2h_thread_init;
-  gate->scanhash   = (void*)&scanhash_lyra2h;
-  gate->hash       = (void*)&lyra2h_hash;
-  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&lyra2h_set_target;
-  return true;
-};
-
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -106,6 +106,7 @@ int scanhash_lyra2re(int thr_id, struct work *work,
                   {
 			pdata[19] = nonce;
 			*hashes_done = pdata[19] - first_nonce;
+                        work_set_target_ratio( work, hash );
 			return 1;
                   }
 		}
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -0,0 +1,185 @@
+#include "lyra2rev2-gate.h"
+#include <memory.h>
+
+#if defined (__AVX2__)	
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+
+#include "algo/cubehash/sph_cubehash.h"
+//#include "algo/bmw/sph_bmw.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+
+typedef struct {
+   blake256_4way_context     blake;    // loaded per work by scanhash_lyra2rev2_4way()
+   keccak256_4way_context    keccak;
+   cubehashParam             cube;     // applied to one lane at a time
+   skein256_4way_context     skein;
+   bmw256_4way_context          bmw;
+   // One context per hash stage used by lyra2rev2_4way_hash().
+} lyra2v2_4way_ctx_holder;
+// Template contexts; lyra2rev2_4way_hash() copies them into a local on entry.
+static lyra2v2_4way_ctx_holder l2v2_4way_ctx;
+
+void init_lyra2rev2_4way_ctx()
+{  // Prepare the template contexts held in l2v2_4way_ctx.
+   // .blake is not set here: scanhash_lyra2rev2_4way() initialises it per work.
+   keccak256_4way_init( &l2v2_4way_ctx.keccak );
+   cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 );
+   skein256_4way_init( &l2v2_4way_ctx.skein );
+   bmw256_4way_init( &l2v2_4way_ctx.bmw );
+   // Invoked from register_lyra2rev2_algo() when LYRA2REV2_4WAY is defined.
+}
+
+void lyra2rev2_4way_hash( void *state, const void *input )
+{  // Lyra2REv2 on 4 lanes: input is 4x32-interleaved, state gets 4 x 32 bytes.
+   uint32_t hash0[8] __attribute__ ((aligned (64)));
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+   uint64_t vhash64[4*4] __attribute__ ((aligned (64)));
+   lyra2v2_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
+   memcpy( &ctx, &l2v2_4way_ctx, sizeof(l2v2_4way_ctx) );
+   // ctx.blake already holds the first 64 bytes of each lane (see scanhash),
+   blake256_4way( &ctx.blake, input + (64<<2), 16 );
+   // so only the trailing 16 bytes per lane, nonce included, are fed here.
+   blake256_4way_close( &ctx.blake, vhash );
+
+   mm256_reinterleave_4x64( vhash64, vhash, 256 ); // presumably 4x32 -> 4x64 layout
+   keccak256_4way( &ctx.keccak, vhash64, 32 );
+   keccak256_4way_close( &ctx.keccak, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+   // cubehash runs per lane; the template state is restored between lanes.
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
+   // Lyra2 is also run lane by lane, all lanes reusing l2v2_wholeMatrix.
+   LYRA2REV2( l2v2_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
+   LYRA2REV2( l2v2_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+
+   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   skein256_4way( &ctx.skein, vhash64, 32 );
+   skein256_4way_close( &ctx.skein, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+   // Second cubehash pass: restore the template state before every lane.
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
+   memcpy( &ctx.cube, &l2v2_4way_ctx.cube, sizeof ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
+
+
+   // BMW256 4way has a lane corruption problem: only lanes 0 & 2 produce
+   // a good hash. The workaround is to run bmw256-4way twice, each time
+   // placing two real inputs in lanes 0 & 2 (lanes 1 & 3 carry duplicates).
+   // The hashes are then written to their proper positions in the output.
+   // Not as fast but still faster than using sph serially.
+
+   // First pass: lane 0 carries hash0, lane 2 carries hash1.
+   mm_interleave_4x32( vhash, hash0, hash0, hash1, hash1, 256 );
+   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_close( &ctx.bmw, vhash );
+   uint32_t trash[8] __attribute__ ((aligned (32)));  // sink for lanes 1 & 3
+   // Lane 0 -> output hash 0, lane 2 -> output hash 1 (32 bytes further on).
+   mm_deinterleave_4x32( state, trash, state+32, trash, vhash, 256 );
+   // Second pass: lane 0 carries hash2, lane 2 carries hash3.
+   mm_interleave_4x32( vhash, hash2, hash2, hash3, hash3, 256 );
+   bmw256_4way_init( &ctx.bmw );
+   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_close( &ctx.bmw, vhash );
+   // Lane 0 -> output hash 2, lane 2 -> output hash 3.
+   mm_deinterleave_4x32( state+64, trash, state+96, trash, vhash, 256 );
+}
+
+int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done )
+{  // Scan 4 nonces per pass from work->data[19]; returns the number of hits.
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t Htarg = ptarget[7];  // read before the opt_benchmark override
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 76; // word 19 (the nonce) of lane 0: 19*4
+   uint32_t *noncep1 = vdata + 77;
+   uint32_t *noncep2 = vdata + 78;
+   uint32_t *noncep3 = vdata + 79;
+
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+
+   swab32_array( edata, pdata, 20 );
+   // All four lanes start as the same header; only the nonce words differ.
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   // Midstate: absorb the nonce-free first 64 bytes of every lane once.
+   blake256_4way_init( &l2v2_4way_ctx.blake );
+   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
+   // NOTE(review): l2v2_4way_ctx is a plain static; confirm threads cannot race.
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n   );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+
+      lyra2rev2_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      {
+          // lane 0 -> nonce n
+          found[0] = true;
+          num_found++;
+          nonces[0] = pdata[19] = n;
+          work_set_target_ratio( work, hash );
+      }
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      {
+          // lane 1 -> nonce n+1
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
+      }
+      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+      {
+          // lane 2 -> nonce n+2
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+          work_set_target_ratio( work, hash+16 );
+      }
+      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+      {
+          // lane 3 -> nonce n+3
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+          work_set_target_ratio( work, hash+24 );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/lyra2/lyra2rev2-gate.c
+++ b/algo/lyra2/lyra2rev2-gate.c
@@ -0,0 +1,38 @@
+#include "lyra2rev2-gate.h"
+
+__thread uint64_t* l2v2_wholeMatrix;
+
+void lyra2rev2_set_target( struct work* work, double job_diff )
+{  // Set the work target from job_diff divided by (256 * opt_diff_factor).
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool lyra2rev2_thread_init()
+{
+   // Allocate this thread's Lyra2 matrix; the result is true when it succeeded.
+   const int64_t row_words = BLOCK_LEN_INT64 * 4;     // nCols
+   const int64_t row_bytes = row_words * 8;
+   const int matrix_bytes  = (int64_t)row_bytes * 4;  // nRows
+
+   l2v2_wholeMatrix = _mm_malloc( matrix_bytes, 64 );
+   return l2v2_wholeMatrix;
+}
+
+bool register_lyra2rev2_algo( algo_gate_t* gate )
+{  // Init the template contexts and fill the gate; always returns true.
+#if defined (LYRA2REV2_4WAY)   // set by lyra2rev2-gate.h when __AVX2__ is defined
+  init_lyra2rev2_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
+  gate->hash      = (void*)&lyra2rev2_4way_hash;
+#else                          // one-hash-at-a-time implementation
+  init_lyra2rev2_ctx();
+  gate->scanhash  = (void*)&scanhash_lyra2rev2;
+  gate->hash      = (void*)&lyra2rev2_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;  // shared by both builds
+  gate->set_target        = (void*)&lyra2rev2_set_target;
+  return true;
+};
+
+
--- a/algo/lyra2/lyra2rev2-gate.h
+++ b/algo/lyra2/lyra2rev2-gate.h
@@ -0,0 +1,35 @@
+#ifndef LYRA2REV2_GATE_H__
+#define LYRA2REV2_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+#include "lyra2.h"
+
+#if defined(__AVX2__)
+  #define LYRA2REV2_4WAY
+#endif
+
+extern __thread uint64_t* l2v2_wholeMatrix;
+
+bool register_lyra2rev2_algo( algo_gate_t* gate );
+
+#if defined(LYRA2REV2_4WAY)
+
+void lyra2rev2_4way_hash( void *state, const void *input );
+
+int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_lyra2rev2_4way_ctx();
+
+#endif
+
+void lyra2rev2_hash( void *state, const void *input );
+
+int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_lyra2rev2_ctx();
+
+#endif
+
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -1,20 +1,12 @@
+#include "lyra2rev2-gate.h"
 #include <memory.h>
-
-#include "algo-gate-api.h"
-
 #include "algo/blake/sph_blake.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
-#include "lyra2.h"
-#include "avxdefs.h"
-
-// This gets allocated when miner_thread starts up and is never freed.
-// It's not a leak because the only way to allocate it again is to exit
-// the thread and that only occurs when the entire program exits.
-__thread uint64_t* l2v2_wholeMatrix;
+//#include "lyra2.h"

 typedef struct {
        cubehashParam           cube1;
@@ -106,6 +98,7 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
                   if( fulltest(hash, ptarget) )
                   {
 			pdata[19] = nonce;
+                        work_set_target_ratio( work, hash );
 			*hashes_done = pdata[19] - first_nonce;
 		   	return 1;
 		   }
@@ -119,30 +112,3 @@ int scanhash_lyra2rev2(int thr_id, struct work *work,
 	return 0;
 }

-void lyra2rev2_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool lyra2rev2_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
-   l2v2_wholeMatrix = _mm_malloc( i, 64 );
-
-   return l2v2_wholeMatrix;
-}
-
-bool register_lyra2rev2_algo( algo_gate_t* gate )
-{
-  init_lyra2rev2_ctx();
-  gate->optimizations = AVX_OPT | AVX2_OPT;
-  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
-  gate->scanhash          = (void*)&scanhash_lyra2rev2;
-  gate->hash              = (void*)&lyra2rev2_hash;
-  gate->set_target        = (void*)&lyra2rev2_set_target;
-  return true;
-};
-
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -4,13 +4,10 @@

 #include <memory.h>
 #include <mm_malloc.h>
-//#include "algo-gate-api.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
 #include "algo/blake/blake-hash-4way.h"
-//#include "avxdefs.h"

-// same size, only difference is the name, lyra2 is done serially
 __thread uint64_t* lyra2z_4way_matrix;

 bool lyra2z_4way_thread_init()
@@ -26,12 +23,8 @@ void lyra2z_4way_midstate( const void* input )
       blake256_4way( &l2z_4way_blake_mid, input, 64 );
 }

-// block 2050 new algo, blake plus new lyra parms. new input
-// is power of 2 so normal lyra can be used
-//void zcoin_hash(void *state, const void *input, uint32_t height)
 void lyra2z_4way_hash( void *state, const void *input )
 {
-//        uint32_t _ALIGN(64) hash[16];
     uint32_t hash0[8] __attribute__ ((aligned (64)));
     uint32_t hash1[8] __attribute__ ((aligned (64)));
     uint32_t hash2[8] __attribute__ ((aligned (64)));
@@ -39,27 +32,21 @@ void lyra2z_4way_hash( void *state, const void *input )
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));

-//     memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
-//     blake256_4way( &ctx_blake, input + (64*4), 16 );
-//     blake256_4way_close( &ctx_blake, vhash );
-
-     blake256_4way_init( &ctx_blake );
-     blake256_4way( &ctx_blake, input, 80 );
+     memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
+     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

     LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
-//     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
     LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
-//     LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );

     memcpy( state,    hash0, 32 );
     memcpy( state+32, hash1, 32 );
     memcpy( state+64, hash2, 32 );
     memcpy( state+96, hash3, 32 );
-
-//    memcpy(state, hash, 32);
 }

 int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -67,7 +54,6 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t hash[8*4] __attribute__ ((aligned (64)));
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
-//	uint32_t _ALIGN(64) hash[8];
   uint32_t _ALIGN(64) edata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -90,7 +76,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,

   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

-//   lyra2z_4way_midstate( vdata );
+   lyra2z_4way_midstate( vdata );

   do {
      found[0] = found[1] = found[2] = found[3] = false;
@@ -99,47 +85,38 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
      be32enc( noncep2, n+2 );
      be32enc( noncep3, n+3 );

-      be32enc( &edata[19], n );
      lyra2z_4way_hash( hash, vdata );
+      pdata[19] = n;

      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
      {
-printf("found 0\n");
          found[0] = true;
          num_found++;
          nonces[0] = pdata[19] = n;
          work_set_target_ratio( work, hash );
      }
-/*
      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
      {
-printf("found 1\n");          
          found[1] = true;
          num_found++;
          nonces[1] = n+1;
          work_set_target_ratio( work, hash+8 );
      }
-*/
      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
      {
-printf("found 2\n");          
          found[2] = true;
          num_found++;
          nonces[2] = n+2;
          work_set_target_ratio( work, hash+16 );
      }
-/*
      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
      {
-printf("found 3\n");          
          found[3] = true;
          num_found++;
          nonces[3] = n+3;
          work_set_target_ratio( work, hash+24 );
      }
      n += 4;
-*/
-      n += 2;
   } while ( (num_found == 0) && (n < max_nonce-4)
                   && !work_restart[thr_id].restart);

@@ -149,21 +126,3 @@ printf("found 3\n");

 #endif

-/*
-
-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !work_restart[thr_id].restart);
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
-}
-*/
-
--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -9,18 +9,15 @@ void lyra2z_set_target( struct work* work, double job_diff )
 bool register_lyra2z_algo( algo_gate_t* gate )
 {
 #ifdef LYRA2Z_4WAY
-  four_way_not_tested();
-  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
  gate->hash       = (void*)&lyra2z_4way_hash;
 #else
-  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2z_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z;
  gate->hash       = (void*)&lyra2z_hash;
 #endif
-
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
  gate->set_target = (void*)&lyra2z_set_target;
  return true;
--- a/algo/lyra2/lyra2z-gate.h
+++ b/algo/lyra2/lyra2z-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(HASH_4WAY)
+#if defined(__AVX2__)
  #define LYRA2Z_4WAY
 #endif

--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -82,41 +82,3 @@ int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-/*
-//int64_t get_max64_0xffffLL() { return 0xffffLL; };
-
-void lyra2z_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
-{
-   work->height = sctx->bloc_height;
-   return false;
-}
-
-
-bool lyra2z_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
-   lyra2z_wholeMatrix = _mm_malloc( i, 64 );
-
-   return lyra2z_wholeMatrix;
-}
-
-bool register_lyra2z_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  gate->miner_thread_init = (void*)&lyra2z_thread_init;
-  gate->scanhash   = (void*)&scanhash_lyra2z;
-  gate->hash       = (void*)&lyra2z_hash;
-  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&lyra2z_set_target;
-//  gate->prevent_dupes = (void*)&zcoin_get_work_height;
-  return true;
-};
-*/
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -346,6 +346,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
                    hash_str,
                    target_str);
            }
+            work_set_target_ratio( work, hash );
            pdata[19] = data[19];
            goto out;
 	  }
--- a/algo/neoscrypt/neoscrypt.c
+++ b/algo/neoscrypt/neoscrypt.c
--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -2,7 +2,7 @@

 bool register_nist5_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
 #if defined (NIST5_4WAY)
    gate->scanhash = (void*)&scanhash_nist5_4way;
    gate->hash     = (void*)&nist5hash_4way;
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+#if defined(__AVX2__) && defined(__AES__)
  #define NIST5_4WAY
 #endif

--- a/algo/nist5/nist5.c
+++ b/algo/nist5/nist5.c
@@ -132,6 +132,7 @@ int scanhash_nist5(int thr_id, struct work *work,
 				if (!(hash64[7] & mask)) {
 					printf("[%d]",thr_id);
 					if (fulltest(hash64, ptarget)) {
+                                                work_set_target_ratio( work, hash64 );
 						*hashes_done = n - first_nonce + 1;
 						return true;
 					}
--- a/algo/nist5/zr5.c
+++ b/algo/nist5/zr5.c
@@ -172,6 +172,7 @@ int scanhash_zr5( int thr_id, struct work *work,
         pdata[0] = tmpdata[0];
         pdata[19] = nonce;
         *hashes_done = pdata[19] - first_nonce + 1;
+         work_set_target_ratio( work, hash );
         if (opt_debug)
           applog(LOG_INFO, "found nonce %x", nonce);
         return 1;
--- a/algo/polytimos/polytimos-gate.h
+++ b/algo/polytimos/polytimos-gate.h
@@ -1,12 +0,0 @@
-#ifndef __POLYTIMOS_GATE_H__
-#define __POLYTIMOS_GATE_H__
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-void polytimos_hash( void *state, const void *input );
-int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-void init_polytimos_context();
-
-#endif
--- a/algo/quark/anime-4way.c
+++ b/algo/quark/anime-4way.c
@@ -0,0 +1,231 @@
+#include "cpuminer-config.h"
+#include "anime-gate.h"
+
+#if defined (ANIME_4WAY)
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+typedef struct {
+    blake512_4way_context  blake;
+    bmw512_4way_context    bmw;
+    hashState_groestl      groestl;   // not a 4-way context: used one lane at a time
+    jh512_4way_context     jh;
+    skein512_4way_context  skein;
+    keccak512_4way_context keccak;
+} anime_4way_ctx_holder;
+// Template contexts: set by init_anime_4way_ctx(), copied by anime_4way_hash().
+anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64)));
+
+void init_anime_4way_ctx()
+{  // Initialise every template context held in anime_4way_ctx.
+     blake512_4way_init( &anime_4way_ctx.blake );
+     bmw512_4way_init( &anime_4way_ctx.bmw );
+     init_groestl( &anime_4way_ctx.groestl, 64 );
+     skein512_4way_init( &anime_4way_ctx.skein );
+     jh512_4way_init( &anime_4way_ctx.jh );
+     keccak512_4way_init( &anime_4way_ctx.keccak );
+}
+
+void anime_4way_hash( void *state, const void *input )
+{
+    // Anime hash of four 80-byte headers held 4x64-interleaved in input;
+    // the four 32-byte results are written back to back into state.
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
+    __m256i* vh  = (__m256i*)vhash;
+    __m256i* vhA = (__m256i*)vhashA;
+    __m256i* vhB = (__m256i*)vhashB;
+    const __m256i bit3_mask = _mm256_set1_epi64x( 8 );
+    anime_4way_ctx_holder ctx;
+    memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) );
+
+    // bmw512 absorbs the raw header, then blake512 hashes the bmw digest.
+    bmw512_4way( &ctx.bmw, input, 80 );
+    bmw512_4way_close( &ctx.bmw, vhash );
+    blake512_4way( &ctx.blake, vhash, 64 );
+    blake512_4way_close( &ctx.blake, vhash );
+
+    __m256i vh_mask = _mm256_cmpeq_epi64(
+                         _mm256_and_si256( vh[0], bit3_mask ), m256_zero );
+
+       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+
+       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_close( &ctx.skein, vhashB );
+
+    for ( int i = 0; i < 8; i++ )   // mask set (bit clear) -> skein, else groestl
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+    jh512_4way( &ctx.jh, vhash, 64 );
+    jh512_4way_close( &ctx.jh, vhash );
+
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  m256_zero );
+
+       blake512_4way_init( &ctx.blake );
+       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_close( &ctx.blake, vhashA );
+
+       bmw512_4way_init( &ctx.bmw );
+       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_close( &ctx.bmw, vhashB );
+
+    for ( int i = 0; i < 8; i++ )   // mask set (bit clear) -> bmw, else blake
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+    keccak512_4way( &ctx.keccak, vhash, 64 );
+    keccak512_4way_close( &ctx.keccak, vhash );
+
+    skein512_4way_init( &ctx.skein );
+    skein512_4way( &ctx.skein, vhash, 64 );
+    skein512_4way_close( &ctx.skein, vhash );
+
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  m256_zero );
+
+       keccak512_4way_init( &ctx.keccak );
+       keccak512_4way( &ctx.keccak, vhash, 64 );
+       keccak512_4way_close( &ctx.keccak, vhashA );
+
+       jh512_4way_init( &ctx.jh );
+       jh512_4way( &ctx.jh, vhash, 64 );
+       jh512_4way_close( &ctx.jh, vhashB );
+
+    for ( int i = 0; i < 8; i++ )   // mask set (bit clear) -> jh, else keccak
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+}
+
+int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{
+    // Scan nonces four at a time starting at work->data[19]. Stops after the
+    // first batch with a hit, at max_nonce, or on restart; returns the hits.
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    int num_found = 0;
+    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+    uint32_t *noncep1 = vdata + 75;
+    uint32_t *noncep2 = vdata + 77;
+    uint32_t *noncep3 = vdata + 79;
+    const uint32_t Htarg = ptarget[7];
+    uint64_t htmax[] = {
+                0,
+                0xF,
+                0xFF,
+                0xFFF,
+                0xFFFF,
+                0x10000000
+        };
+    uint32_t masks[] = {
+                0xFFFFFFFF,
+                0xFFFFFFF0,
+                0xFFFFFF00,
+                0xFFFFF000,
+                0xFFFF0000,
+                0
+        };
+
+    swab32_array( endiandata, pdata, 20 );
+
+    uint64_t *edata = (uint64_t*)endiandata;
+    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+    for (int m=0; m < 6; m++)   // first htmax[m] >= Htarg selects the mask
+       if (Htarg <= htmax[m])
+       {
+          uint32_t mask = masks[m];
+          do {
+              found[0] = found[1] = found[2] = found[3] = false;
+              be32enc( noncep0, n   );
+              be32enc( noncep1, n+1 );
+              be32enc( noncep2, n+2 );
+              be32enc( noncep3, n+3 );
+
+              anime_4way_hash( hash, vdata );
+              pdata[19] = n;
+
+             if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+             {
+                found[0] = true;
+                num_found++;
+                nonces[0] = n;
+                work_set_target_ratio( work, hash );
+             }
+             if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget ) )
+             {
+                found[1] = true;
+                num_found++;
+                nonces[1] = n+1;
+                work_set_target_ratio( work, hash+8 );   // this lane's own hash
+             }
+             if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) )
+             {
+                found[2] = true;
+                num_found++;
+                nonces[2] = n+2;
+                work_set_target_ratio( work, hash+16 );
+             }
+             if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) )
+             {
+                found[3] = true;
+                num_found++;
+                nonces[3] = n+3;
+                work_set_target_ratio( work, hash+24 );
+             }
+             n += 4;
+          } while ( ( num_found == 0 ) && ( n < max_nonce )
+              && !work_restart[thr_id].restart );
+          break;
+       }
+
+    *hashes_done = n - first_nonce + 1;
+    return num_found;
+}
+
+#endif
--- a/algo/quark/anime-gate.c
+++ b/algo/quark/anime-gate.c
@@ -0,0 +1,17 @@
+#include "anime-gate.h"
+
+bool register_anime_algo( algo_gate_t* gate )
+{  // Init the template contexts and fill the gate; always returns true.
+#if defined (ANIME_4WAY)   // set by anime-gate.h when __AVX2__ and __AES__ are defined
+  init_anime_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_anime_4way;
+  gate->hash      = (void*)&anime_4way_hash;
+#else                      // one-hash-at-a-time implementation
+  init_anime_ctx();
+  gate->scanhash  = (void*)&scanhash_anime;
+  gate->hash      = (void*)&anime_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;   // same flags for both builds
+  return true;
+};
+
--- a/algo/quark/anime-gate.h
+++ b/algo/quark/anime-gate.h
@@ -0,0 +1,32 @@
+#ifndef ANIME_GATE_H__
+#define ANIME_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define ANIME_4WAY
+#endif
+
+bool register_anime_algo( algo_gate_t* gate );
+
+#if defined(ANIME_4WAY)
+
+void anime_4way_hash( void *state, const void *input );
+
+int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_anime_4way_ctx();
+
+#endif
+
+void anime_hash( void *state, const void *input );
+
+int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_anime_ctx();
+
+#endif
+
--- a/algo/quark/anime.c
+++ b/algo/quark/anime.c
@@ -0,0 +1,189 @@
+#include "cpuminer-config.h"
+#include "anime-gate.h"
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#ifdef __AES__
+ #include "algo/groestl/aes_ni/hash-groestl.h"
+#else
+ #include "algo/groestl/sph_groestl.h"
+#endif
+
+typedef struct {                       // one context per hash function used by anime_hash
+    sph_blake512_context  blake;
+    sph_bmw512_context    bmw;
+#ifdef __AES__
+    hashState_groestl groestl;         // groestl state from algo/groestl/aes_ni
+#else
+    sph_groestl512_context groestl;    // groestl state from the sph implementation
+#endif
+    sph_jh512_context      jh;
+    sph_skein512_context   skein;
+    sph_keccak512_context  keccak;
+} anime_ctx_holder;
+
+anime_ctx_holder anime_ctx __attribute__ ((aligned (64)));   // template set up by init_anime_ctx, copied by anime_hash on every call
+
+void init_anime_ctx()   // one-time setup of the template contexts held in anime_ctx
+{
+#ifndef __AES__
+    sph_groestl512_init( &anime_ctx.groestl );   // sph groestl
+#else
+    init_groestl( &anime_ctx.groestl, 64 );      // aes_ni groestl
+#endif
+    sph_keccak512_init( &anime_ctx.keccak );
+    sph_jh512_init( &anime_ctx.jh );
+    sph_skein512_init( &anime_ctx.skein );
+    sph_bmw512_init( &anime_ctx.bmw );
+    sph_blake512_init( &anime_ctx.blake );
+}
+
+void anime_hash( void *state, const void *input )
+{
+    unsigned char hash[128] __attribute__ ((aligned (32)));
+/*
+ * Scalar Anime hash of one 80 byte block header.
+ *
+ * Chain, each stage consuming the previous 64 byte digest:
+ *    bmw512 -> blake512 -> (groestl512 | skein512) -> groestl512 -> jh512
+ *    -> (blake512 | bmw512) -> keccak512 -> skein512 -> (keccak512 | jh512)
+ * For every "(a | b)" stage, a runs when bit 3 (value 8) of the first
+ * digest byte is set, otherwise b runs.
+ *
+ * state: receives the first 32 bytes of the final digest.
+ * input: 80 bytes are read from it.
+ *
+ * Works on a private copy of the anime_ctx template contexts.
+ */
+    uint32_t mask = 8;                 // selector bit tested on hash[0]
+    anime_ctx_holder ctx;
+    memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) );
+
+    sph_bmw512( &ctx.bmw, input, 80 );
+    sph_bmw512_close( &ctx.bmw, hash );
+
+    sph_blake512( &ctx.blake, hash, 64 );
+    sph_blake512_close( &ctx.blake, hash );
+
+    if ( ( hash[0] & mask ) != 0 ) // first conditional stage: groestl or skein
+    {
+#ifdef __AES__
+       update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 );
+       reinit_groestl( &ctx.groestl );       // context is used again by the groestl stage below
+#else
+       sph_groestl512 ( &ctx.groestl, hash, 64 );
+       sph_groestl512_close( &ctx.groestl, hash );
+       sph_groestl512_init( &ctx.groestl );  // context is used again by the groestl stage below
+#endif
+    }
+    else
+    {
+       sph_skein512( &ctx.skein, hash, 64 );
+       sph_skein512_close( &ctx.skein, hash );
+       sph_skein512_init( &ctx.skein );      // context is used again by the later skein stage
+    }
+
+#ifdef __AES__
+    update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 );
+#else
+    sph_groestl512 ( &ctx.groestl, hash, 64 );
+    sph_groestl512_close( &ctx.groestl, hash );
+#endif
+
+    sph_jh512( &ctx.jh, hash, 64 );
+    sph_jh512_close( &ctx.jh, hash );
+
+    if ( ( hash[0] & mask ) != 0 )    // second conditional stage: blake or bmw
+    {
+       sph_blake512_init( &ctx.blake );      // blake already ran once above
+       sph_blake512( &ctx.blake, hash, 64 );
+       sph_blake512_close( &ctx.blake, hash );
+    }
+    else
+    {
+       sph_bmw512_init( &ctx.bmw );          // bmw already ran once above
+       sph_bmw512( &ctx.bmw, hash, 64 );
+       sph_bmw512_close( &ctx.bmw, hash );
+    }
+
+    sph_keccak512( &ctx.keccak, hash, 64 );
+    sph_keccak512_close( &ctx.keccak, hash );
+
+    sph_skein512( &ctx.skein, hash, 64 );
+    sph_skein512_close( &ctx.skein, hash );
+
+    if ( ( hash[0] & mask ) != 0 )    // third conditional stage: keccak or jh
+    {
+       sph_keccak512_init( &ctx.keccak );    // keccak already ran once above
+       sph_keccak512( &ctx.keccak, hash, 64 );
+       sph_keccak512_close( &ctx.keccak, hash );
+    }
+    else
+    {
+       sph_jh512_init( &ctx.jh );            // jh already ran once above
+       sph_jh512( &ctx.jh, hash, 64 );
+       sph_jh512_close( &ctx.jh, hash );
+    }
+
+   memcpy( state, hash, 32 );         // only the first 32 bytes are returned
+}
+
+/*
+ * Scan nonces with the scalar Anime hash, from work->data[19] until
+ * max_nonce, a work restart, or the first hash that passes fulltest.
+ * hashes_done receives the number of hashes computed by this call.
+ * Returns true with work->data[19] holding the winning nonce, else 0
+ * with work->data[19] holding the next nonce to try.
+ * NOTE(review): nothing is scanned when target[7] > 0x10000000.
+ */
+int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{
+    uint32_t hash[8] __attribute__ ((aligned (64)));
+    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t Htarg = ptarget[7];
+    // Pre-filter: the first m with Htarg <= htmax[m] selects masks[m], and
+    // fulltest only runs when ( hash[7] & masks[m] ) is zero.
+    const uint64_t htmax[] = {          0,        0xF,       0xFF,
+                                    0xFFF,     0xFFFF, 0x10000000  };
+    const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                               0xFFFFF000, 0xFFFF0000,          0  };
+
+    swab32_array( endiandata, pdata, 20 );
+
+    for (int m=0; m < 6; m++)
+       if (Htarg <= htmax[m])
+       {
+          uint32_t mask = masks[m];
+          do
+          {
+              be32enc( &endiandata[19], n );
+              anime_hash( hash, endiandata );
+              pdata[19] = n;
+
+             if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) )
+             {
+                work_set_target_ratio( work, hash );
+                *hashes_done = n - first_nonce + 1;
+                return true;
+             }
+             n++;
+          } while ( ( n < max_nonce ) && !work_restart[thr_id].restart );
+          break;
+       }
+
+    // Also report the hash count when the scan ends without a share.
+    *hashes_done = n - first_nonce;
+    pdata[19] = n;
+    return 0;
+}
+
--- a/algo/quark/quark-4way.c
+++ b/algo/quark/quark-4way.c
@@ -0,0 +1,207 @@
+#include "cpuminer-config.h"
+#include "quark-gate.h"
+
+#if defined (QUARK_4WAY)
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+typedef struct {                       // one context per hash function used by quark_4way_hash
+    blake512_4way_context  blake;
+    bmw512_4way_context    bmw;
+    hashState_groestl      groestl;    // not a 4way context: quark_4way_hash runs groestl one lane at a time
+    jh512_4way_context     jh;
+    skein512_4way_context  skein;
+    keccak512_4way_context keccak;
+} quark_4way_ctx_holder;
+
+quark_4way_ctx_holder quark_4way_ctx __attribute__ ((aligned (64)));   // template set up by init_quark_4way_ctx, copied by quark_4way_hash on every call
+
+void init_quark_4way_ctx()   // one-time setup of the template contexts held in quark_4way_ctx
+{
+    init_groestl( &quark_4way_ctx.groestl, 64 );
+    keccak512_4way_init( &quark_4way_ctx.keccak );
+    jh512_4way_init( &quark_4way_ctx.jh );
+    skein512_4way_init( &quark_4way_ctx.skein );
+    bmw512_4way_init( &quark_4way_ctx.bmw );
+    blake512_4way_init( &quark_4way_ctx.blake );
+}
+
+void quark_4way_hash( void *state, const void *input )   // Quark hash of four interleaved 80 byte headers; writes 4 x 32 bytes to state
+{
+    uint64_t hash0[8] __attribute__ ((aligned (64)));     // hash0..hash3: per-lane buffers for the one-lane groestl
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t vhash[8*4] __attribute__ ((aligned (64)));   // running digest of all four lanes, interleaved
+    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));  // candidate kept where the selector bit is set
+    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));  // candidate kept where the selector bit is clear
+    __m256i* vh  = (__m256i*)vhash;
+    __m256i* vhA = (__m256i*)vhashA;
+    __m256i* vhB = (__m256i*)vhashB;
+    __m256i vh_mask;
+    __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 );
+    int i;
+    quark_4way_ctx_holder ctx;
+    memcpy( &ctx, &quark_4way_ctx, sizeof(quark_4way_ctx) );
+    // blake512 over the 80 byte headers, then bmw512 over the 64 byte digests.
+    blake512_4way( &ctx.blake, input, 80 );
+    blake512_4way_close( &ctx.blake, vhash );
+
+    bmw512_4way( &ctx.bmw, vhash, 64 );
+    bmw512_4way_close( &ctx.bmw, vhash );
+    // vh_mask: all ones in each 64 bit element of vh[0] whose bit 3 is clear.
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  m256_zero );
+    // Both candidates are computed for all lanes: groestl512 into vhashA ...
+       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+       update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                               (char*)hash0, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                               (char*)hash1, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                               (char*)hash2, 512 );
+       reinit_groestl( &ctx.groestl );
+       update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                               (char*)hash3, 512 );
+       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+    // ... and skein512 of the same bmw digests (vhash is still intact) into vhashB.
+       skein512_4way( &ctx.skein, vhash, 64 );
+       skein512_4way_close( &ctx.skein, vhashB );
+    // Keep vhashB where vh_mask is set (bit clear), vhashA elsewhere (bit set).
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    // Unconditional groestl512, again one lane at a time.
+    mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 );
+    reinit_groestl( &ctx.groestl );
+    update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 );
+    mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+    jh512_4way( &ctx.jh, vhash, 64 );
+    jh512_4way_close( &ctx.jh, vhash );
+
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  m256_zero );
+    // Second selection: blake512 (vhashA) where bit 3 is set, bmw512 (vhashB) where clear.
+       blake512_4way_init( &ctx.blake );
+       blake512_4way( &ctx.blake, vhash, 64 );
+       blake512_4way_close( &ctx.blake, vhashA );
+
+       bmw512_4way_init( &ctx.bmw );
+       bmw512_4way( &ctx.bmw, vhash, 64 );
+       bmw512_4way_close( &ctx.bmw, vhashB );
+
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+    keccak512_4way( &ctx.keccak, vhash, 64 );
+    keccak512_4way_close( &ctx.keccak, vhash );
+
+    skein512_4way_init( &ctx.skein );
+    skein512_4way( &ctx.skein, vhash, 64 );
+    skein512_4way_close( &ctx.skein, vhash );
+
+    vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ),
+                                  m256_zero );
+    // Third selection: keccak512 (vhashA) where bit 3 is set, jh512 (vhashB) where clear.
+       keccak512_4way_init( &ctx.keccak );
+       keccak512_4way( &ctx.keccak, vhash, 64 );
+       keccak512_4way_close( &ctx.keccak, vhashA );
+
+       jh512_4way_init( &ctx.jh );
+       jh512_4way( &ctx.jh, vhash, 64 );
+       jh512_4way_close( &ctx.jh, vhashB );
+
+    for ( i = 0; i < 8; i++ )
+       vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    // Write the first 256 bits of each lane to state, 32 bytes apart.
+    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+}
+
+/*
+ * Scan nonces with the 4-lane Quark hash: each pass hashes nonces n, n+1,
+ * n+2 and n+3 starting from work->data[19], and the scan stops at
+ * max_nonce, on a work restart, or after the first pass that found a share.
+ *
+ * For every lane whose hash passes fulltest, work->nfound[lane] is set and
+ * the nonce is stored in work->nonces[lane].
+ * hashes_done receives n - first_nonce + 1, n being the next untried nonce.
+ * Returns the number of shares found in the last pass.
+ */
+int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[20] __attribute__((aligned(64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint32_t n = pdata[19];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    int num_found = 0;
+    // Slots inside vdata that receive the nonce of each lane.
+    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+    uint32_t *noncep1 = vdata + 75;
+    uint32_t *noncep2 = vdata + 77;
+    uint32_t *noncep3 = vdata + 79;
+
+    swab32_array( endiandata, pdata, 20 );
+
+    // All four lanes start from the same header; only the nonce differs.
+    uint64_t *edata = (uint64_t*)endiandata;
+    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+    do
+    {
+       found[0] = found[1] = found[2] = found[3] = false;
+       be32enc( noncep0, n   );
+       be32enc( noncep1, n+1 );
+       be32enc( noncep2, n+2 );
+       be32enc( noncep3, n+3 );
+
+       quark_4way_hash( hash, vdata );
+       pdata[19] = n;
+
+       // quark_4way_hash stores 8 words per lane, lane after lane.
+       for ( int lane = 0; lane < 4; lane++ )
+       {
+          uint32_t *lane_hash = hash + 8*lane;
+
+          if ( ( lane_hash[7] & 0xFFFFFF00 ) == 0
+               && fulltest( lane_hash, ptarget ) )
+          {
+             found[lane] = true;
+             num_found++;
+             nonces[lane] = n + lane;
+             // Rate the share with the hash of the lane that produced it,
+             // not unconditionally with the hash of lane 0.
+             work_set_target_ratio( work, lane_hash );
+          }
+       }
+       n += 4;
+    } while ( ( num_found == 0 ) && ( n < max_nonce )
+              && !work_restart[thr_id].restart );
+
+    *hashes_done = n - first_nonce + 1;
+    return num_found;
+}
+
+#endif
--- a/algo/quark/quark-gate.c
+++ b/algo/quark/quark-gate.c
@@ -0,0 +1,17 @@
+#include "quark-gate.h"
+
+bool register_quark_algo( algo_gate_t *gate )   // install the Quark entry points into gate; always returns true
+{
+#ifdef QUARK_4WAY
+   init_quark_4way_ctx();                        // 4way build, selected in quark-gate.h
+   gate->hash     = (void*)quark_4way_hash;
+   gate->scanhash = (void*)scanhash_quark_4way;
+#else
+   init_quark_ctx();                             // scalar build
+   gate->hash     = (void*)quark_hash;
+   gate->scanhash = (void*)scanhash_quark;
+#endif
+   gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+   return true;
+}
+
--- a/algo/quark/quark-gate.h
+++ b/algo/quark/quark-gate.h
@@ -0,0 +1,32 @@
+#ifndef QUARK_GATE_H__
+#define QUARK_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define QUARK_4WAY
+#endif
+
+bool register_quark_algo( algo_gate_t* gate );
+
+#if defined(QUARK_4WAY)
+
+void quark_4way_hash( void *state, const void *input );
+
+int scanhash_quark_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_quark_4way_ctx();
+
+#endif
+
+void quark_hash( void *state, const void *input );
+
+int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_quark_ctx();
+
+#endif
+
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -1,5 +1,5 @@
 #include "cpuminer-config.h"
-#include "algo-gate-api.h"
+#include "quark-gate.h"

 #include <stdio.h>
 #include <string.h>
@@ -47,7 +47,7 @@ void init_quark_ctx()
 #endif
 }

-inline static void quarkhash(void *state, const void *input)
+void quark_hash(void *state, const void *input)
 {
    unsigned char hashbuf[128];
    size_t hashptr;
@@ -187,11 +187,12 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
 	do {
 		pdata[19] = ++n;
 		be32enc(&endiandata[19], n); 
-		quarkhash(hash64, &endiandata);
+		quark_hash(hash64, &endiandata);
                if ((hash64[7]&0xFFFFFF00)==0)
                {
                  if (fulltest(hash64, ptarget)) 
                  {
+                    work_set_target_ratio( work, hash64 );
                    *hashes_done = n - first_nonce + 1;
 		    return true;
                  }
@@ -203,12 +204,3 @@ int scanhash_quark( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-bool register_quark_algo( algo_gate_t* gate )
-{
-  init_quark_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->scanhash         = (void*)&scanhash_quark;
-  gate->hash             = (void*)&quarkhash;
-  return true;
-};
-
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -0,0 +1,130 @@
+#include "deep-gate.h"
+
+#if defined(DEEP_2WAY)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/shavite/sph_shavite.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+typedef struct                          // template contexts for the 2-lane Deep hash
+{
+        luffa_2way_context      luffa;  // 2-lane context; scanhash_deep_2way re-initializes it and feeds it the first 64 header bytes
+        cubehashParam           cube;   // single-lane context
+        sph_shavite512_context  shavite; // NOTE(review): the scalar holder in deep.c has no shavite member - confirm this one is wanted
+        hashState_echo          echo;   // single-lane context
+} deep_2way_ctx_holder;
+
+deep_2way_ctx_holder deep_2way_ctx;     // set up by init_deep_2way_ctx, copied by deep_2way_hash on every call
+
+void init_deep_2way_ctx()   // one-time setup of the template contexts held in deep_2way_ctx
+{
+        init_echo( &deep_2way_ctx.echo, 512 );
+        sph_shavite512_init( &deep_2way_ctx.shavite );
+        cubehashInit( &deep_2way_ctx.cube, 512, 16, 32 );
+        luffa_2way_init( &deep_2way_ctx.luffa, 512 );
+}
+
+/*
+ * Deep hash of two interleaved headers: luffa512 -> cubehash512 -> echo512,
+ * the same chain as the scalar deep_hash (no shavite stage).  The first 64
+ * bytes of each header are expected to be already absorbed into
+ * deep_2way_ctx.luffa, so only the remaining 16 bytes per lane are fed here.
+ * output receives 32 bytes for lane 0 followed by 32 bytes for lane 1.
+ */
+void deep_2way_hash( void *output, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*2] __attribute__ ((aligned (64)));
+     deep_2way_ctx_holder ctx;
+
+     memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) );
+     luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
+     luffa_2way_close( &ctx.luffa, vhash );
+     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
+                           (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+
+     memcpy( output,    hash0, 32 );
+     memcpy( output+32, hash1, 32 );
+}
+
+/*
+ * Scan nonces two at a time (n, n+1) with the 2-lane Deep hash, starting at
+ * work->data[19]; stops at max_nonce, on a work restart, or once a pass
+ * found a share.  Found nonces go to work->nonces[lane] with
+ * work->nfound[lane] set.  Returns the number of shares found.
+ */
+int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 32+3;   // 4*8 + 3
+     uint32_t *noncep1 = vdata + 32+7;
+     const uint32_t Htarg = ptarget[7];
+     uint64_t htmax[] = {          0,        0xF,       0xFF,
+                               0xFFF,     0xFFFF, 0x10000000  };
+     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                          0xFFFFF000, 0xFFFF0000,          0  };
+
+     swab32_array( endiandata, pdata, 20 );
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
+     // Feed the first 64 header bytes into the template luffa context once.
+     luffa_2way_init( &deep_2way_ctx.luffa, 512 );
+     luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 );
+
+     for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] )
+     {
+        uint32_t mask = masks[m];
+        do
+        {
+            found[0] = found[1] = false;
+            be32enc( noncep0, n   );
+            be32enc( noncep1, n+1 );
+            deep_2way_hash( hash, vdata );
+            pdata[19] = n;
+            for ( int lane = 0; lane < 2; lane++ )
+            {
+               uint32_t *lane_hash = hash + 8*lane;   // deep_2way_hash writes 8 words per lane
+               if ( !( lane_hash[7] & mask ) && fulltest( lane_hash, ptarget) )
+               {
+                  found[lane] = true;
+                  num_found++;
+                  nonces[lane] = n + lane;
+                  work_set_target_ratio( work, lane_hash );   // the lane's own hash; must stay inside hash[]
+               }
+            }
+            n += 2;
+         } while ( ( num_found == 0 ) && ( n < max_nonce )
+                   && !work_restart[thr_id].restart );
+         break;
+     }
+     *hashes_done = n - first_nonce + 1;
+     return num_found;
+}
+
+#endif
--- a/algo/qubit/deep-gate.c
+++ b/algo/qubit/deep-gate.c
@@ -0,0 +1,17 @@
+#include "deep-gate.h"
+
+bool register_deep_algo( algo_gate_t *gate )   // install the Deep entry points into gate; always returns true
+{
+#ifdef DEEP_2WAY
+   init_deep_2way_ctx();                        // 2way build, selected in deep-gate.h
+   gate->hash     = (void*)deep_2way_hash;
+   gate->scanhash = (void*)scanhash_deep_2way;
+#else
+   init_deep_ctx();                             // scalar build
+   gate->hash     = (void*)deep_hash;
+   gate->scanhash = (void*)scanhash_deep;
+#endif
+   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+   return true;
+}
+
--- a/algo/qubit/deep-gate.h
+++ b/algo/qubit/deep-gate.h
@@ -0,0 +1,32 @@
+#ifndef DEEP_GATE_H__
+#define DEEP_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define DEEP_2WAY
+#endif
+
+bool register_deep_algo( algo_gate_t* gate );
+
+#if defined(DEEP_2WAY)
+
+void deep_2way_hash( void *state, const void *input );
+
+int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_deep_2way_ctx();
+
+#endif
+
+void deep_hash( void *state, const void *input );
+
+int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_deep_ctx();
+
+#endif
+
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -1,31 +1,20 @@
-#include "algo-gate-api.h"
-
+#include "deep-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-
-#include "algo/luffa/sse2/luffa_for_sse2.h" 
+#include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
-#include "algo/simd/sse2/nist.h"
-#include "algo/shavite/sph_shavite.h"
-
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
+#else
+#include "algo/echo/sph_echo.h"
 #endif

 typedef struct
 {
        hashState_luffa         luffa;
        cubehashParam           cubehash;
-        sph_shavite512_context  shavite;
-        hashState_sd            simd;
 #ifdef NO_AES_NI
        sph_echo512_context echo;
 #else
@@ -133,6 +122,7 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
 	        	if (!(hash64[7] & mask)) {
 		            printf("[%d]",thr_id);
 			    if (fulltest(hash64, ptarget)) {
+                             work_set_target_ratio( work, hash64 );
                             *hashes_done = n - first_nonce + 1;
 				return true;
 	                    }
@@ -149,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-bool register_deep_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  init_deep_ctx();
-  gate->scanhash = (void*)&scanhash_deep;
-  gate->hash     = (void*)&deep_hash;
-  return true;
-};
-
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -0,0 +1,138 @@
+#include "qubit-gate.h"
+
+#if defined(QUBIT_2WAY)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "algo/luffa/luffa-hash-2way.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/simd/simd-hash-2way.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+typedef struct                          // one context per hash function used by qubit_2way_hash
+{
+        luffa_2way_context      luffa;  // 2-lane context; scanhash_qubit_2way re-initializes it and feeds it the first 64 header bytes
+        cubehashParam           cube;   // single-lane context, restored from the template between lanes
+        sph_shavite512_context  shavite; // single-lane context, restored from the template between lanes
+        simd_2way_context       simd;   // 2-lane context
+        hashState_echo          echo;   // single-lane context, restored from the template between lanes
+} qubit_2way_ctx_holder;
+
+qubit_2way_ctx_holder qubit_2way_ctx;   // set up by init_qubit_2way_ctx, copied by qubit_2way_hash on every call
+
+void init_qubit_2way_ctx()   // one-time setup of the template contexts held in qubit_2way_ctx
+{
+        init_echo( &qubit_2way_ctx.echo, 512 );
+        simd_2way_init( &qubit_2way_ctx.simd, 512 );
+        sph_shavite512_init( &qubit_2way_ctx.shavite );
+        cubehashInit( &qubit_2way_ctx.cube, 512, 16, 32 );
+        luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
+}
+
+void qubit_2way_hash( void *output, const void *input )   // Qubit hash of two interleaved headers; writes 2 x 32 bytes to output
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));      // lane 0 digest for the single-lane stages
+     uint64_t hash1[8] __attribute__ ((aligned (64)));      // lane 1 digest for the single-lane stages
+     uint64_t vhash[8*2] __attribute__ ((aligned (64)));    // both lanes interleaved for the 2-lane stages
+     qubit_2way_ctx_holder ctx;
+     // luffa: skips the first 64 bytes per lane, which scanhash_qubit_2way feeds into the template context.
+     memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) );
+     luffa_2way_update( &ctx.luffa, input + (64<<1), 16 );
+     luffa_2way_close( &ctx.luffa, vhash );
+     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     // cubehash: one lane at a time, restoring the context from the template in between.
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash0,
+                           (const byte*) hash0, 64 );
+     memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) );
+     cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 );
+     // shavite: same one-lane-at-a-time pattern.
+     sph_shavite512( &ctx.shavite, hash0, 64 );
+     sph_shavite512_close( &ctx.shavite, hash0 );
+     memcpy( &ctx.shavite, &qubit_2way_ctx.shavite,
+             sizeof(sph_shavite512_context) );
+     sph_shavite512( &ctx.shavite, hash1, 64 );
+     sph_shavite512_close( &ctx.shavite, hash1 );
+     // simd: both lanes at once, so re-interleave around the call.
+     mm256_interleave_2x128( vhash, hash0, hash1, 512 );
+     simd_2way_update_close( &ctx.simd, vhash, vhash, 512 );
+     mm256_deinterleave_2x128( hash0, hash1, vhash, 512 );
+     // echo: one lane at a time again.
+     update_final_echo( &ctx.echo, (BitSequence *)hash0,
+                       (const BitSequence *) hash0, 512 );
+     memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) );
+     update_final_echo( &ctx.echo, (BitSequence *)hash1,
+                       (const BitSequence *) hash1, 512 );
+     // Only the first 32 bytes of each lane are returned, lane 0 first.
+     memcpy( output,    hash0, 32 );
+     memcpy( output+32, hash1, 32 );
+}
+
+/*
+ * Scan nonces two at a time (n, n+1) with the 2-lane Qubit hash, starting
+ * at work->data[19]; stops at max_nonce, on a work restart, or once a pass
+ * found a share.  Found nonces go to work->nonces[lane] with
+ * work->nfound[lane] set.  Returns the number of shares found.
+ */
+int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     uint32_t n = first_nonce;
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 32+3;   // 4*8 + 3
+     uint32_t *noncep1 = vdata + 32+7;
+     const uint32_t Htarg = ptarget[7];
+     uint64_t htmax[] = {          0,        0xF,       0xFF,
+                               0xFFF,     0xFFFF, 0x10000000  };
+     uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                          0xFFFFF000, 0xFFFF0000,          0  };
+     int m = 0;
+
+     swab32_array( endiandata, pdata, 20 );
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 );
+     // Feed the first 64 header bytes into the template luffa context once.
+     luffa_2way_init( &qubit_2way_ctx.luffa, 512 );
+     luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 );
+
+     while ( m < 6 && Htarg > htmax[m] ) m++;   // first table row that covers Htarg
+     if ( m < 6 )
+     {
+        const uint32_t mask = masks[m];
+        do
+        {
+            found[0] = found[1] = false;
+            be32enc( noncep0, n   );
+            be32enc( noncep1, n+1 );
+            qubit_2way_hash( hash, vdata );
+            pdata[19] = n;
+            for ( int lane = 0; lane < 2; lane++ )
+            {
+               uint32_t *lane_hash = hash + 8*lane;   // qubit_2way_hash writes 8 words per lane
+               if ( !( lane_hash[7] & mask ) && fulltest( lane_hash, ptarget) )
+               {
+                  found[lane] = true;
+                  num_found++;
+                  nonces[lane] = n + lane;
+                  work_set_target_ratio( work, lane_hash );
+               }
+            }
+            n += 2;
+        } while ( ( num_found == 0 ) && ( n < max_nonce )
+                  && !work_restart[thr_id].restart );
+     }
+     *hashes_done = n - first_nonce + 1;
+     return num_found;
+}
+
+#endif
--- a/algo/qubit/qubit-gate.c
+++ b/algo/qubit/qubit-gate.c
@@ -0,0 +1,17 @@
+#include "qubit-gate.h"
+
+bool register_qubit_algo( algo_gate_t *gate )   // install the Qubit entry points into gate; always returns true
+{
+#ifdef QUBIT_2WAY
+   init_qubit_2way_ctx();                        // 2way build, selected in qubit-gate.h
+   gate->hash     = (void*)qubit_2way_hash;
+   gate->scanhash = (void*)scanhash_qubit_2way;
+#else
+   init_qubit_ctx();                             // scalar build
+   gate->hash     = (void*)qubit_hash;
+   gate->scanhash = (void*)scanhash_qubit;
+#endif
+   gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+   return true;
+}
+
--- a/algo/qubit/qubit-gate.h
+++ b/algo/qubit/qubit-gate.h
@@ -0,0 +1,32 @@
+#ifndef QUBIT_GATE_H__
+#define QUBIT_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define QUBIT_2WAY
+#endif
+
+bool register_qubit_algo( algo_gate_t* gate );
+
+#if defined(QUBIT_2WAY)
+
+void qubit_2way_hash( void *state, const void *input );
+
+int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_qubit_2way_ctx();
+
+#endif
+
+void qubit_hash( void *state, const void *input );
+
+int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_qubit_ctx();
+
+#endif
+
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -1,23 +1,16 @@
-#include "algo-gate-api.h"
-
+#include "qubit-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-
-#include "algo/luffa/sse2/luffa_for_sse2.h" 
+#include "algo/luffa/luffa_for_sse2.h" 
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
-#include "algo/simd/sse2/nist.h"
+#include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
-
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
+#else
+#include "algo/echo/sph_echo.h"
 #endif

 typedef struct
@@ -55,7 +48,7 @@ void qubit_luffa_midstate( const void* input )
    update_luffa( &qubit_luffa_mid, input, 64 );
 }

-void qubithash(void *output, const void *input)
+void qubit_hash(void *output, const void *input)
 {
        unsigned char hash[128] __attribute((aligned(64)));
        #define hashB hash+64
@@ -122,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work,
                {
 	            pdata[19] = ++n;
 		    be32enc(&endiandata[19], n);
-		    qubithash(hash64, endiandata);
+		    qubit_hash(hash64, endiandata);
 #ifndef DEBUG_ALGO
 		    if (!(hash64[7] & mask))
                    {
@@ -141,6 +134,7 @@ int scanhash_qubit(int thr_id, struct work *work,
 	        	if (!(hash64[7] & mask)) {
 		            printf("[%d]",thr_id);
 			    if (fulltest(hash64, ptarget)) {
+                             work_set_target_ratio( work, hash64 );
                             *hashes_done = n - first_nonce + 1;
 				return true;
 	                    }
@@ -157,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work,
 	return 0;
 }

-bool register_qubit_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  init_qubit_ctx();
-  gate->scanhash = (void*)&scanhash_qubit;
-  gate->hash     = (void*)&qubithash;
-  return true;
-};
-
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -754,6 +754,7 @@ extern int scanhash_scrypt( int thr_id, struct work *work, uint32_t max_nonce,
 			if (unlikely(hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget))) {
 				*hashes_done = n - pdata[19] + 1;
 				pdata[19] = data[i * 20 + 19];
+                                work_set_target_ratio( work, hash );
 				return 1;
 			}
 		}
@@ -777,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id )

 bool register_scrypt_algo( algo_gate_t* gate )
 {
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
  gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
  gate->scanhash         = (void*)&scanhash_scrypt;
 //  gate->hash             = (void*)&scrypt_1024_1_1_256_24way;
--- a/algo/scryptjane/scrypt-jane-chacha.h
+++ b/algo/scryptjane/scrypt-jane-chacha.h
@@ -114,7 +114,7 @@ available_implementations() {
 	return flags;
 }
 #endif
-
+/*
 static int
 scrypt_test_mix() {
 	static const uint8_t expected[16] = {
@@ -145,4 +145,4 @@ scrypt_test_mix() {

 	return ret;
 }
-
+*/
--- a/algo/scryptjane/scrypt-jane-hash.h
+++ b/algo/scryptjane/scrypt-jane-hash.h
@@ -26,7 +26,7 @@
 #include "scrypt-jane-pbkdf2.h"

 #define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */
-
+/*
 static int
 scrypt_test_hash() {
 	scrypt_hash_state st;
@@ -45,4 +45,4 @@ scrypt_test_hash() {
 	scrypt_hash_finish(&st, final);
 	return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE);
 }
-
+*/
--- a/algo/sha/md-helper-4way.c
+++ b/algo/sha/md-helper-4way.c
@@ -0,0 +1,270 @@
+/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * This file contains some functions which implement the external data
+ * handling and padding for Merkle-Damgard hash functions which follow
+ * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
+ *
+ * API: this file is meant to be included, not compiled as a stand-alone
+ * file. Some macros must be defined:
+ *   RFUN   name for the round function
+ *   HASH   "short name" for the hash function
+ *   BE32   defined for big-endian, 32-bit based (e.g. SHA-1)
+ *   LE32   defined for little-endian, 32-bit based (e.g. MD5)
+ *   BE64   defined for big-endian, 64-bit based (e.g. SHA-512)
+ *   LE64   defined for little-endian, 64-bit based (no example yet)
+ *   PW01   if defined, append 0x01 instead of 0x80 (for Tiger)
+ *   BLEN   if defined, length of a message block (in bytes)
+ *   PLW1   if defined, length is defined on one 64-bit word only (for Tiger)
+ *   PLW4   if defined, length is defined on four 64-bit words (for WHIRLPOOL)
+ *   SVAL   if defined, reference to the context state information
+ *
+ * BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
+ * this is used for instance for Tiger, which works on 64-bit words but
+ * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
+ * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
+ * set, then only one word (64 bits) will be used to encode the input
+ * message length (in bits), otherwise two words will be used (as in
+ * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
+ * not PLW1), four 64-bit words will be used to encode the message length
+ * (in bits). Note that regardless of those settings, only 64-bit message
+ * lengths are supported (in bits): messages longer than 2 Exabytes will be
+ * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
+ * 2 million Terabytes, which is huge).
+ *
+ * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
+ * function. This is used for Tiger2, which is identical to Tiger except
+ * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
+ * of the 0x01 from original Tiger).
+ *
+ * The RFUN function is invoked with two arguments, the first pointing to
+ * aligned data (as a "const void *"), the second being state information
+ * from the context structure. By default, this state information is the
+ * "val" field from the context, and this field is assumed to be an array
+ * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
+ * The "val" field can have any type, except for the output encoding,
+ * which assumes that it is an array of "__m256i" vectors. Defining SVAL
+ * sets SPH_NO_OUTPUT, which deactivates this last step; the includer
+ * code is then responsible for writing out the hash result. When
+ * SPH_NO_OUTPUT is set, the "dst" and "rnum" parameters of the close
+ * functions are ignored.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)     SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)    a ## b
+
+#undef SPH_BLEN
+#undef SPH_WLEN
+#if defined BE64 || defined LE64
+#define SPH_BLEN    128U
+#define SPH_WLEN      8U
+#else
+#define SPH_BLEN     64U
+#define SPH_WLEN      4U
+#endif
+
+#ifdef BLEN
+#undef SPH_BLEN
+#define SPH_BLEN    BLEN
+#endif
+
+#undef SPH_MAXPAD
+#if defined PLW1
+#define SPH_MAXPAD   (SPH_BLEN - SPH_WLEN)
+#elif defined PLW4
+#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 2))
+#else
+#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 1))
+#endif
+
+#undef SPH_VAL
+#undef SPH_NO_OUTPUT
+#ifdef SVAL
+#define SPH_VAL         SVAL
+#define SPH_NO_OUTPUT   1
+#else
+#define SPH_VAL   sc->val
+#endif
+
+#ifndef CLOSE_ONLY
+
+#ifdef SPH_UPTR
+static void
+SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
+#else
+void
+HASH ( void *cc, const void *data, size_t len )
+#endif
+{
+   // Buffer len bytes of input, calling RFUN whenever a block completes.
+   // Every 8 bytes of len moves one __m256i (hence the >>3 conversions).
+   // Keep the name "sc": SPH_VAL expands to sc->val unless SVAL is set.
+   SPH_XCAT( HASH, _context ) *sc = cc;
+   __m256i *src = (__m256i*)data;
+   size_t fill = (unsigned)sc->count & (SPH_BLEN - 1U);
+
+   while ( len > 0 )
+   {
+      size_t chunk = SPH_BLEN - fill;   // room left in the block buffer
+      if ( chunk > len )
+         chunk = len;
+      memcpy_256( sc->buf + (fill>>3), src, chunk>>3 );
+      src += chunk>>3;
+      fill += chunk;
+      len -= chunk;
+      if ( fill == SPH_BLEN )
+      {
+         RFUN( sc->buf, SPH_VAL );      // buffer full: compress it
+         fill = 0;
+      }
+      sc->count += chunk;
+   }
+}
+
+#ifdef SPH_UPTR
+// Public update when SPH_UPTR is set: tops up a partial block, then
+// hands the rest to the _short routine.
+void
+HASH (void *cc, const void *data, size_t len)
+{
+   SPH_XCAT(HASH, _context) *sc = cc;
+   __m256i *vdata = (__m256i*)data;
+   unsigned ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
+
+   if ( len < (2 * SPH_BLEN) )
+   {
+      SPH_XCAT(HASH, _short)(cc, data, len);
+      return;
+   }
+   if ( ptr > 0 )
+   {
+      unsigned t = SPH_BLEN - ptr;
+      SPH_XCAT( HASH, _short )( cc, data, t );
+      vdata = vdata + (t>>3);    // skip the vectors just consumed
+      len -= t;
+   }
+   // Resume at vdata, not data: the head may already have been absorbed.
+   SPH_XCAT( HASH, _short )( cc, vdata, len );
+}
+#endif
+
+#endif
+
+/*
+ * Pad (marker, zero fill, bit length from sc->count), run RFUN and, unless
+ * SPH_NO_OUTPUT, store rnum vectors in dst. Context is NOT reinitialized.
+ */
+static void
+SPH_XCAT( HASH, _addbits_and_close )(void *cc, 	unsigned ub, unsigned n,
+          void *dst, unsigned rnum )   // ub and n are never read here
+{
+    SPH_XCAT(HASH, _context) *sc;
+    unsigned ptr, u;
+    sc = cc;
+    ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
+    // Padding marker goes in the vector right after the buffered data.
+#ifdef PW01
+    sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 );
+#else
+    sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 );
+#endif
+    ptr += 8;
+    // If the length field no longer fits, flush and pad a fresh block.
+    if ( ptr > SPH_MAXPAD )
+    {
+         memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
+         RFUN( sc->buf, SPH_VAL );
+         memset_zero_256( sc->buf, SPH_MAXPAD >> 3 );
+    }
+    else
+    {
+         memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
+    }
+#if defined BE64
+#if defined PLW1
+    sc->buf[ SPH_MAXPAD>>3 ] =
+                 mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+#elif defined PLW4
+    memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
+    sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
+                mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+    sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
+                mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+#else  // length = last two words of the block, high word first
+    sc->buf[ SPH_MAXPAD >> 3 ] =
+               mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+               mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+#endif  // PLW
+#else  // LE64
+#if defined PLW1
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
+#elif defined PLW4
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+                       _mm256_set1_epi64x( sc->count >> 61 );
+    memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
+                       ( 2 * SPH_WLEN ) >> 3 );
+#else
+    sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+                          _mm256_set1_epi64x( sc->count >> 61 );
+#endif // PLW
+
+#endif // LE64
+
+    RFUN( sc->buf, SPH_VAL );
+
+#ifdef SPH_NO_OUTPUT
+    (void)dst;
+    (void)rnum;
+    (void)u;
+#else
+    for ( u = 0; u < rnum; u ++ )
+    {
+#if defined BE64
+       ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] );
+#else  // LE64
+       ((__m256i*)dst)[u] = sc->val[u];
+#endif
+    }
+#endif
+}
+
+static void   // close without extra bits: forwards ub = 0 and n = 0
+SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
+{
+   SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
+}
--- a/algo/sha/sha2-big-4way.c
+++ b/algo/sha/sha2-big-4way.c
@@ -0,0 +1,247 @@
+/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * SHA-384 / SHA-512 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_sha2.h"
+
+#if SPH_64
+/* CH: bitwise "X ? Y : Z".  MAJ: bitwise majority of X, Y and Z. */
+#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
+#define MAJ(X, Y, Z)   (((X) & (Y)) | (((X) | (Y)) & (Z)))
+
+#define ROTR64    SPH_ROTR64
+/* BSG5_x feed SHA3_STEP; SSG5_x feed the W[] expansion in SHA3_ROUND_BODY. */
+#define BSG5_0(x)      (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
+#define BSG5_1(x)      (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
+#define SSG5_0(x)      (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7))
+#define SSG5_1(x)      (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6))
+
+static const sph_u64 K512[80] = {   /* per-step constants: SHA3_STEP adds K512[i] */
+	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
+	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
+	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
+	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
+	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
+	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
+	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
+	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
+	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
+	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
+	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
+	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
+	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
+	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
+	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
+	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
+	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
+	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
+	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
+	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
+	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
+	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
+	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
+	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
+	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
+	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
+	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
+	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
+	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
+	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
+	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
+	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
+	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
+	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
+	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
+	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
+	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
+	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
+	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
+	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
+};
+
+static const sph_u64 H384[8] = {   /* initial state copied by sph_sha384_init */
+	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
+	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
+	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
+	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
+};
+
+static const sph_u64 H512[8] = {   /* initial state copied by sph_sha512_init */
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+/*
+ * This macro defines the body for a SHA-384 / SHA-512 compression function
+ * implementation. The "in" parameter should evaluate, when applied to a
+ * numerical input parameter from 0 to 15, to an expression which yields
+ * the corresponding input block. The "r" parameter should evaluate to
+ * an array or pointer expression designating the array of 8 words which
+ * contains the input and output of the compression function.
+ *
+ * SHA-512 is hard for the compiler. If the loop is completely unrolled,
+ * then the code will be quite huge (possibly more than 100 kB), and the
+ * performance will be degraded due to cache misses on the code. We
+ * unroll only eight steps, which avoids all needless copies when
+ * 64-bit registers are swapped.
+ */
+/* One step: uses K512[i] and the caller's W[i]; only D and H are written. */
+#define SHA3_STEP(A, B, C, D, E, F, G, H, i)   do { \
+		sph_u64 T1, T2; \
+		T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \
+		T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \
+		D = SPH_T64(D + T1); \
+		H = SPH_T64(T1 + T2); \
+	} while (0)
+/* Body: load 16 words via in(), expand to W[80], run 80 steps, add into r. */
+#define SHA3_ROUND_BODY(in, r)   do { \
+		int i; \
+		sph_u64 A, B, C, D, E, F, G, H; \
+		sph_u64 W[80]; /* expanded message words, read by SHA3_STEP */ \
+ \
+ 		for (i = 0; i < 16; i ++) \
+			W[i] = in(i); \
+		for (i = 16; i < 80; i ++) \
+ 			W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \
+				+ SSG5_0(W[i - 15]) + W[i - 16]); \
+		A = (r)[0]; \
+		B = (r)[1]; \
+		C = (r)[2]; \
+		D = (r)[3]; \
+		E = (r)[4]; \
+		F = (r)[5]; \
+		G = (r)[6]; \
+		H = (r)[7]; \
+		for (i = 0; i < 80; i += 8) { /* 8 steps per pass; argument order rotates */ \
+			SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \
+			SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \
+			SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \
+			SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \
+			SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \
+			SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \
+			SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \
+			SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \
+		} \
+		(r)[0] = SPH_T64((r)[0] + A); \
+		(r)[1] = SPH_T64((r)[1] + B); \
+		(r)[2] = SPH_T64((r)[2] + C); \
+		(r)[3] = SPH_T64((r)[3] + D); \
+		(r)[4] = SPH_T64((r)[4] + E); \
+		(r)[5] = SPH_T64((r)[5] + F); \
+		(r)[6] = SPH_T64((r)[6] + G); \
+		(r)[7] = SPH_T64((r)[7] + H); \
+	} while (0)
+
+/* Compress one block into r: 16 words are read from data (8 bytes apart)
+ * with sph_dec64be_aligned, so data must be aligned for 64-bit access.
+ */
+static void
+sha3_round(const unsigned char *data, sph_u64 r[8])
+{
+#define SHA3_IN(x)   sph_dec64be_aligned(data + (8 * (x)))
+	SHA3_ROUND_BODY(SHA3_IN, r);
+#undef SHA3_IN
+}
+
+/* Start a SHA-384 computation: zero the count, load H384 into the state. */
+void
+sph_sha384_init(void *cc)
+{
+	sph_sha384_context *sc = cc;
+
+	/* The two writes are independent of each other. */
+	sc->count = 0;
+	memcpy(sc->val, H384, sizeof H384);
+}
+
+/* Start a SHA-512 computation: zero the count, load H512 into the state. */
+void
+sph_sha512_init(void *cc)
+{
+	sph_sha512_context *sc = cc;
+
+	/* The two writes are independent of each other. */
+	sc->count = 0;
+	memcpy(sc->val, H512, sizeof H512);
+}
+
+#define RFUN   sha3_round
+#define HASH   sha384
+#define BE64   1
+#include "md_helper.c"
+
+/* Finish SHA-384: requests 6 output words (see sph_sha2.h, included above). */
+void
+sph_sha384_close(void *cc, void *dst)
+{
+	sha384_close(cc, dst, 6);
+//	NOTE: the sph_sha384_init(cc) re-initialization is disabled here.
+}
+
+/* Finish SHA-384 passing ub/n through; 6 output words (see sph_sha2.h). */
+void
+sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sha384_addbits_and_close(cc, ub, n, dst, 6);
+//	NOTE: the sph_sha384_init(cc) re-initialization is disabled here.
+}
+
+/* Finish SHA-512: reuses sha384_close but requests 8 words (see sph_sha2.h). */
+void
+sph_sha512_close(void *cc, void *dst)
+{
+	sha384_close(cc, dst, 8);
+//	NOTE: the sph_sha512_init(cc) re-initialization is disabled here.
+}
+
+/* Finish SHA-512 passing ub/n through; reuses the sha384 helper, 8 words. */
+void
+sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sha384_addbits_and_close(cc, ub, n, dst, 8);
+//	NOTE: the sph_sha512_init(cc) re-initialization is disabled here.
+}
+
+/* Raw compression: msg[0..15] are used as-is (no decoding); val is updated. */
+void
+sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8])
+{
+#define SHA3_IN(x)   msg[x]
+	SHA3_ROUND_BODY(SHA3_IN, val);
+#undef SHA3_IN
+}
+
+#endif
--- a/algo/sha/sha2-hash-4way.c
+++ b/algo/sha/sha2-hash-4way.c
@@ -0,0 +1,236 @@
+/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * 4-way parallel SHA-512 implementation (AVX2).
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sha2-hash-4way.h"
+
+#if defined(__AVX2__)
+
+static const sph_u64 K512[80] = {   /* per-step constants, broadcast by SHA3_4WAY_STEP */
+	SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD),
+	SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC),
+	SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019),
+	SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118),
+	SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE),
+	SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2),
+	SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1),
+	SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694),
+	SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3),
+	SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65),
+	SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483),
+	SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5),
+	SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210),
+	SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4),
+	SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725),
+	SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70),
+	SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926),
+	SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF),
+	SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8),
+	SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B),
+	SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001),
+	SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30),
+	SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910),
+	SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8),
+	SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53),
+	SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8),
+	SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB),
+	SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3),
+	SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60),
+	SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC),
+	SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9),
+	SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B),
+	SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207),
+	SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178),
+	SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6),
+	SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B),
+	SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493),
+	SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C),
+	SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A),
+	SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817)
+};
+
+static const sph_u64 H512[8] = {   /* initial state, broadcast by sha512_4way_init */
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+// Vector forms: CH is bitwise "X ? Y : Z", MAJ is the bitwise majority.
+#define CH(X, Y, Z) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) 
+
+#define MAJ(X, Y, Z) \
+   _mm256_or_si256( _mm256_and_si256( X, Y ), \
+                    _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) )
+// BSG5_x: XOR of three 64-bit rotations, used by SHA3_4WAY_STEP.
+#define BSG5_0(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+        mm256_rotr_64(x, 28), mm256_rotr_64(x, 34) ), mm256_rotr_64(x, 39) )
+
+#define BSG5_1(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+        mm256_rotr_64(x, 14), mm256_rotr_64(x, 18) ), mm256_rotr_64(x, 41) )
+// SSG5_x: two rotations XOR a logical right shift, used to expand W[].
+#define SSG5_0(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+        mm256_rotr_64(x, 1), mm256_rotr_64(x, 8) ), _mm256_srli_epi64(x, 7) ) 
+
+#define SSG5_1(x) \
+   _mm256_xor_si256( _mm256_xor_si256( \
+        mm256_rotr_64(x, 19), mm256_rotr_64(x, 61) ), _mm256_srli_epi64(x, 6) )
+// One step: broadcasts K512[i], reads the caller's W[i]; writes D and H only.
+#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \
+do { \
+  __m256i T1, T2; \
+  T1 = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( \
+       _mm256_add_epi64( H, BSG5_1(E) ), CH(E, F, G) ), \
+                         _mm256_set1_epi64x( K512[i] ) ), W[i] ); \
+  T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \
+  D  = _mm256_add_epi64( D, T1 ); \
+  H  = _mm256_add_epi64( T1, T2 ); \
+} while (0)
+// Compress one block: in[0..15] are byte-swapped into W; r is updated in place.
+static void
+sha512_4way_round( __m256i *in, __m256i r[8] )
+{
+   int i;
+   __m256i A, B, C, D, E, F, G, H;
+   __m256i W[80];   // must be named W: SHA3_4WAY_STEP reads W[i] directly
+
+   for ( i = 0; i < 16; i++ )
+      W[i] = mm256_bswap_64( in[i] );
+   for ( i = 16; i < 80; i++ )
+      W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64(
+           SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] );
+
+   A = r[0];
+   B = r[1];
+   C = r[2];
+   D = r[3];
+   E = r[4];
+   F = r[5];
+   G = r[6];
+   H = r[7];
+   // Eight steps per pass; the argument order rotates the working words.
+   for ( i = 0; i < 80; i += 8 )
+   {
+      SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 );
+      SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 );
+      SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 );
+      SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 );
+      SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 );
+      SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 );
+      SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 );
+      SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 );
+   }
+   // Add the working words back into the chaining state.
+   r[0] = _mm256_add_epi64( r[0], A );
+   r[1] = _mm256_add_epi64( r[1], B );
+   r[2] = _mm256_add_epi64( r[2], C );
+   r[3] = _mm256_add_epi64( r[3], D );
+   r[4] = _mm256_add_epi64( r[4], E );
+   r[5] = _mm256_add_epi64( r[5], F );
+   r[6] = _mm256_add_epi64( r[6], G );
+   r[7] = _mm256_add_epi64( r[7], H );
+}
+
+void sha512_4way_init( sha512_4way_context *sc )
+{
+   // Broadcast each word of the initial value H512 into its state
+   // vector (same value in every 64-bit element), then clear the
+   // running count.
+   int w;
+
+   for ( w = 0; w < 8; w++ )
+      sc->val[w] = _mm256_set1_epi64x( H512[w] );
+
+   sc->count = 0;
+}
+
+void sha512_4way( sha512_4way_context *sc, const void *data, size_t len )
+{
+   // Buffer len bytes of input (8 bytes of len == one __m256i) and
+   // compress each 128-byte block as soon as the buffer is full.
+   const size_t block = 128;
+   __m256i *src = (__m256i*)data;
+   size_t fill = (unsigned)sc->count & (block - 1U);
+
+   while ( len > 0 )
+   {
+      size_t chunk = block - fill;
+      if ( chunk > len )
+         chunk = len;
+      memcpy_256( sc->buf + (fill>>3), src, chunk>>3 );
+      src += chunk>>3;
+      fill += chunk;
+      len -= chunk;
+      sc->count += chunk;
+      if ( fill == block )
+      {
+         sha512_4way_round( sc->buf, sc->val );
+         fill = 0;
+      }
+   }
+}
+
+void sha512_4way_close( sha512_4way_context *sc, void *dst )
+{
+    // Append the 0x80 marker, zero fill and the bit length (two byte-swapped
+    // 64-bit words), compress, then store the 8 byte-swapped state vectors.
+    const unsigned block = 128;           // bytes buffered per block
+    const unsigned len_at = block - 16;   // offset of the length field
+    unsigned fill = (unsigned)sc->count & (block - 1U);
+    unsigned w;
+
+    sc->buf[ fill>>3 ] = _mm256_set1_epi64x( 0x80 );
+    fill += 8;
+    if ( fill <= len_at )
+         memset_zero_256( sc->buf + (fill>>3), (len_at - fill) >> 3 );
+    else
+    {
+         memset_zero_256( sc->buf + (fill>>3), (block - fill) >> 3 );
+         sha512_4way_round( sc->buf, sc->val );
+         memset_zero_256( sc->buf, len_at >> 3 );
+    }
+
+    sc->buf[ len_at >> 3 ] =
+                 mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) );
+    sc->buf[ ( len_at+8 ) >> 3 ] =
+                 mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) );
+    sha512_4way_round( sc->buf, sc->val );
+    for ( w = 0; w < 8; w++ )
+       ((__m256i*)dst)[w] = mm256_bswap_64( sc->val[w] );
+}
+
+#endif
--- a/algo/sha/sha2-hash-4way.h
+++ b/algo/sha/sha2-hash-4way.h
@@ -0,0 +1,104 @@
+/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * 4-way SHA-512 interface (the SHA-224/SHA-256 declarations are disabled).
+ *
+ * SHA-256 has been published in FIPS 180-2, now amended with a change
+ * notice to include SHA-224 as well (which is a simple variation on
+ * SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS
+ * standards can be found at:
+ *    http://csrc.nist.gov/publications/fips/
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sha2-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SHA2_HASH_4WAY_H__
+#define SHA2_HASH_4WAY_H__ 1
+
+#include <stddef.h>
+#include "sph_types.h"
+#include "avxdefs.h"
+
+#if 0
+
+#define SPH_SIZE_sha224   224
+
+#define SPH_SIZE_sha256   256
+
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	sph_u32 val[8];
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_sha224_context;
+
+typedef sph_sha224_context sph_sha256_context;
+
+void sph_sha224_init(void *cc);
+
+void sph_sha224(void *cc, const void *data, size_t len);
+
+void sph_sha224_close(void *cc, void *dst);
+
+void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]);
+
+void sph_sha256_init(void *cc);
+
+void sph_sha256(void *cc, const void *data, size_t len);
+
+void sph_sha256_close(void *cc, void *dst);
+
+void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
+
+void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]);
+
+#endif
+
+#if defined (__AVX2__)
+
+#define SPH_SIZE_sha512   512
+
+typedef struct {
+   __m256i buf[128>>3];   // pending input block: 16 vectors
+   __m256i val[8];        // chaining state, set from H512 by init
+   uint64_t count;        // running total of the len values absorbed
+} sha512_4way_context;
+// init loads H512 and clears count; sha512_4way absorbs; close pads, emits.
+void sha512_4way_init( sha512_4way_context *sc);
+void sha512_4way( sha512_4way_context *sc, const void *data, size_t len );
+void sha512_4way_close( sha512_4way_context *sc, void *dst );
+
+#endif
+#endif
--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -36,15 +36,15 @@ void sha256t_hash(void* output, const void* input,  uint32_t len)
        memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );

        SHA256_Update( &ctx_sha256, input + midlen, tail );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );

        memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
        SHA256_Update( &ctx_sha256, hashA, 32 );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );

        memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
        SHA256_Update( &ctx_sha256, hashA, 32 );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
 #else
        sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
        memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -0,0 +1,618 @@
+/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
+/*
+ * Shabal implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#ifdef __AVX2__
+
+#include "shabal-hash-4way.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * Part of this code was automatically generated (the part between
+ * the "BEGIN" and "END" markers).
+ */
+
+#define sM    16
+
+#define C32   SPH_C32
+#define T32   SPH_T32
+
+#define O1   13
+#define O2    9
+#define O3    6
+
+/*
+ * We copy the state into local variables, so that the compiler knows
+ * that it can optimize them at will.
+ */
+
+/* BEGIN -- automatically generated code. */
+
+#define DECL_STATE   /* declare the local working copies used by the macros below */ \
+	__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
+	        A08, A09, A0A, A0B; /* mirror A[0..11] of the context */ \
+	__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
+	        B8, B9, BA, BB, BC, BD, BE, BF; /* mirror B[0..15] */ \
+	__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
+	        C8, C9, CA, CB, CC, CD, CE, CF; /* mirror C[0..15] */ \
+	__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
+	        M8, M9, MA, MB, MC, MD, ME, MF; /* the 16 message vectors filled by DECODE_BLOCK */ \
+	sph_u32 Wlow, Whigh; /* scalar halves of the W counter */
+
+#define READ_STATE(state)   do { /* load A[12], B[16], C[16], Wlow and Whigh from *(state) into the DECL_STATE locals */ \
+		A00 = (state)->A[0]; \
+		A01 = (state)->A[1]; \
+		A02 = (state)->A[2]; \
+		A03 = (state)->A[3]; \
+		A04 = (state)->A[4]; \
+		A05 = (state)->A[5]; \
+		A06 = (state)->A[6]; \
+		A07 = (state)->A[7]; \
+		A08 = (state)->A[8]; \
+		A09 = (state)->A[9]; \
+		A0A = (state)->A[10]; \
+		A0B = (state)->A[11]; \
+		B0 = (state)->B[0]; \
+		B1 = (state)->B[1]; \
+		B2 = (state)->B[2]; \
+		B3 = (state)->B[3]; \
+		B4 = (state)->B[4]; \
+		B5 = (state)->B[5]; \
+		B6 = (state)->B[6]; \
+		B7 = (state)->B[7]; \
+		B8 = (state)->B[8]; \
+		B9 = (state)->B[9]; \
+		BA = (state)->B[10]; \
+		BB = (state)->B[11]; \
+		BC = (state)->B[12]; \
+		BD = (state)->B[13]; \
+		BE = (state)->B[14]; \
+		BF = (state)->B[15]; \
+		C0 = (state)->C[0]; \
+		C1 = (state)->C[1]; \
+		C2 = (state)->C[2]; \
+		C3 = (state)->C[3]; \
+		C4 = (state)->C[4]; \
+		C5 = (state)->C[5]; \
+		C6 = (state)->C[6]; \
+		C7 = (state)->C[7]; \
+		C8 = (state)->C[8]; \
+		C9 = (state)->C[9]; \
+		CA = (state)->C[10]; \
+		CB = (state)->C[11]; \
+		CC = (state)->C[12]; \
+		CD = (state)->C[13]; \
+		CE = (state)->C[14]; \
+		CF = (state)->C[15]; \
+		Wlow = (state)->Wlow; \
+		Whigh = (state)->Whigh; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { /* store the DECL_STATE locals A, B, C, Wlow and Whigh back into *(state); M is not stored */ \
+		(state)->A[0] = A00; \
+		(state)->A[1] = A01; \
+		(state)->A[2] = A02; \
+		(state)->A[3] = A03; \
+		(state)->A[4] = A04; \
+		(state)->A[5] = A05; \
+		(state)->A[6] = A06; \
+		(state)->A[7] = A07; \
+		(state)->A[8] = A08; \
+		(state)->A[9] = A09; \
+		(state)->A[10] = A0A; \
+		(state)->A[11] = A0B; \
+		(state)->B[0] = B0; \
+		(state)->B[1] = B1; \
+		(state)->B[2] = B2; \
+		(state)->B[3] = B3; \
+		(state)->B[4] = B4; \
+		(state)->B[5] = B5; \
+		(state)->B[6] = B6; \
+		(state)->B[7] = B7; \
+		(state)->B[8] = B8; \
+		(state)->B[9] = B9; \
+		(state)->B[10] = BA; \
+		(state)->B[11] = BB; \
+		(state)->B[12] = BC; \
+		(state)->B[13] = BD; \
+		(state)->B[14] = BE; \
+		(state)->B[15] = BF; \
+		(state)->C[0] = C0; \
+		(state)->C[1] = C1; \
+		(state)->C[2] = C2; \
+		(state)->C[3] = C3; \
+		(state)->C[4] = C4; \
+		(state)->C[5] = C5; \
+		(state)->C[6] = C6; \
+		(state)->C[7] = C7; \
+		(state)->C[8] = C8; \
+		(state)->C[9] = C9; \
+		(state)->C[10] = CA; \
+		(state)->C[11] = CB; \
+		(state)->C[12] = CC; \
+		(state)->C[13] = CD; \
+		(state)->C[14] = CE; \
+		(state)->C[15] = CF; \
+		(state)->Wlow = Wlow; \
+		(state)->Whigh = Whigh; \
+	} while (0)
+
+#define DECODE_BLOCK /* copy the 16 buffered vectors into M0..MF; expects a local pointer named buf */ \
+do { \
+   M0 = buf[ 0]; \
+   M1 = buf[ 1]; \
+   M2 = buf[ 2]; \
+   M3 = buf[ 3]; \
+   M4 = buf[ 4]; \
+   M5 = buf[ 5]; \
+   M6 = buf[ 6]; \
+   M7 = buf[ 7]; \
+   M8 = buf[ 8]; \
+   M9 = buf[ 9]; \
+   MA = buf[10]; \
+   MB = buf[11]; \
+   MC = buf[12]; \
+   MD = buf[13]; \
+   ME = buf[14]; \
+   MF = buf[15]; \
+} while (0)
+
+#define INPUT_BLOCK_ADD /* B[i] += M[i] for all 16 words, as independent 32-bit additions per element */ \
+do { \
+    B0 = _mm_add_epi32( B0, M0 );\
+    B1 = _mm_add_epi32( B1, M1 );\
+    B2 = _mm_add_epi32( B2, M2 );\
+    B3 = _mm_add_epi32( B3, M3 );\
+    B4 = _mm_add_epi32( B4, M4 );\
+    B5 = _mm_add_epi32( B5, M5 );\
+    B6 = _mm_add_epi32( B6, M6 );\
+    B7 = _mm_add_epi32( B7, M7 );\
+    B8 = _mm_add_epi32( B8, M8 );\
+    B9 = _mm_add_epi32( B9, M9 );\
+    BA = _mm_add_epi32( BA, MA );\
+    BB = _mm_add_epi32( BB, MB );\
+    BC = _mm_add_epi32( BC, MC );\
+    BD = _mm_add_epi32( BD, MD );\
+    BE = _mm_add_epi32( BE, ME );\
+    BF = _mm_add_epi32( BF, MF );\
+} while (0)
+
+#define INPUT_BLOCK_SUB /* C[i] -= M[i] for all 16 words, as independent 32-bit subtractions per element */ \
+do { \
+    C0 = _mm_sub_epi32( C0, M0 ); \
+    C1 = _mm_sub_epi32( C1, M1 ); \
+    C2 = _mm_sub_epi32( C2, M2 ); \
+    C3 = _mm_sub_epi32( C3, M3 ); \
+    C4 = _mm_sub_epi32( C4, M4 ); \
+    C5 = _mm_sub_epi32( C5, M5 ); \
+    C6 = _mm_sub_epi32( C6, M6 ); \
+    C7 = _mm_sub_epi32( C7, M7 ); \
+    C8 = _mm_sub_epi32( C8, M8 ); \
+    C9 = _mm_sub_epi32( C9, M9 ); \
+    CA = _mm_sub_epi32( CA, MA ); \
+    CB = _mm_sub_epi32( CB, MB ); \
+    CC = _mm_sub_epi32( CC, MC ); \
+    CD = _mm_sub_epi32( CD, MD ); \
+    CE = _mm_sub_epi32( CE, ME ); \
+    CF = _mm_sub_epi32( CF, MF ); \
+} while (0)
+
+#define XOR_W /* mix the scalar counter into the state; the same value goes to every 32-bit element */ \
+do { \
+   A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); /* low counter half into A00 */ \
+   A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); /* high counter half into A01 */ \
+} while (0)
+/*
+#define SWAP(v1, v2)   do { \
+		sph_u32 tmp = (v1); \
+		(v1) = (v2); \
+		(v2) = tmp; \
+	} while (0)
+*/
+#define SWAP_BC /* pair every B word with its C word through mm_swap_128 (defined outside this file; presumably exchanges the two) */ \
+do { \
+    mm_swap_128( B0, C0 ); \
+    mm_swap_128( B1, C1 ); \
+    mm_swap_128( B2, C2 ); \
+    mm_swap_128( B3, C3 ); \
+    mm_swap_128( B4, C4 ); \
+    mm_swap_128( B5, C5 ); \
+    mm_swap_128( B6, C6 ); \
+    mm_swap_128( B7, C7 ); \
+    mm_swap_128( B8, C8 ); \
+    mm_swap_128( B9, C9 ); \
+    mm_swap_128( BA, CA ); \
+    mm_swap_128( BB, CB ); \
+    mm_swap_128( BC, CC ); \
+    mm_swap_128( BD, CD ); \
+    mm_swap_128( BE, CE ); \
+    mm_swap_128( BF, CF ); \
+} while (0)
+
+#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
+do { /* xa0 = xm ^ xb1 ^ (xb2 & ~xb3) ^ ((xa0 ^ xc ^ (mm_rotl_32(xa1,15) * 5)) * 3), 32-bit per element */ \
+   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
+            _mm_andnot_si128( xb3, xb2 ), \
+            _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
+               _mm_mullo_epi32(  mm_rotl_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
+                   ) ), _mm_set1_epi32(3UL) ) ) ) ); \
+   xb0 = mm_not( _mm_xor_si128( xa0, mm_rotl_32( xb0, 1 ) ) ); /* uses the new xa0; mm_not / mm_rotl_32 come from outside this file (presumably bitwise NOT / rotate left) */ \
+} while (0)
+
+#define PERM_STEP_0   do { /* first pass: 16 PERM_ELT, one per B/M word; the A index cycles A00..A0B then A00..A03 */ \
+		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define PERM_STEP_1   do { /* second pass: same B/C/M pattern as PERM_STEP_0, A index runs A04..A0B then A00..A07 */ \
+		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define PERM_STEP_2   do { /* third pass: same B/C/M pattern again, A index runs A08..A0B then A00..A0B */ \
+		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define APPLY_P /* the permutation: rotate every B word, run the three PERM_STEP passes, then fold C into A */ \
+do { \
+    B0 = mm_rotr_32( B0, 15 ); /* mm_rotr_32 is defined outside this file; presumably a per-element 32-bit rotate right */ \
+    B1 = mm_rotr_32( B1, 15 ); \
+    B2 = mm_rotr_32( B2, 15 ); \
+    B3 = mm_rotr_32( B3, 15 ); \
+    B4 = mm_rotr_32( B4, 15 ); \
+    B5 = mm_rotr_32( B5, 15 ); \
+    B6 = mm_rotr_32( B6, 15 ); \
+    B7 = mm_rotr_32( B7, 15 ); \
+    B8 = mm_rotr_32( B8, 15 ); \
+    B9 = mm_rotr_32( B9, 15 ); \
+    BA = mm_rotr_32( BA, 15 ); \
+    BB = mm_rotr_32( BB, 15 ); \
+    BC = mm_rotr_32( BC, 15 ); \
+    BD = mm_rotr_32( BD, 15 ); \
+    BE = mm_rotr_32( BE, 15 ); \
+    BF = mm_rotr_32( BF, 15 ); \
+    PERM_STEP_0; /* 3 passes x 16 PERM_ELT */ \
+    PERM_STEP_1; \
+    PERM_STEP_2; \
+    A0B = _mm_add_epi32( A0B, C6 ); /* 36 additions follow: every A word accumulates three C words */ \
+    A0A = _mm_add_epi32( A0A, C5 ); \
+    A09 = _mm_add_epi32( A09, C4 ); \
+    A08 = _mm_add_epi32( A08, C3 ); \
+    A07 = _mm_add_epi32( A07, C2 ); \
+    A06 = _mm_add_epi32( A06, C1 ); \
+    A05 = _mm_add_epi32( A05, C0 ); \
+    A04 = _mm_add_epi32( A04, CF ); \
+    A03 = _mm_add_epi32( A03, CE ); \
+    A02 = _mm_add_epi32( A02, CD ); \
+    A01 = _mm_add_epi32( A01, CC ); \
+    A00 = _mm_add_epi32( A00, CB ); \
+    A0B = _mm_add_epi32( A0B, CA ); \
+    A0A = _mm_add_epi32( A0A, C9 ); \
+    A09 = _mm_add_epi32( A09, C8 ); \
+    A08 = _mm_add_epi32( A08, C7 ); \
+    A07 = _mm_add_epi32( A07, C6 ); \
+    A06 = _mm_add_epi32( A06, C5 ); \
+    A05 = _mm_add_epi32( A05, C4 ); \
+    A04 = _mm_add_epi32( A04, C3 ); \
+    A03 = _mm_add_epi32( A03, C2 ); \
+    A02 = _mm_add_epi32( A02, C1 ); \
+    A01 = _mm_add_epi32( A01, C0 ); \
+    A00 = _mm_add_epi32( A00, CF ); \
+    A0B = _mm_add_epi32( A0B, CE ); \
+    A0A = _mm_add_epi32( A0A, CD ); \
+    A09 = _mm_add_epi32( A09, CC ); \
+    A08 = _mm_add_epi32( A08, CB ); \
+    A07 = _mm_add_epi32( A07, CA ); \
+    A06 = _mm_add_epi32( A06, C9 ); \
+    A05 = _mm_add_epi32( A05, C8 ); \
+    A04 = _mm_add_epi32( A04, C7 ); \
+    A03 = _mm_add_epi32( A03, C6 ); \
+    A02 = _mm_add_epi32( A02, C5 ); \
+    A01 = _mm_add_epi32( A01, C4 ); \
+    A00 = _mm_add_epi32( A00, C3 ); \
+} while (0)
+
+#define INCR_W   do { /* add 1 to the two-word counter Whigh:Wlow */ \
+		if ((Wlow = T32(Wlow + 1)) == 0) /* Wlow wrapped to 0: carry into Whigh (T32 is SPH_T32, presumably a 32-bit truncation) */ \
+			Whigh = T32(Whigh + 1); \
+	} while (0)
+
+static const sph_u32 A_init_256[] = {   /* 12 initial A words, loaded by shabal_4way_init when size != 512 */
+	C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
+	C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
+	C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
+};
+
+static const sph_u32 B_init_256[] = {   /* 16 initial B words, loaded by shabal_4way_init when size != 512 */
+	C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
+	C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
+	C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
+	C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
+};
+
+static const sph_u32 C_init_256[] = {   /* 16 initial C words, loaded by shabal_4way_init when size != 512 */
+	C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
+	C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
+	C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
+	C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
+};
+
+static const sph_u32 A_init_512[] = {   /* 12 initial A words, loaded by shabal_4way_init when size == 512 */
+	C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
+	C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
+	C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
+};
+
+static const sph_u32 B_init_512[] = {   /* 16 initial B words, loaded by shabal_4way_init when size == 512 */
+	C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
+	C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
+	C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
+	C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
+};
+
+static const sph_u32 C_init_512[] = {   /* 16 initial C words, loaded by shabal_4way_init when size == 512 */
+	C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
+	C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
+	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
+	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
+};
+
+static void
+shabal_4way_init( void *cc, unsigned size )
+{
+   /*
+    * Prepare a 4-way Shabal context. size == 512 selects the *_init_512
+    * tables; every other value selects the *_init_256 tables. Each 32-bit
+    * initial word is replicated into all four elements of its vector, the
+    * W counter starts at 1 and the input buffer is marked empty.
+    */
+   shabal_4way_context *ctx = (shabal_4way_context*)cc;
+   const int wide = ( size == 512 );
+   const sph_u32 *iv_a = wide ? A_init_512 : A_init_256;
+   const sph_u32 *iv_b = wide ? B_init_512 : B_init_256;
+   const sph_u32 *iv_c = wide ? C_init_512 : C_init_256;
+   int k;
+
+   /* A holds 12 words. */
+   for ( k = 0; k < 12; k++ )
+      ctx->A[k] = _mm_set1_epi32( iv_a[k] );
+
+   /* B and C hold 16 words each. */
+   for ( k = 0; k < 16; k++ )
+   {
+      ctx->B[k] = _mm_set1_epi32( iv_b[k] );
+      ctx->C[k] = _mm_set1_epi32( iv_c[k] );
+   }
+   ctx->Wlow  = 1;
+   ctx->Whigh = 0;
+   ctx->ptr   = 0;
+}
+
+static void   /* absorb len units of vectorized input into the context, compressing every completed block */
+shabal_4way_core( void *cc, const unsigned char *data, size_t len )
+{
+   shabal_4way_context *sc = (shabal_4way_context*)cc;
+    __m128i *buf;
+    __m128i *vdata = (__m128i*)data;   // input is consumed as whole 128-bit vectors
+   const int buf_size = 64;     // block size in ptr/len units; 64>>2 = 16 vectors fill buf
+   size_t ptr;
+   DECL_STATE
+   // ptr and len advance 4 per vector; both are assumed to be multiples of 4 (the >>2 truncates) - TODO confirm with callers
+   buf = sc->buf;
+   ptr = sc->ptr;
+   // Input does not complete a block: buffer it and return without touching A/B/C/W.
+   if ( len < (buf_size - ptr ) )
+   {
+      memcpy_128( buf + (ptr>>2), vdata, len>>2 );   // memcpy_128 is external; presumably copies len>>2 vectors
+      ptr += len;
+      sc->ptr = ptr;
+      return;
+   }
+   READ_STATE(sc);
+   // Fill buf chunk by chunk; each time it reaches buf_size run one compression.
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;      // room left in the current block
+      if ( clen > len )
+         clen = len;
+      memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
+      // step past the clen>>2 vectors just copied
+      ptr += clen;
+      vdata += clen>>2;
+      len -= clen;
+      if ( ptr == buf_size )
+      {
+         DECODE_BLOCK;
+         INPUT_BLOCK_ADD;
+         XOR_W;
+         APPLY_P;
+         INPUT_BLOCK_SUB;
+         SWAP_BC;
+         INCR_W;                  // count the block just processed
+         ptr = 0;
+      }
+   }
+   WRITE_STATE(sc);
+   sc->ptr = ptr;
+}
+
+static void   /* pad and process the last block, run three extra permutations, write the digest vectors to dst */
+shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
+                   unsigned size_words )
+{
+   shabal_4way_context *sc = (shabal_4way_context*)cc;
+    __m128i *buf;
+   const int buf_size = 64;
+   size_t ptr;
+   int i;
+   unsigned z, zz;
+   DECL_STATE
+   // ub/n: n extra bits taken from the top of ub; size_words: 16 selects the 16-vector output, anything else 8
+   buf = sc->buf;
+   ptr = sc->ptr;
+   z = 0x80 >> n;                   // padding bit placed right below the n extra bits
+   zz = ((ub & -z) | z) & 0xFF;     // keep the top n bits of ub, set the padding bit, clear the rest
+   buf[ptr>>2] = _mm_set1_epi32( zz );   // padding value goes into every 32-bit element of the next vector
+   memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );   // vector count passed for the rest of buf
+   READ_STATE(sc);
+   DECODE_BLOCK;
+   INPUT_BLOCK_ADD;
+   XOR_W;
+   APPLY_P;
+   // Three more permutations with B and C swapped each time; W is not incremented here.
+   for ( i = 0; i < 3; i ++ )
+   {
+      SWAP_BC;
+      XOR_W;
+      APPLY_P;
+   }
+   // No WRITE_STATE: the result is taken from the local B words and sc keeps its previous A/B/C/W.
+   __m128i *d = (__m128i*)dst;
+   if ( size_words == 16 )   // 512
+   {
+      d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
+      d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
+      d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
+      d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
+   }
+   else    // 256
+   {
+      d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;
+      d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
+   }
+}
+
+void   /* start a 256-bit 4-way hash in the context at cc */
+shabal256_4way_init( void *cc )
+{
+	shabal_4way_init(cc, 256);   /* any size other than 512 loads the *_init_256 tables */
+}
+
+void   /* feed input to a 256-bit context; plain forwarder to the shared absorb routine */
+shabal256_4way( void *cc, const void *data, size_t len )
+{
+	shabal_4way_core( cc, data, len );   /* data is read as __m128i vectors, len>>2 of them */
+}
+
+void   /* finish a 256-bit hash with no extra bits; dst receives 8 __m128i vectors */
+shabal256_4way_close( void *cc, void *dst )
+{
+	shabal_4way_close(cc, 0, 0, dst, 8);   /* ub = 0, n = 0 makes the padding value 0x80 */
+}
+
+void   /* finish a 256-bit hash after appending the top n bits of ub; dst receives 8 __m128i vectors */
+shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                  void *dst )
+{
+	shabal_4way_close(cc, ub, n, dst, 8);   /* 8 selects the B8..BF output branch */
+}
+
+void   /* start a 512-bit 4-way hash in the context at cc */
+shabal512_4way_init(void *cc)
+{
+	shabal_4way_init(cc, 512);   /* 512 loads the *_init_512 tables */
+}
+
+void   /* feed input to a 512-bit context; same absorb routine as the 256-bit variant */
+shabal512_4way(void *cc, const void *data, size_t len)
+{
+	shabal_4way_core(cc, data, len);   /* data is read as __m128i vectors, len>>2 of them */
+}
+
+void   /* finish a 512-bit hash with no extra bits; dst receives 16 __m128i vectors */
+shabal512_4way_close(void *cc, void *dst)
+{
+	shabal_4way_close(cc, 0, 0, dst, 16);   /* ub = 0, n = 0 makes the padding value 0x80 */
+}
+
+void   /* finish a 512-bit hash after appending the top n bits of ub; dst receives 16 __m128i vectors */
+shabal512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	shabal_4way_close(cc, ub, n, dst, 16);   /* 16 selects the B0..BF output branch */
+}
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -0,0 +1,82 @@
+/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
+/**
+ * 4-way parallel Shabal interface. Shabal is a family of functions which
+ * differ by their output size; this 4-way implementation declares Shabal
+ * for output sizes 256 and 512 bits only.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     shabal-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SHABAL_HASH_4WAY_H__
+#define SHABAL_HASH_4WAY_H__ 1
+
+#ifdef __AVX2__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define SPH_SIZE_shabal256   256
+
+#define SPH_SIZE_shabal512   512
+
+typedef struct {   /* context shared by the 256 and 512 bit 4-way variants */
+	__m128i buf[16] __attribute__ ((aligned (64)));   /* pending input block: 16 vectors, 64-byte aligned */
+	__m128i A[12], B[16], C[16];   /* hash state, one vector of four 32-bit elements per word */
+	sph_u32 Whigh, Wlow;   /* two halves of the scalar W counter */
+        size_t ptr;   /* fill level of buf; shabal-hash-4way.c advances it by 4 per vector, up to 64 */
+} shabal_4way_context;
+
+typedef shabal_4way_context shabal256_4way_context;
+typedef shabal_4way_context shabal512_4way_context;
+
+void shabal256_4way_init( void *cc );                            /* cc points to a shabal_4way_context */
+void shabal256_4way( void *cc, const void *data, size_t len );   /* absorb input; data is read as __m128i vectors */
+void shabal256_4way_close( void *cc, void *dst );                /* dst receives 8 __m128i vectors */
+void shabal256_4way_addbits_and_close(	void *cc, unsigned ub, unsigned n,   /* appends the top n bits of ub first */
+                                       void *dst );
+
+void shabal512_4way_init( void *cc );                            /* cc points to a shabal_4way_context */
+void shabal512_4way( void *cc, const void *data, size_t len );   /* absorb input; data is read as __m128i vectors */
+void shabal512_4way_close( void *cc, void *dst );                /* dst receives 16 __m128i vectors */
+void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,   /* appends the top n bits of ub first */
+                                       void *dst );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
+
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -74,6 +74,18 @@ static const sph_u32 IV512[] = {
 	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
 };

+// Return hi 128 bits with elements shifted one lane with vacated lane filled
+// with data rotated from lo.
+// Partially rotate elements in two 128 bit vectors as one 256 bit vector
+// and return the rotated high 128 bits.
+// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not
+// completed. It's faster than a full rotation.
+
+static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n )
+{   return _mm_or_si128( _mm_srli_si128( hi, n<<2 ),
+                        _mm_slli_si128( lo, 16 - (n<<2) ) );
+}
+
 #define AES_ROUND_NOKEY(x0, x1, x2, x3)   do { \
 		sph_u32 t0 = (x0); \
 		sph_u32 t1 = (x1); \
@@ -267,9 +279,6 @@ c512(sph_shavite_big_context *sc, const void *msg)

 #else

-/*
- * This function assumes that "msg" is aligned for 32-bit access.
- */
 static void
 c512( sph_shavite_big_context *sc, const void *msg )
 {
@@ -287,42 +296,42 @@ c512( sph_shavite_big_context *sc, const void *msg )
   // round
   k00 = m[0];
   x = _mm_xor_si128( p1, k00 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
  
   k01 = m[1];
   x = _mm_xor_si128( x, k01 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );

   k02 = m[2];
   x = _mm_xor_si128( x, k02 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );

   k03 = m[3];
   x = _mm_xor_si128( x, k03 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   p0 = _mm_xor_si128( p0, x );

   k10 = m[4];
   x = _mm_xor_si128( p3, k10 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   
   k11 = m[5];
   x = _mm_xor_si128( x, k11 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );

   k12 = m[6];
   x = _mm_xor_si128( x, k12 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );

   k13 = m[7];
   x = _mm_xor_si128( x, k13 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   p2 = _mm_xor_si128( p2, x );

   for ( r = 0; r < 3; r ++ )
   {
      // round 1, 5, 9
-      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 ); 

      if ( r == 0 )
@@ -330,8 +339,8 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); 

      x = _mm_xor_si128( p0, k00 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      if ( r == 1 )
@@ -339,34 +348,34 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count0, sc->count1, sc->count2, sc->count3 ) );

      x = _mm_xor_si128( x, k01 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );

      x = _mm_xor_si128( x, k02 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );

      x = _mm_xor_si128( x, k03 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );
-      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p2, k10 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );

      x = _mm_xor_si128( x, k11 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );

      x = _mm_xor_si128( x, k12 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      if ( r == 2 )
@@ -374,173 +383,173 @@ c512( sph_shavite_big_context *sc, const void *msg )
                  ~sc->count1, sc->count0, sc->count3, sc->count2 ) );

      x = _mm_xor_si128( x, k13 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p1 = _mm_xor_si128( p1, x );

      // round 2, 6, 10

-      k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
+      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
      x = _mm_xor_si128( p3, k00 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
+      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
      x = _mm_xor_si128( x, k01 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
+      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
      x = _mm_xor_si128( x, k02 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
+      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
      x = _mm_xor_si128( x, k03 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

      p2 = _mm_xor_si128( p2, x );
-      k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
+      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
      x = _mm_xor_si128( p1, k10 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
+      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
      x = _mm_xor_si128( x, k11 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
+      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
      x = _mm_xor_si128( x, k12 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
+      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
      x = _mm_xor_si128( x, k13 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p0 = _mm_xor_si128( p0, x );

      // round 3, 7, 11

-      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 );

      x = _mm_xor_si128( p2, k00 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );

-      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      x = _mm_xor_si128( x, k01 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );

      x = _mm_xor_si128( x, k02 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );

      x = _mm_xor_si128( x, k03 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p1 = _mm_xor_si128( p1, x );
-      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p0, k10 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );

      x = _mm_xor_si128( x, k11 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );

      x = _mm_xor_si128( x, k12 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      x = _mm_xor_si128( x, k13 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );

      // round 4, 8, 12

-      k00 = _mm_xor_si128( k00, mm_rotr256_32( k12, k13, 1 ) );
+      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );

      x = _mm_xor_si128( p1, k00 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k01 = _mm_xor_si128( k01, mm_rotr256_32( k13, k00, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );

      x = _mm_xor_si128( x, k01 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k02 = _mm_xor_si128( k02, mm_rotr256_32( k00, k01, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );

      x = _mm_xor_si128( x, k02 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k03 = _mm_xor_si128( k03, mm_rotr256_32( k01, k02, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );

      x = _mm_xor_si128( x, k03 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p0 = _mm_xor_si128( p0, x );
-      k10 = _mm_xor_si128( k10, mm_rotr256_32( k02, k03, 1 ) );
+      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );

      x = _mm_xor_si128( p3, k10 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k11 = _mm_xor_si128( k11, mm_rotr256_32( k03, k10, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );

      x = _mm_xor_si128( x, k11 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k12 = _mm_xor_si128( k12, mm_rotr256_32( k10, k11, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );

      x = _mm_xor_si128( x, k12 );
-      x = _mm_aesenc_si128( x, mm_zero );
-      k13 = _mm_xor_si128( k13, mm_rotr256_32( k11, k12, 1 ) );
+      x = _mm_aesenc_si128( x, m128_zero );
+      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );

      x = _mm_xor_si128( x, k13 );
-      x = _mm_aesenc_si128( x, mm_zero );
+      x = _mm_aesenc_si128( x, m128_zero );
      p2 = _mm_xor_si128( p2, x );
   }

   // round 13

-   k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+   k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) );
   k00 = _mm_xor_si128( k00, k13 );

   x = _mm_xor_si128( p0, k00 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); 
+   x = _mm_aesenc_si128( x, m128_zero );
+   k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); 
   k01 = _mm_xor_si128( k01, k00 );

   x = _mm_xor_si128( x, k01 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) );
   k02 = _mm_xor_si128( k02, k01 );

   x = _mm_xor_si128( x, k02 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) );
   k03 = _mm_xor_si128( k03, k02 );

   x = _mm_xor_si128( x, k03 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   p3 = _mm_xor_si128( p3, x );
-   k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+   k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) );
   k10 = _mm_xor_si128( k10, k03 );

   x = _mm_xor_si128( p2, k10 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) );
   k11 = _mm_xor_si128( k11, k10 );

   x = _mm_xor_si128( x, k11 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) );
   k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
               ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );

   x = _mm_xor_si128( x, k12 );
-   x = _mm_aesenc_si128( x, mm_zero );
-   k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+   x = _mm_aesenc_si128( x, m128_zero );
+   k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) );
   k13 = _mm_xor_si128( k13, k12 );

   x = _mm_xor_si128( x, k13 );
-   x = _mm_aesenc_si128( x, mm_zero );
+   x = _mm_aesenc_si128( x, m128_zero );
   p1 = _mm_xor_si128( p1, x );

   h[0] = _mm_xor_si128( h[0], p2 );
--- a/algo/simd/sse2/nist.c
+++ b/algo/simd/sse2/nist.c
--- a/algo/simd/sse2/nist.h
+++ b/algo/simd/sse2/nist.h
--- a/algo/simd/sse2/simd-compat.h
+++ b/algo/simd/sse2/simd-compat.h
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -0,0 +1,853 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "simd-hash-2way.h"
+
+#if defined (__AVX2__)
+
+// imported from simd_iv.h
+// 32 IV words; simd_2way_init() copies each group of 4 into both 128-bit halves.
+uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc,
+                           0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
+                           0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f,
+                           0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
+                           0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8,
+                           0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
+                           0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4,
+                           0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 };
+
+/* Twiddle tables: 16-bit multipliers applied with _mm256_mullo_epi16 */
+// Row r scales X(6-r) in fft64_2way(); each row lists its 8 values twice.
+static const m256_v16 FFT64_Twiddle[] =
+{
+    {{ 1,    2,    4,    8,   16,   32,   64,  128,
+       1,    2,    4,    8,   16,   32,   64,  128 }},
+    {{ 1,   60,    2,  120,    4,  -17,    8,  -34,
+       1,   60,    2,  120,    4,  -17,    8,  -34 }},
+    {{ 1,  120,    8,  -68,   64,  -30,   -2,   17,
+       1,  120,    8,  -68,   64,  -30,   -2,   17 }},
+    {{ 1,   46,   60,  -67,    2,   92,  120,  123,
+       1,   46,   60,  -67,    2,   92,  120,  123 }},
+    {{ 1,   92,  -17,  -22,   32,  117,  -30,   67,
+       1,   92,  -17,  -22,   32,  117,  -30,   67 }},
+    {{ 1,  -67,  120,  -73,    8,  -22,  -68,  -70,
+       1,  -67,  120,  -73,    8,  -22,  -68,  -70 }},
+    {{ 1,  123,  -34,  -70,  128,   67,   17,   35,
+       1,  123,  -34,  -70,  128,   67,   17,   35 }},
+};
+
+static const m256_v16 FFT128_Twiddle[] =  // 8 rows; read by fft128_2way() and fft128_2way_msg()
+{
+    {{   1, -118,   46,  -31,   60,  116,  -67,  -61,
+         1, -118,   46,  -31,   60,  116,  -67,  -61 }},
+    {{   2,   21,   92,  -62,  120,  -25,  123, -122,
+         2,   21,   92,  -62,  120,  -25,  123, -122 }},
+    {{   4,   42,  -73, -124,  -17,  -50,  -11,   13,
+         4,   42,  -73, -124,  -17,  -50,  -11,   13 }},
+    {{   8,   84,  111,    9,  -34, -100,  -22,   26,
+         8,   84,  111,    9,  -34, -100,  -22,   26 }},
+    {{  16,  -89,  -35,   18,  -68,   57,  -44,   52,
+        16,  -89,  -35,   18,  -68,   57,  -44,   52 }},
+    {{  32,   79,  -70,   36,  121,  114,  -88,  104,
+        32,   79,  -70,   36,  121,  114,  -88,  104 }},
+    {{  64,  -99,  117,   72,  -15,  -29,   81,  -49,
+        64,  -99,  117,   72,  -15,  -29,   81,  -49 }},
+    {{ 128,   59,  -23, -113,  -30,  -58,  -95,  -98,
+       128,   59,  -23, -113,  -30,  -58,  -95,  -98 }},
+};
+
+static const m256_v16 FFT256_Twiddle[] =  // 16 rows; rows 0..15 are read by fft256_2way_msg()
+{
+    {{   1,   41, -118,   45,   46,   87,  -31,   14,
+         1,   41, -118,   45,   46,   87,  -31,   14 }},
+    {{  60, -110,  116, -127,  -67,   80,  -61,   69,
+        60, -110,  116, -127,  -67,   80,  -61,   69 }},
+    {{   2,   82,   21,   90,   92,  -83,  -62,   28,
+         2,   82,   21,   90,   92,  -83,  -62,   28 }},
+    {{ 120,   37,  -25,    3,  123,  -97, -122, -119,
+       120,   37,  -25,    3,  123,  -97, -122, -119 }},
+    {{   4,  -93,   42,  -77,  -73,   91, -124,   56,
+         4,  -93,   42,  -77,  -73,   91, -124,   56 }},
+    {{ -17,   74,  -50,    6,  -11,   63,   13,   19,
+       -17,   74,  -50,    6,  -11,   63,   13,   19 }},
+    {{   8,   71,   84,  103,  111,  -75,    9,  112,
+         8,   71,   84,  103,  111,  -75,    9,  112 }},
+    {{ -34, -109, -100,   12,  -22,  126,   26,   38,
+       -34, -109, -100,   12,  -22,  126,   26,   38 }},
+    {{  16, -115,  -89,  -51,  -35,  107,   18,  -33,
+        16, -115,  -89,  -51,  -35,  107,   18,  -33 }},
+    {{ -68,   39,   57,   24,  -44,   -5,   52,   76,
+       -68,   39,   57,   24,  -44,   -5,   52,   76 }},
+    {{  32,   27,   79, -102,  -70,  -43,   36,  -66,
+        32,   27,   79, -102,  -70,  -43,   36,  -66 }},
+    {{ 121,   78,  114,   48,  -88,  -10,  104, -105,
+       121,   78,  114,   48,  -88,  -10,  104, -105 }},
+    {{  64,   54,  -99,   53,  117,  -86,   72,  125,
+        64,   54,  -99,   53,  117,  -86,   72,  125 }},
+    {{ -15, -101,  -29,   96,   81,  -20,  -49,   47,
+       -15, -101,  -29,   96,   81,  -20,  -49,   47 }},
+    {{ 128,  108,   59,  106,  -23,   85, -113,   -7,
+       128,  108,   59,  106,  -23,   85, -113,   -7 }},
+    {{ -30,   55,  -58,  -65,  -95,  -40,  -98,   94,
+       -30,   55,  -58,  -65,  -95,  -40,  -98,   94 }}
+};
+
+#define SHUFXOR_1 0xb1          /* 0b10110001 */
+#define SHUFXOR_2 0x4e          /* 0b01001110 */
+#define SHUFXOR_3 0x1b          /* 0b00011011 */
+// CAT/XCAT: two-level paste so that arguments are macro-expanded before ##.
+#define CAT(x, y) x##y
+#define XCAT(x,y) CAT(x,y)
+// shufxor(x,s): within each 128-bit half, 32-bit element i is taken from i^s.
+#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s ))
+
+// imported from vector.c
+// REDUCE: (x & 255) - (x >> 8) per 16-bit element; congruent to x modulo 257.
+#define REDUCE(x) \
+  _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \
+                                         _mm256_srai_epi16( x, 8 ) )
+// EXTRA_REDUCE_S: subtract 257 from every 16-bit element greater than 128.
+#define EXTRA_REDUCE_S(x)\
+  _mm256_sub_epi16( x, \
+         _mm256_and_si256( _mm256_set1_epi16( 257 ), \
+                           _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) )
+// REDUCE_FULL_S: REDUCE then EXTRA_REDUCE_S; the argument is expanded 4 times.
+#define REDUCE_FULL_S( x )  EXTRA_REDUCE_S( REDUCE (x ) )
+// DO_REDUCE*: in-place forms on X(i); X() must be defined at the point of use.
+#define DO_REDUCE( i )      X(i) = REDUCE( X(i) )
+
+#define DO_REDUCE_FULL_S(i) \
+do { \
+    X(i) = REDUCE( X(i) );                        \
+    X(i) = EXTRA_REDUCE_S( X(i) );                \
+} while(0)
+
+void fft64_2way( void *a )
+{
+  __m256i* const A = a;
+  register __m256i X0, X1, X2, X3, X4, X5, X6, X7;
+  // Transforms, in place, the eight vectors at a (accessed as __m256i).
+#define X(i) X##i
+
+  X0 = A[0];
+  X1 = A[1];
+  X2 = A[2];
+  X3 = A[3];
+  X4 = A[4];
+  X5 = A[5];
+  X6 = A[6];
+  X7 = A[7];
+
+#define DO_REDUCE(i)   X(i) = REDUCE( X(i) )
+
+   // Begin with 8 parallel DIF FFT_8
+   //
+   // FFT_8 using w=4 as 8th root of unity
+   //  Unrolled decimation in frequency (DIF) radix-2 NTT.
+   //  Output data is in revbin_permuted order.
+
+  static const int w[] = {0, 2, 4, 6};
+  // w[n] is the left-shift count used by BUTTERFLY_N (multiply by 2^w[n]).
+
+
+#define BUTTERFLY_0( i,j ) \
+do { \
+    __m256i v = X(j); \
+    X(j) = _mm256_add_epi16( X(i), X(j) ); \
+    X(i) = _mm256_sub_epi16( X(i), v ); \
+} while(0)
+
+#define BUTTERFLY_N( i,j,n ) \
+do { \
+    __m256i v = X(j); \
+    X(j) = _mm256_add_epi16( X(i), X(j) ); \
+    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
+} while(0)
+
+  BUTTERFLY_0( 0, 4 );
+  BUTTERFLY_N( 1, 5, 1 );
+  BUTTERFLY_N( 2, 6, 2 );
+  BUTTERFLY_N( 3, 7, 3 );
+
+  DO_REDUCE( 2 );
+  DO_REDUCE( 3 );
+
+  BUTTERFLY_0( 0, 2 );
+  BUTTERFLY_0( 4, 6 );
+  BUTTERFLY_N( 1, 3, 2 );
+  BUTTERFLY_N( 5, 7, 2 );
+
+  DO_REDUCE( 1 );
+
+  BUTTERFLY_0( 0, 1 );
+  BUTTERFLY_0( 2, 3 );
+  BUTTERFLY_0( 4, 5 );
+  BUTTERFLY_0( 6, 7 );
+
+  /* We don't need to reduce X(7) */
+  DO_REDUCE_FULL_S( 0 );
+  DO_REDUCE_FULL_S( 1 );
+  DO_REDUCE_FULL_S( 2 );
+  DO_REDUCE_FULL_S( 3 );
+  DO_REDUCE_FULL_S( 4 );
+  DO_REDUCE_FULL_S( 5 );
+  DO_REDUCE_FULL_S( 6 );
+
+#undef BUTTERFLY_0
+#undef BUTTERFLY_N
+
+  // Multiply by twiddle factors
+  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
+  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
+  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
+  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
+  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
+  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
+  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );
+
+  // Transpose the FFT state with a revbin order permutation
+  // on the rows and the column.
+  // This will make the full FFT_64 in order.
+#define INTERLEAVE(i,j) \
+  do { \
+    __m256i t1= X(i); \
+    __m256i t2= X(j); \
+    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
+    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
+  } while(0)
+
+  INTERLEAVE( 1, 0 );
+  INTERLEAVE( 3, 2 );
+  INTERLEAVE( 5, 4 );
+  INTERLEAVE( 7, 6 );
+
+  INTERLEAVE( 2, 0 );
+  INTERLEAVE( 3, 1 );
+  INTERLEAVE( 6, 4 );
+  INTERLEAVE( 7, 5 );
+
+  INTERLEAVE( 4, 0 );
+  INTERLEAVE( 5, 1 );
+  INTERLEAVE( 6, 2 );
+  INTERLEAVE( 7, 3 );
+
+#undef INTERLEAVE
+
+   // Finish with 8 parallel DIT FFT_8
+   // FFT_8 using w=4 as 8th root of unity
+   // Unrolled decimation in time (DIT) radix-2 NTT.
+   // Input data is in revbin_permuted order.
+
+#define BUTTERFLY_0( i,j ) \
+do { \
+   __m256i u = X(j); \
+   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
+   X(i) = _mm256_add_epi16( u, X(i) ); \
+} while(0)
+
+
+#define BUTTERFLY_N( i,j,n ) \
+do { \
+   __m256i u = X(j); \
+   X(i) = _mm256_slli_epi16( X(i), w[n] ); \
+   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
+   X(i) = _mm256_add_epi16( u, X(i) ); \
+} while(0)
+
+  DO_REDUCE( 0 );
+  DO_REDUCE( 1 );
+  DO_REDUCE( 2 );
+  DO_REDUCE( 3 );
+  DO_REDUCE( 4 );
+  DO_REDUCE( 5 );
+  DO_REDUCE( 6 );
+  DO_REDUCE( 7 );
+
+  BUTTERFLY_0( 0, 1 );
+  BUTTERFLY_0( 2, 3 );
+  BUTTERFLY_0( 4, 5 );
+  BUTTERFLY_0( 6, 7 );
+
+  BUTTERFLY_0( 0, 2 );
+  BUTTERFLY_0( 4, 6 );
+  BUTTERFLY_N( 1, 3, 2 );
+  BUTTERFLY_N( 5, 7, 2 );
+
+  DO_REDUCE( 3 );
+
+  BUTTERFLY_0( 0, 4 );
+  BUTTERFLY_N( 1, 5, 1 );
+  BUTTERFLY_N( 2, 6, 2 );
+  BUTTERFLY_N( 3, 7, 3 );
+
+  DO_REDUCE_FULL_S( 0 );
+  DO_REDUCE_FULL_S( 1 );
+  DO_REDUCE_FULL_S( 2 );
+  DO_REDUCE_FULL_S( 3 );
+  DO_REDUCE_FULL_S( 4 );
+  DO_REDUCE_FULL_S( 5 );
+  DO_REDUCE_FULL_S( 6 );
+  DO_REDUCE_FULL_S( 7 );
+  // Drop the DIT butterflies so they do not leak past this function.
+#undef BUTTERFLY_0
+#undef BUTTERFLY_N
+  A[0] = X0;
+  A[1] = X1;
+  A[2] = X2;
+  A[3] = X3;
+  A[4] = X4;
+  A[5] = X5;
+  A[6] = X6;
+  A[7] = X7;
+
+#undef X
+}
+
+// Splits the 16 vectors at a into two halves, runs fft64_2way on each, merges.
+void fft128_2way( void *a )
+{
+  __m256i *const A = (__m256i*) a;
+  __m256i half_sum[8];   // kept apart from A so the final merge can refill A
+  /* Size-2 butterflies: sums to half_sum[], twiddled differences to A[8..15] */
+  for ( int k = 0; k < 8; k++ )
+  {
+    const __m256i top = A[ k ];
+    const __m256i bot = A[ k+8 ];
+    __m256i sum  = _mm256_add_epi16( top, bot );
+    __m256i diff = _mm256_sub_epi16( top, bot );
+    diff = REDUCE_FULL_S( diff );
+    diff = _mm256_mullo_epi16( diff, FFT128_Twiddle[k].m256i );
+    half_sum[ k ] = REDUCE_FULL_S( sum );
+    A[ k+8 ]      = REDUCE_FULL_S( diff );
+  }
+
+  fft64_2way( half_sum );
+  fft64_2way( A+8 );
+
+  /* Transpose (i.e. interleave) the 16-bit elements of the two results */
+  for ( int k = 0; k < 8; k++ )
+  {
+    const __m256i first = half_sum[ k ], second = A[ k+8 ];
+    A[ 2*k   ] = _mm256_unpacklo_epi16( first, second );
+    A[ 2*k+1 ] = _mm256_unpackhi_epi16( first, second );
+  }
+}
+
+void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
+{
+  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
+  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
+  // X: 4 vectors of message bytes (read).  A: 16 vectors of 16-bit words (written).
+  __m256i *X = (__m256i*)x;
+  __m256i *A = (__m256i*)a;
+  // A[0..7] get the zero-extended bytes; A[8..15] the same times FFT128_Twiddle.
+
+#define UNPACK( i ) \
+do { \
+    __m256i t = X[i]; \
+    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
+    A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
+    A[2*i+8] = REDUCE(A[2*i+8]); \
+    A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
+    A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
+    A[2*i+9] = REDUCE(A[2*i+9]); \
+} while(0)
+
+    // As UNPACK, but the upper half gets +tw / -tw first (to introduce X^127)
+#define UNPACK_TWEAK( i,tw ) \
+do { \
+    __m256i t = X[i]; \
+    __m256i tmp; \
+    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
+    A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
+    A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
+    tmp      = _mm256_unpackhi_epi8( t, m256_zero ); \
+    A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
+    A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
+                                   FFT128_Twiddle[ 2*i+1 ].m256i );\
+    A[2*i+9] = REDUCE( A[ 2*i+9 ] );                       \
+} while(0)
+  // Only the last input vector is tweaked; a nonzero final selects FinalTweak.
+  UNPACK( 0 );
+  UNPACK( 1 );
+  UNPACK( 2 );
+  if ( final )
+    UNPACK_TWEAK( 3, FinalTweak.m256i );
+  else
+    UNPACK_TWEAK( 3, Tweak.m256i );
+
+#undef UNPACK
+#undef UNPACK_TWEAK
+  // Transform each 8-vector half on its own (a+128 is the address of A[8]).
+  fft64_2way( a );
+  fft64_2way( a+128 );
+}
+
+void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final )
+{
+  static const m256_v16 Tweak      = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }};
+  static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }};
+  // X: 8 vectors of message bytes (read).  A: 32 vectors of 16-bit words (written).
+  __m256i *X = (__m256i*)x;
+  __m256i *A = (__m256i*)a;
+  // A[0..15] get the zero-extended bytes; A[16..31] the same times FFT256_Twiddle.
+
+#define UNPACK( i ) \
+do { \
+    __m256i t = X[i]; \
+    A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
+    A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
+                                        FFT256_Twiddle[ 2*i ].m256i ); \
+    A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
+    A[ 2*i +  1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
+    A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
+                                        FFT256_Twiddle[ 2*i + 1 ].m256i ); \
+    A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
+} while(0)
+  // NOTE(review): unlike fft128_2way_msg, A[2*i+17] is not REDUCEd below - confirm intended.
+   // As UNPACK, but the upper half gets +tw / -tw first (to introduce X^127)
+#define UNPACK_TWEAK( i,tw ) \
+do { \
+    __m256i t = X[i]; \
+    __m256i tmp; \
+    A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
+    A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
+                                        FFT256_Twiddle[ 2*i ].m256i ); \
+    A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
+    tmp           = _mm256_unpackhi_epi8( t, m256_zero ); \
+    A[ 2*i +  1 ] = _mm256_add_epi16( tmp, tw ); \
+    A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
+                                        FFT256_Twiddle[ 2*i + 1 ].m256i ); \
+  } while(0)
+  // Only the last input vector is tweaked; a nonzero final selects FinalTweak.
+  UNPACK( 0 );
+  UNPACK( 1 );
+  UNPACK( 2 );
+  UNPACK( 3 );
+  UNPACK( 4 );
+  UNPACK( 5 );
+  UNPACK( 6 );
+  if ( final )
+    UNPACK_TWEAK( 7, FinalTweak.m256i );
+  else
+    UNPACK_TWEAK( 7, Tweak.m256i );
+
+#undef UNPACK
+#undef UNPACK_TWEAK
+  // Transform each 16-vector half on its own (a+256 is the address of A[16]).
+  fft128_2way( a );
+  fft128_2way( a+256 );
+}
+
+void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
+{
+  register __m256i S0l, S1l, S2l, S3l;
+  register __m256i S0h, S1h, S2h, S3h;
+  __m256i *S = (__m256i*) state;
+  __m256i *M = (__m256i*) msg;
+  __m256i *W = (__m256i*) fft;
+  static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };
+  // Working registers: the 8 state vectors XORed with the first 8 message vectors.
+  S0l = _mm256_xor_si256( S[0], M[0] );
+  S0h = _mm256_xor_si256( S[1], M[1] );
+  S1l = _mm256_xor_si256( S[2], M[2] );
+  S1h = _mm256_xor_si256( S[3], M[3] );
+  S2l = _mm256_xor_si256( S[4], M[4] );
+  S2h = _mm256_xor_si256( S[5], M[5] );
+  S3l = _mm256_xor_si256( S[6], M[6] );
+  S3h = _mm256_xor_si256( S[7], M[7] );
+  // S(i) pastes to the register base name Si; users append the l/h suffix.
+#define S(i) S##i
+  // F_0: bitwise select (B ? C : D).   F_1: bitwise majority of B, C, D.
+#define F_0(B, C, D) \
+   _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D )
+#define F_1(B, C, D) \
+   _mm256_or_si256( _mm256_and_si256( D, C ),\
+                    _mm256_and_si256( _mm256_or_si256( D,C ), B ) )
+
+#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l)
+#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h)
+
+  // We split the round function in two halves
+  // so as to insert some independent computations in between
+  // SUM7_xy is (x + y) mod 7, spelled out so that it can be token-pasted.
+#define SUM7_00 0
+#define SUM7_01 1
+#define SUM7_02 2
+#define SUM7_03 3
+#define SUM7_04 4
+#define SUM7_05 5
+#define SUM7_06 6
+
+#define SUM7_10 1
+#define SUM7_11 2
+#define SUM7_12 3
+#define SUM7_13 4
+#define SUM7_14 5
+#define SUM7_15 6
+#define SUM7_16 0
+
+#define SUM7_20 2
+#define SUM7_21 3
+#define SUM7_22 4
+#define SUM7_23 5
+#define SUM7_24 6
+#define SUM7_25 0
+#define SUM7_26 1
+
+#define SUM7_30 3
+#define SUM7_31 4
+#define SUM7_32 5
+#define SUM7_33 6
+#define SUM7_34 0
+#define SUM7_35 1
+#define SUM7_36 2
+
+#define SUM7_40 4
+#define SUM7_41 5
+#define SUM7_42 6
+#define SUM7_43 0
+#define SUM7_44 1
+#define SUM7_45 2
+#define SUM7_46 3
+
+#define SUM7_50 5
+#define SUM7_51 6
+#define SUM7_52 0
+#define SUM7_53 1
+#define SUM7_54 2
+#define SUM7_55 3
+#define SUM7_56 4
+
+#define SUM7_60 6
+#define SUM7_61 0
+#define SUM7_62 1
+#define SUM7_63 2
+#define SUM7_64 3
+#define SUM7_65 4
+#define SUM7_66 5
+  // PERM(z,d,a): apply PERM_n with n = (z + PERM_START) mod 7.
+#define PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a)
+
+#define PERM_0(d,a) /* XOR 1 */ \
+do { \
+    d##l = shufxor( a##l, 1 ); \
+    d##h = shufxor( a##h, 1 ); \
+ } while(0)
+
+#define PERM_1(d,a) /* XOR 6 */ \
+do { \
+    d##l = shufxor( a##h, 2 ); \
+    d##h = shufxor( a##l, 2 ); \
+} while(0)
+
+#define PERM_2(d,a) /* XOR 2 */ \
+do { \
+    d##l = shufxor( a##l, 2 ); \
+    d##h = shufxor( a##h, 2 ); \
+} while(0)
+
+#define PERM_3(d,a) /* XOR 3 */ \
+do { \
+    d##l = shufxor( a##l, 3 ); \
+    d##h = shufxor( a##h, 3 ); \
+} while(0)
+
+#define PERM_4(d,a) /* XOR 5 */ \
+do { \
+    d##l = shufxor( a##h, 1 ); \
+    d##h = shufxor( a##l, 1 ); \
+} while(0)
+
+#define PERM_5(d,a) /* XOR 7 */ \
+do { \
+    d##l = shufxor( a##h, 3 ); \
+    d##h = shufxor( a##l, 3 ); \
+} while(0)
+
+#define PERM_6(d,a) /* XOR 4 */ \
+do { \
+    d##l = a##h; \
+    d##h = a##l; \
+} while(0)
+  // STEP_1_: TT = rotl( F(a,b,c) + w + d, s ); a = rotl( a, r ); d = PERM( a ).
+#define STEP_1_(a,b,c,d,w,fun,r,s,z) \
+do { \
+    TTl  = Fl( a,b,c,fun ); \
+    TTh  = Fh( a,b,c,fun ); \
+    a##l = mm256_rotl_32( a##l, r ); \
+    a##h = mm256_rotl_32( a##h, r ); \
+    w##l = _mm256_add_epi32( w##l, d##l ); \
+    w##h = _mm256_add_epi32( w##h, d##h ); \
+    TTl  = _mm256_add_epi32( TTl, w##l ); \
+    TTh  = _mm256_add_epi32( TTh, w##h ); \
+    TTl  = mm256_rotl_32( TTl, s ); \
+    TTh  = mm256_rotl_32( TTh, s ); \
+    PERM( z,d,a ); \
+} while(0)
+
+#define STEP_1( a,b,c,d,w,fun,r,s,z )   STEP_1_( a,b,c,d,w,fun,r,s,z )
+  // STEP_2_: d += TT, completing the step that STEP_1_ started.
+#define STEP_2_( a,b,c,d,w,fun,r,s ) \
+do { \
+    d##l = _mm256_add_epi32( d##l, TTl ); \
+    d##h = _mm256_add_epi32( d##h, TTh ); \
+} while(0)
+
+#define STEP_2( a,b,c,d,w,fun,r,s )  STEP_2_( a,b,c,d,w,fun,r,s )
+  // STEP: one whole step, with w1 / w2 as the l / h message words.
+#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \
+do { \
+    register __m256i TTl, TTh, Wl=w1, Wh=w2; \
+    STEP_1( a,b,c,d,W,fun,r,s,z ); \
+    STEP_2( a,b,c,d,W,fun,r,s ); \
+} while(0)
+  // MSG_l / MSG_h map x to index 2x / 2x+1 of the expanded message W[].
+#define MSG_l(x) (2*(x))
+#define MSG_h(x) (2*(x)+1)
+  // MSG: interleave the 16-bit words of W[a] and W[b], multiply by code[z].
+#define MSG( w,hh,ll,u,z ) \
+do { \
+    int a = MSG_##u(hh); \
+    int b = MSG_##u(ll); \
+    w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
+    w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
+    w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \
+    w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
+} while(0)
+  // ROUND: four steps rotating S0..S3, with MSG loads between step halves.
+#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
+do { \
+    register __m256i W0l, W1l, W2l, W3l, TTl; \
+    register __m256i W0h, W1h, W2h, W3h, TTh; \
+    MSG( W0, h0, l0, u0, z ); \
+    STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \
+    MSG( W1, h1, l1, u1, z ); \
+    STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \
+    STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \
+    MSG( W2,h2,l2,u2,z ); \
+    STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \
+    STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \
+    MSG( W3,h3,l3,u3,z ); \
+    STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \
+    STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \
+    STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \
+} while(0)
+
+   // 4 rounds with code 185
+#define PERM_START 0
+   ROUND(  2, 10, l,  3, 11, l,  0,  8, l,  1,  9, l, 0, 3,  23, 17, 27, 0);
+#undef PERM_START
+#define PERM_START 4
+   ROUND(  3, 11, h,  2, 10, h,  1,  9, h,  0,  8, h, 1, 3,  23, 17, 27, 0);
+#undef PERM_START
+#define PERM_START 1
+   ROUND(  7, 15, h,  5, 13, h,  6, 14, l,  4, 12, l, 0, 28, 19, 22, 7,  0);
+#undef PERM_START
+#define PERM_START 5
+   ROUND(  4, 12, h,  6, 14, h,  5, 13, l,  7, 15, l, 1, 28, 19, 22, 7,  0);
+#undef PERM_START
+
+   // 4 rounds with code 233
+#define PERM_START 2
+   ROUND(  0,  4, h,  1,  5, l,  3,  7, h,  2,  6, l, 0, 29,  9, 15,  5, 1);
+#undef PERM_START
+#define PERM_START 6
+   ROUND(  3,  7, l,  2,  6, h,  0,  4, l,  1,  5, h, 1, 29,  9, 15,  5, 1);
+#undef PERM_START
+#define PERM_START 3
+   ROUND( 11, 15, l,  8, 12, l,  8, 12, h, 11, 15, h, 0,  4, 13, 10, 25, 1);
+#undef PERM_START
+#define PERM_START 0
+   ROUND(  9, 13, h, 10, 14, h, 10, 14, l,  9, 13, l, 1,  4, 13, 10, 25, 1);
+#undef PERM_START
+
+   // 1 round as feed-forward
+#define PERM_START 4
+   STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0,  4, 13, 0 );
+   STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 );
+   STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 );
+   STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25,  4, 3 );
+   // Store only now: the feed-forward steps above read the old state from S[].
+   S[0] = S0l;  S[1] = S0h;  S[2] = S1l;  S[3] = S1h;
+   S[4] = S2l;  S[5] = S2h;  S[6] = S3l;  S[7] = S3h;
+
+#undef PERM_START
+#undef STEP_1
+#undef STEP_2
+#undef STEP
+#undef ROUND
+}
+
+// Expands block m into 32 vectors of 16-bit words, then mixes it into state->A.
+void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final )
+{
+   m256_v16 expanded[32];
+   fft256_2way_msg( (uint16_t*) expanded[0].u16, m, final );
+   rounds512_2way( state->A, m, (uint16_t*) expanded[0].u16 );
+}
+
+// imported from nist.c
+
+// Sets the block parameters and loads SIMD_IV_512 into both halves of state->A.
+int simd_2way_init( simd_2way_context *state, int hashbitlen )
+{
+  __m256i *const lanes = (__m256i*)state->A;
+
+  state->hashbitlen = hashbitlen;   // only recorded; the IV does not depend on it
+  state->n_feistels = 8;
+  state->blocksize  = 128*8;
+  state->count      = 0;
+  for ( int row = 0; row < 8; row++ )
+  {
+    const uint32_t *iv = &SIMD_IV_512[ 4*row ];
+    lanes[row] = _mm256_set_epi32( iv[3], iv[2], iv[1], iv[0],
+                                   iv[3], iv[2], iv[1], iv[0] );
+  }
+  return 0;
+}
+
+// Absorbs databitlen bits per lane from data. data carries both lanes, so
+// every pointer step and copy length is doubled. Whole blocks are compressed
+// straight from data; a partial tail is staged in state->buffer. Returns 0.
+int simd_2way_update( simd_2way_context *state, const void *data,
+                             int databitlen )
+{
+  const uint8_t *in = (const uint8_t*)data;  // ISO C has no void* arithmetic
+  int bs      = state->blocksize;
+  int current = state->count & (bs - 1);
+  while ( databitlen > 0 )
+  {
+    if ( current == 0 && databitlen >= bs )
+    {
+       // We can hash the data directly from the input buffer.
+      SIMD_2way_Compress( state, in, 0 );
+      databitlen -= bs;
+      in += 2*(bs/8);
+      state->count += bs;
+    }
+    else
+    {
+       // Copy a chunk of data to the buffer
+      int len = bs - current;
+      if ( databitlen < len )
+      {
+        memcpy( state->buffer + 2*(current/8), in, 2*((databitlen+7)/8) );
+        state->count += databitlen;
+        return 0;
+      }
+      memcpy( state->buffer + 2*(current/8), in, 2*(len/8) );
+      state->count += len;
+      databitlen -= len;
+      in += 2*(len/8);
+      current = 0;
+      SIMD_2way_Compress( state, state->buffer, 0 );
+    }
+  }
+  return 0;
+}
+
+// Finishes the hash: flushes any buffered partial block (zero padded),
+// compresses a block carrying the 64-bit bit count, then copies out state->A.
+int simd_2way_close( simd_2way_context *state, void *hashval )
+{
+  const unsigned int lane_bytes = state->blocksize / 8;
+  const int pending_bits = state->count & (state->blocksize - 1);
+
+  if ( pending_bits != 0 )
+  {
+    // Zero everything past the buffered bytes, then hash the padded block.
+    const int used = ( pending_bits + 7 ) / 8;
+    memset( state->buffer + 2*used, 0, 2*( lane_bytes - used ) );
+    SIMD_2way_Compress( state, state->buffer, 0 );
+  }
+
+  // Length block: the bit count, low byte first, at offset 0 and at offset 16.
+  memset( state->buffer, 0, 2*lane_bytes );
+  for ( int k = 0; k < 8; k++ )
+  {
+    const uint8_t octet = (uint8_t)( state->count >> ( 8*k ) );
+    state->buffer[ k    ] = octet;
+    state->buffer[ k+16 ] = octet;
+  }
+
+  // The final flag is 2 when fewer than 16384 bits were hashed, otherwise 1.
+  const int last_flag = ( state->count < 16384 ) ? 2 : 1;
+
+  SIMD_2way_Compress( state, state->buffer, last_flag );
+  memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );
+
+  return 0;
+}
+
+// One-shot update + close: absorbs databitlen bits per lane from data (which
+// carries both lanes, hence the doubled byte counts), pads, hashes the length
+// block and copies 2*(hashbitlen/8) bytes of state->A to hashval. Returns 0.
+int simd_2way_update_close( simd_2way_context *state, void *hashval,
+                            const void *data, int databitlen )
+{
+  int current, i;
+  int bs = state->blocksize;  // bits in one lane
+  int isshort = 1;
+  uint64_t l;
+  const uint8_t *in = (const uint8_t*)data;  // ISO C has no void* arithmetic
+  current = state->count & (bs - 1);
+
+  while ( databitlen > 0 )
+  {
+    if ( current == 0 && databitlen >= bs )
+    {
+      // We can hash the data directly from the input buffer.
+      SIMD_2way_Compress( state, in, 0 );
+      databitlen -= bs;
+      in += 2*( bs/8 );
+      state->count += bs;
+    }
+    else
+    {
+      // Copy a chunk of data to the buffer
+      int len = bs - current;
+      if ( databitlen < len )
+      {
+        memcpy( state->buffer + 2*( current/8 ), in, 2*( (databitlen+7)/8 ) );
+        state->count += databitlen;
+        break;
+      }
+      memcpy( state->buffer + 2*(current/8), in, 2*(len/8) );
+      state->count += len;
+      databitlen -= len;
+      in += 2*( len/8 );
+      current = 0;
+      SIMD_2way_Compress( state, state->buffer, 0 );
+    }
+  }
+
+  current = state->count & (state->blocksize - 1);
+
+  // If there is still some data in the buffer, hash it
+  if ( current )
+  {
+    current = ( current+7 ) / 8;
+    memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) );
+    SIMD_2way_Compress( state, state->buffer, 0 );
+  }
+
+  // Input the message length as the last block (same 8 bytes at offsets 0 and 16)
+  memset( state->buffer, 0, 2*( state->blocksize/8 ) );
+  l = state->count;
+  for ( i = 0; i < 8; i++ )
+  {
+    state->buffer[ i    ] = l & 0xff;
+    state->buffer[ i+16 ] = l & 0xff;
+    l >>= 8;
+  }
+  if ( state->count < 16384 )
+    isshort = 2;
+
+  SIMD_2way_Compress( state, state->buffer, isshort );
+  memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) );
+  return 0;
+}
+
+#endif
--- a/algo/simd/simd-hash-2way.h
+++ b/algo/simd/simd-hash-2way.h
@@ -0,0 +1,27 @@
+#ifndef SIMD_HASH_2WAY_H__
+#define SIMD_HASH_2WAY_H__ 1
+
+#include "simd-compat.h"
+
+#if defined(__AVX2__)
+
+#include "avxdefs.h"
+
+typedef struct {
+  uint32_t A[ 32*2 ] __attribute__((aligned(64)));      // hash state; accessed as 8 x __m256i
+  uint8_t buffer[ 128*2 ] __attribute__((aligned(64))); // staging for one partial block (2 x 128 bytes)
+  uint64_t count;            // bits absorbed so far; advanced by the update functions
+  unsigned int hashbitlen;   // as passed to simd_2way_init(); sizes the digest copy in close
+  unsigned int blocksize;    // block size in bits; simd_2way_init() sets 128*8
+  unsigned int n_feistels;   // simd_2way_init() sets 8
+  
+} simd_2way_context;
+
+int simd_2way_init( simd_2way_context *state, int hashbitlen );  // reset counters, load the IV
+int simd_2way_update( simd_2way_context *state, const void *data,
+                      int databitlen );                          // absorb databitlen bits
+int simd_2way_close( simd_2way_context *state, void *hashval );  // pad, finish, write hashval
+int simd_2way_update_close( simd_2way_context *state, void *hashval,
+                            const void *data, int databitlen );  // update + close in one call
+#endif
+#endif
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	a28daca3ce	v3.8.1	2018-02-07 16:38:45 -05:00
Jay D Dee	54b8fd7362	v3.8.0.1	2018-02-05 22:10:18 -05:00
Jay D Dee	ad2275f74a	v3.8.0	2018-01-23 21:02:16 -05:00
Jay D Dee	a90d75b8f5	v3.7.10	2018-01-16 15:11:44 -05:00
Jay D Dee	bee78eac76	v3.7.9	2018-01-08 22:04:43 -05:00
Jay D Dee	2d2e54f001	v3.7.8	2017-12-30 19:19:46 -05:00