v3.7.4

2025-09-17 23:44:27 +00:00 · 2017-11-28 16:32:04 -05:00
parent 6d1361c87f
commit 4b57ac0eb9
70 changed files with 10549 additions and 2852 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,24 +22,6 @@ cpuminer_SOURCES = \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
-  algo/groestl/sph_groestl.c \
-  algo/bmw/sph_bmw.c \
-  algo/shavite/sph_shavite.c \
-  algo/shavite/shavite.c \
-  algo/echo/sph_echo.c \
-  algo/heavy/sph_hefty1.c \
-  algo/luffa/sph_luffa.c \
-  algo/cubehash/sph_cubehash.c \
-  algo/simd/sph_simd.c \
-  algo/hamsi/sph_hamsi.c \
-  algo/fugue/sph_fugue.c \
-  algo/gost/sph_gost.c \
-  algo/jh/sph_jh.c \
-  algo/sha/sph_sha2.c \
-  algo/sha/sph_sha2big.c \
-  algo/shabal/sph_shabal.c \
-  algo/sm3/sm3.c \
-  algo/whirlpool/sph_whirlpool.c\
  crypto/blake2s.c \
  crypto/oaes_lib.c \
  crypto/c_keccak.c \
@@ -67,22 +49,34 @@ cpuminer_SOURCES = \
  algo/blake/blake2s.c \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
+  algo/blake/decred-gate.c \
  algo/blake/decred.c \
+  algo/blake/decred-4way.c \
+  algo/blake/pentablake-gate.c \
+  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
+  algo/bmw/sph_bmw.c \
  algo/bmw/bmw256.c \
-  algo/cubehash/sse2/cubehash_sse2.c\
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
  algo/cryptonight/cryptonight-aesni.c\
  algo/cryptonight/cryptonight.c\
+  algo/cubehash/sph_cubehash.c \
+  algo/cubehash/sse2/cubehash_sse2.c\
  algo/drop.c \
+  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
  algo/fresh.c \
+  algo/gost/sph_gost.c \
+  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
  algo/groestl/myr-groestl.c \
  algo/groestl/aes_ni/hash-groestl.c \
  algo/groestl/aes_ni/hash-groestl256.c \
+  algo/fugue/sph_fugue.c \
+  algo/hamsi/sph_hamsi.c \
  algo/haval/haval.c\
+  algo/heavy/sph_hefty1.c \
  algo/heavy/heavy.c \
  algo/heavy/bastion.c \
  algo/hmq1725.c \
@@ -91,6 +85,10 @@ cpuminer_SOURCES = \
  algo/hodl/hodl-wolf.c \
  algo/hodl/sha512_avx.c \
  algo/hodl/sha512_avx2.c \
+  algo/jh/sph_jh.c \
+  algo/jh/jh-hash-4way.c \
+  algo/jh/jha-gate.c \
+  algo/jh/jha-4way.c \
  algo/jh/jha.c \
  algo/keccak/sph_keccak.c \
  algo/keccak/keccak.c\
@@ -99,6 +97,7 @@ cpuminer_SOURCES = \
  algo/keccak/keccak-gate.c \
  algo/keccak/sse2/keccak.c \
  algo/lbry.c \
+  algo/luffa/sph_luffa.c \
  algo/luffa/luffa.c \
  algo/luffa/sse2/luffa_for_sse2.c \
  algo/lyra2/lyra2.c \
@@ -109,7 +108,9 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2z330.c \
  algo/m7m.c \
  algo/neoscrypt.c \
-  algo/nist5.c \
+  algo/nist5/nist5-gate.c \
+  algo/nist5/nist5-4way.c \
+  algo/nist5/nist5.c \
  algo/pluck.c \
  algo/polytimos/polytimos-gate.c \
  algo/polytimos/polytimos.c \
@@ -119,8 +120,14 @@ cpuminer_SOURCES = \
  algo/ripemd/sph_ripemd.c \
  algo/scrypt.c \
  algo/scryptjane/scrypt-jane.c \
+  algo/sha/sph_sha2.c \
+  algo/sha/sph_sha2big.c \
  algo/sha/sha2.c \
  algo/sha/sha256t.c \
+  algo/shabal/sph_shabal.c \
+  algo/shavite/sph_shavite.c \
+  algo/shavite/shavite.c \
+  algo/simd/sph_simd.c \
  algo/simd/sse2/nist.c \
  algo/simd/sse2/vector.c \
  algo/skein/sph_skein.c \
@@ -132,11 +139,18 @@ cpuminer_SOURCES = \
  algo/skein/skein2-4way.c \
  algo/skein/skein2-gate.c \
  algo/skunk.c \
+  algo/sm3/sm3.c \
  algo/tiger/sph_tiger.c \
  algo/timetravel.c \
  algo/timetravel10.c \
-  algo/tribus.c \
+  algo/tribus/tribus-gate.c \
+  algo/tribus/tribus.c \
+  algo/tribus/tribus-4way.c \
  algo/veltor.c \
+  algo/whirlpool/sph_whirlpool.c \
+  algo/whirlpool/whirlpool-hash-4way.c \
+  algo/whirlpool/whirlpool-gate.c \
+  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
  algo/x11/phi1612.c \
--- a/README.txt
+++ b/README.txt
@@ -17,13 +17,17 @@ supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.

-Exe name                  Compile opts       Arch name
-
-cpuminer-sse2.exe         -march=core2,      Core2   
-cpuminer-sse42.exe        -march=corei7,     Nehalem
-cpuminer-aes-sse42.exe    -maes -msse4.2     Westmere
-cpuminer-aes-avx.exe      -march=corei7-avx, Sandybridge, Ivybridge
-cpuminer-aes-avx2.exe     -march=core-avx2,  Haswell, Broadwell, Skylake, Kabylake
+Exe name                  Compile opts         Arch name

+cpuminer-sse2.exe         "-march=core2"       Core2   
+cpuminer-sse42.exe        "-march=corei7"      Nehalem
+cpuminer-aes-sse42.exe    "-maes -msse4.2"     Westmere
+cpuminer-aes-avx.exe      "-march=corei7-avx"  Sandybridge, Ivybridge
+cpuminer-aes-avx2.exe     "-march=core-avx2"   Haswell, Broadwell, Skylake, Kabylake
+cpuminer-4way.exe         "-march=core-avx2 -DFOUR_WAY"

+4way requires a CPU with AES and AVX2. It is still under development and
+only a few algos are supported. See change log in RELEASE_NOTES in source
+package for supported algos.

+There is no binary support available for SHA on AMD Ryzen CPUs.
--- a/16
+++ b/16
@@ -67,6 +67,11 @@ have been due to AVX and AVX2 optimizations added to that version.
 Additional improvements are expected on Ryzen with openssl 1.1.
 "-march-znver1" or "-msha".

+Additional instructions for static compilation can be found here:
+https://lxadm.com/Static_compilation_of_cpuminer
+Static builds should only be considered in a homogeneous HW and SW environment.
+Local builds will always have the best performance and compatibility.
+
 Extract cpuminer source.

 tar xvzf cpuminer-opt-x.y.z.tar.gz
@@ -96,6 +101,11 @@ Start mining.

 ./cpuminer -a algo -o url -u username -p password

+Windows
+
+The following is how the Windows binary releases are built. It's old and
+not very good but it works, for me anyway.
+
 Building on Windows prerequisites:

 msys
@@ -154,6 +164,12 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.7.4
+
+Removed unnecessary build options.
+
+Added 4way support for tribus and nist5.
+
 v3.7.3

 Added polytimos algo.
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -1,11 +1,11 @@
-#include "algo-gate-api.h"
+#include "blake-gate.h"
 #include "sph_blake.h"
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
 #include <memory.h>

-#if defined (__AVX__)
+#if defined (BLAKE_4WAY)

 void blakehash_4way(void *state, const void *input)
 {
@@ -41,7 +41,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
-   int num_found;
+   int num_found = 0;

 //   if (opt_benchmark)
 //      HTarget = 0x7f;
@@ -55,7 +55,6 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *noncep = vdata + 76;   // 19*4
   do {
      found[0] = found[1] = found[2] = found[3] = false;
-      num_found = 0;
      be32enc( noncep,    n   );
      be32enc( noncep +2, n+1 );
      be32enc( noncep +4, n+2 );
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -8,11 +8,11 @@ int64_t blake_get_max64 ()
 bool register_blake_algo( algo_gate_t* gate )
 {
  gate->get_max64 = (void*)&blake_get_max64;
-#if defined (__AVX2__) && defined (FOUR_WAY)
+//#if defined (__AVX2__) && defined (FOUR_WAY)
 //   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
 //  gate->scanhash  = (void*)&scanhash_blake_8way;
 //  gate->hash      = (void*)&blakehash_8way;
-#elif defined(__AVX__) && defined (FOUR_WAY)
+#if defined(BLAKE_4WAY)
  gate->optimizations = SSE2_OPT | AVX_OPT;
  gate->scanhash  = (void*)&scanhash_blake_4way;
  gate->hash      = (void*)&blakehash_4way;
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -4,13 +4,11 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined (__AVX2__) 
-//void blakehash_84way(void *state, const void *input);
-//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
-//                         uint64_t *hashes_done );
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define BLAKE_4WAY
 #endif

-#if defined (__AVX__)
+#if defined (BLAKE_4WAY)
 void blakehash_4way(void *state, const void *input);
 int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );
--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
@@ -346,7 +346,7 @@ static const sph_u32 CS[16] = {
 #define CBF   SPH_C64(0x636920D871574E69)

 #if SPH_COMPACT_BLAKE_64
-
+// not used
 static const sph_u64 CB[16] = {
 	SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344),
 	SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89),
@@ -428,7 +428,7 @@ do { \
 } while (0)

 #if SPH_COMPACT_BLAKE_64
-
+// not used
 #define ROUND_B_4WAY(r)   do { \
 	GB_4WAY(M[sigma[r][0x0]], M[sigma[r][0x1]], \
 		CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \
@@ -449,7 +449,7 @@ do { \
 } while (0)

 #else
-
+//current_impl
 #define ROUND_B_4WAY(r)   do { \
 	GB_4WAY(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \
 	GB_4WAY(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \
@@ -696,6 +696,7 @@ do { \

 #if SPH_COMPACT_BLAKE_64

+// not used
 #define COMPRESS64_4WAY   do { \
 	__m256i M[16]; \
 	__m256i V0, V1, V2, V3, V4, V5, V6, V7; \
@@ -759,6 +760,8 @@ do { \

 #else

+//current impl
+
 #define COMPRESS64_4WAY   do { \
     __m256i M0, M1, M2, M3, M4, M5, M6, M7; \
     __m256i M8, M9, MA, MB, MC, MD, ME, MF; \
@@ -986,7 +989,7 @@ blake64_4way( blake_4way_big_context *sc, const void *data, size_t len)
   size_t ptr;
   DECL_STATE64_4WAY

-   const int buf_size = 64;  //  sizeof/8 
+   const int buf_size = 128;  //  sizeof/8 

   buf = sc->buf;
   ptr = sc->ptr;
@@ -1037,7 +1040,7 @@ blake64_4way_close( blake_4way_big_context *sc,
   __m256i *out;

   ptr = sc->ptr;
-   bit_len = ((unsigned)ptr << 3) + n;
+   bit_len = ((unsigned)ptr << 3);
   z = 0x80 >> n;
   zz = ((ub & -z) | z) & 0xFF;
   u.buf[ptr>>3] = _mm256_set_epi64x( zz, zz, zz, zz );
@@ -1057,9 +1060,9 @@ blake64_4way_close( blake_4way_big_context *sc,
   {
        sc->T0 -= 1024 - bit_len;
   }
-   if ( ptr <= (96 >> 3) )
+   if ( ptr <= 104 )
   {
-       memset_zero_m256i( u.buf + (ptr>>3) + 1, (96-ptr) >> 3 );
+       memset_zero_m256i( u.buf + (ptr>>3) + 1, (104-ptr) >> 3 );
       if ( out_size_w64 == 8 )
          u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)],
                                    _mm256_set_epi64x( 0x0100000000000000,
@@ -1070,11 +1073,13 @@ blake64_4way_close( blake_4way_big_context *sc,
                                    _mm256_set_epi64x( th, th, th, th ) );
       *(u.buf+(120>>3)) = mm256_byteswap_epi64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );
+
       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
   }
   else
  {
-       memset_zero_m256i( u.buf + (ptr>>3) + 1, (127 - ptr) >> 3 );
+       memset_zero_m256i( u.buf + (ptr>>3) + 1, (120 - ptr) >> 3 );
+
       blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr );
       sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
       sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
@@ -1089,6 +1094,7 @@ blake64_4way_close( blake_4way_big_context *sc,
                                    _mm256_set_epi64x( th, th, th, th ) );
       *(u.buf+(120>>3)) = mm256_byteswap_epi64(
                                    _mm256_set_epi64x( tl, tl, tl, tl ) );
+
       blake64_4way( sc, u.buf, 128 );
   }
   out = (__m256i*)dst;
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -62,9 +62,9 @@ extern "C"{
 #ifdef __AVX__
 typedef struct {
        __m128i buf[16] __attribute__ ((aligned (64)));
-	size_t ptr;
        __m128i H[8];
        __m128i S[4];    
+        size_t ptr;
 	sph_u32 T0, T1;
 } blake_4way_small_context;

@@ -82,13 +82,13 @@ void blake256_4way_addbits_and_close(

 typedef struct {
        __m256i buf[16] __attribute__ ((aligned (64)));
-	size_t ptr;
        __m256i H[8];
        __m256i S[4];   
+        size_t ptr;
 	sph_u64 T0, T1;
 } blake_4way_big_context;

-typedef blake_4way_big_context blake512_avx2_context;
+typedef blake_4way_big_context blake512_4way_context;

 void blake512_4way_init(void *cc);
 void blake512_4way(void *cc, const void *data, size_t len);
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -0,0 +1,153 @@
+#include "decred-gate.h"
+#include "sph_blake.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+#include <unistd.h>
+
+#if defined (DECRED_4WAY)
+
+static __thread blake256_4way_context blake_mid;
+static __thread bool ctx_midstate_done = false;
+
+// Hash a 180-byte Decred header supplied as 4 interleaved lanes.
+//
+// state: receives 32 bytes, the scalar BLAKE-256 digest of lane 0 only.
+// input: 4x32-bit interleaved header data as built by scanhash_decred_4way.
+//
+// NOTE(review): this looks like work in progress. The 4-way digest is
+// computed into vhash and deinterleaved into hash0..hash3, but none of
+// those results are written to state; only the scalar digest of lane 0 is.
+void decred_hash_4way( void *state, const void *input )
+{
+     uint32_t hash0[16] __attribute__ ((aligned (64)));
+     uint32_t hash1[16] __attribute__ ((aligned (64)));
+     uint32_t hash2[16] __attribute__ ((aligned (64)));
+     uint32_t hash3[16] __attribute__ ((aligned (64)));
+     uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+     blake256_4way_context ctx __attribute__ ((aligned (64)));
+
+     sph_blake256_context ctx2 __attribute__ ((aligned (64)));
+     uint32_t hash[16] __attribute__ ((aligned (64)));
+     uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
+     // Split the interleaved input back into one 45-word buffer per lane.
+     // The last argument is presumably a length in bits (180 bytes * 8)
+     // -- TODO confirm against m128_deinterleave_4x32.
+     m128_deinterleave_4x32( sin0, sin1, sin2, sin3, (uint32_t*)input, 180*8 );
+
+     // tail and tail_len are only referenced by the disabled midstate code
+     // below; the active code path does not use them.
+     void *tail = input + DECRED_MIDSTATE_LEN;
+     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
+//     #define MIDSTATE_LEN 128
+/*
+        uint8_t *ending = (uint8_t*) input;
+        ending += MIDSTATE_LEN;
+
+     if ( !ctx_midstate_done )
+     {
+          blake256_4way_init( &blake_mid );
+          blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
+          ctx_midstate_done = true;
+     }
+     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
+
+     blake256_4way( &ctx, tail, tail_len );
+     blake256_4way_close( &ctx, vhash );
+*/
+
+
+     // Scalar reference hash of lane 0 over the full 180 bytes.
+     sph_blake256_init( &ctx2 );
+     sph_blake256( &ctx2, sin0, 180 );
+     sph_blake256_close( &ctx2, hash );
+
+     // 4-way hash of all lanes, without midstate reuse.
+     blake256_4way_init( &ctx );
+     blake256_4way( &ctx, input, 180 );
+     blake256_4way_close( &ctx, vhash );
+
+     m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+/*
+        for ( int i = 0; i < 8; i++ )
+          if ( hash[i] != hash0[i] )
+            printf(" hash mismatch, i = %u\n",i);
+
+printf("hash:  %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
+                             *(hash+2), *(hash+3) );
+printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
+                             *(hash0+2), *(hash0+3) );
+printf("\n");
+*/
+
+//     memcpy( state,    hash0, 32 );
+//     memcpy( state+32, hash1, 32 );
+//     memcpy( state+64, hash1, 32 );
+//     memcpy( state+96, hash1, 32 );
+
+     // Only the scalar lane-0 digest is returned to the caller.
+     memcpy( state, hash, 32 );
+
+}
+
+// Scan nonces for a Decred hash that meets the work target, advancing the
+// nonce by 4 per iteration.
+//
+// thr_id:      index into work_restart[] used to detect a restart request.
+// work:        supplies data/target; nonces[] and nfound[] receive results.
+// max_nonce:   scanning stops once n reaches this value.
+// hashes_done: set to n - first_nonce + 1 on return.
+// Returns the number of nonces found (0 or 1 with the code as written).
+//
+// NOTE(review): only the first 8 words of hash (the digest written by
+// decred_hash_4way) are tested; the checks for the other three lanes are
+// commented out below.
+int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done)
+{
+   uint32_t vdata[45*4] __attribute__ ((aligned (64)));
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+        // endiandata is not used by the active code in this function.
+        uint32_t _ALIGN(64) endiandata[48];
+//        uint32_t _ALIGN(64) hash32[8];
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
+        uint32_t n = first_nonce;
+        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+
+//        #define DCR_NONCE_OFT32 35
+
+        ctx_midstate_done = false;
+
+//        memcpy(endiandata, pdata, 180);
+
+   // Build the 4-lane input by interleaving four copies of the same header.
+   m128_interleave_4x32( vdata, pdata, pdata, pdata, pdata, 180*8 );
+
+   // NOTE(review): the nonce slots are written at offsets 0, 2, 4 and 6 from
+   // noncep. If m128_interleave_4x32 places the four lanes of a 32-bit word
+   // in consecutive elements, these offsets would not address lanes 1..3 of
+   // the nonce word -- confirm against the interleave layout.
+   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      * noncep    = n;
+      *(noncep+2) = n+1;
+      *(noncep+4) = n+2;
+      *(noncep+6) = n+3;
+
+      decred_hash_4way( hash, vdata );
+
+//                endiandata[DCR_NONCE_OFT32] = n;
+//                decred_hash(hash32, endiandata);
+
+      // Cheap test on the top word first, then the full target comparison.
+      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      {
+          work_set_target_ratio( work, hash );
+          found[0] = true;
+          num_found++;
+          nonces[0] = n;
+          pdata[DECRED_NONCE_INDEX] = n;
+      }
+/*      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
+      {
+          work_set_target_ratio( work, hash+8 );
+          found[1] = true;
+          num_found++;
+          nonces[1] = n;
+      }
+      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
+      {
+          work_set_target_ratio( work, hash+16 );
+          found[2] = true;
+          num_found++;
+          nonces[2] = n;
+      }
+      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
+      {
+          work_set_target_ratio( work, hash+24 );
+          found[3] = true;
+          num_found++;
+          nonces[3] = n;
+      }
+*/
+      n += 4;
+  // Stop on the first hit, at the end of the nonce range, or on restart.
+  } while ( (num_found == 0) && (n < max_nonce) 
+            && !work_restart[thr_id].restart );
+
+  *hashes_done = n - first_nonce + 1;
+  return num_found;
+}
+
+#endif
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -0,0 +1,176 @@
+#include "decred-gate.h"
+#include <unistd.h>
+#include <memory.h>
+#include <string.h>
+
+// Return the address of the nonce word inside a Decred work data array.
+uint32_t *decred_get_nonceptr( uint32_t *work_data )
+{
+   uint32_t *nonce_slot = work_data + DECRED_NONCE_INDEX;
+   return nonce_slot;
+}
+
+// Derive the network difficulty from the compact nBits word of the header.
+//
+// work: work item whose data[DECRED_NBITS_INDEX] holds nBits.
+// Returns the difficulty computed from that word.
+//
+// The previous version computed the value into a local and then returned the
+// unrelated global net_diff, so the computed result was thrown away.
+double decred_calc_network_diff( struct work* work )
+{
+   // sample for diff 43.281 : 1c05ea29
+   // todo: endian reversed on longpoll could be zr5 specific...
+   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
+   uint32_t bits = ( nbits & 0xffffff );            // low 24 bits: mantissa
+   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
+   int m;
+   double d = (double)0x0000ffff / (double)bits;
+
+   // Scale by 256 for every step the exponent is away from 29.
+   for ( m = shift; m < 29; m++ )
+       d *= 256.0;
+   for ( m = 29; m < shift; m++ )
+       d /= 256.0;
+   if ( shift == 28 )
+       d *= 256.0; // testnet
+   if ( opt_debug_diff )
+       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
+                           shift, bits );
+   return d;
+}
+
+// Randomise the extranonce word, record the block height from the header and,
+// when not long-polling, log a message the first time a newer block is seen.
+//
+// work:       work item to update (data[DECRED_XNONCE_INDEX] and height).
+// net_blocks: in/out, last known block count; set to height - 1 when a
+//             new block is reported.
+void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
+{
+   // some random extradata to make the work unique
+   // (multiply as unsigned: rand()*4 can overflow a signed int)
+   work->data[ DECRED_XNONCE_INDEX ] = (uint32_t)rand() * 4u;
+   work->height = work->data[32];
+   if (!have_longpoll && work->height > *net_blocks + 1)
+   {
+      char netinfo[64] = { 0 };
+      if (opt_showdiff && net_diff > 0.)
+      {
+         // snprintf keeps very large difficulty values from overrunning
+         // the 64-byte buffer; the text is truncated instead.
+         if (net_diff != work->targetdiff)
+            snprintf(netinfo, sizeof(netinfo), ", diff %.3f, target %.1f",
+                     net_diff, work->targetdiff);
+         else
+            snprintf(netinfo, sizeof(netinfo), ", diff %.3f", net_diff);
+       }
+       applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
+                       netinfo);
+       *net_blocks = work->height - 1;
+   }
+}
+
+// Format a stratum "mining.submit" request for a Decred share into req.
+//
+// req:  output buffer, written with at most JSON_BUF_LEN bytes.
+// work: supplies the job id plus the ntime, nonce and extranonce words.
+// sctx: supplies xnonce1_size, the number of extranonce bytes to send.
+void decred_be_build_stratum_request( char *req, struct work *work,
+                                      struct stratum_ctx *sctx )
+{
+   uint32_t be_ntime, be_nonce;
+   char ntime_hex[9], nonce_hex[9];
+   unsigned char *xnonce_hex;
+
+   // ntime: big-endian encode, then hex
+   be32enc( &be_ntime, work->data[ DECRED_NTIME_INDEX ] );
+   bin2hex( ntime_hex, (char*)(&be_ntime), sizeof(uint32_t) );
+
+   // nonce: big-endian encode, then hex
+   be32enc( &be_nonce, work->data[ DECRED_NONCE_INDEX ] );
+   bin2hex( nonce_hex, (char*)(&be_nonce), sizeof(uint32_t) );
+
+   // extranonce: hex string allocated by abin2hex, released below
+   xnonce_hex = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
+                          sctx->xnonce1_size );
+
+   snprintf( req, JSON_BUF_LEN,
+        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
+         rpc_user, work->job_id, xnonce_hex, ntime_hex, nonce_hex );
+
+   free(xnonce_hex);
+}
+#define min(a,b) (a>b ? (b) :(a))
+
+// Assemble the Decred block header in g_work->data from the current stratum
+// job: version, previous hash, merkle root, the extra header carried in the
+// coinbase, the extranonce and the 4-byte coinbase suffix.
+//
+// g_work: work item whose data[] is rebuilt and whose header is assembled.
+// sctx:   stratum context; job.xnonce2 is incremented and bloc_height set.
+//
+// The coinbase comes from the pool, so every copy out of it is bounded by
+// coinbase_size. Previously a coinbase shorter than 32 bytes made the mixed
+// int/size_t min() pick sizeof(extraheader) and read past the buffer.
+void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
+{
+   uchar merkle_root[64] = { 0 };
+   uint32_t extraheader[32] = { 0 };
+   int headersize = 0;
+   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
+   const size_t cb_size = (size_t) sctx->job.coinbase_size;
+   size_t t;
+   int i;
+
+   // getwork over stratum, getwork merkle + header passed in coinb1
+   memcpy( merkle_root, sctx->job.coinbase, cb_size < 32 ? cb_size : 32 );
+   if ( cb_size > 32 )
+   {
+      const size_t avail = cb_size - 32;
+      headersize = (int)( avail < sizeof(extraheader) ? avail
+                                                      : sizeof(extraheader) );
+      memcpy( extraheader, &sctx->job.coinbase[32], headersize );
+   }
+
+   // Increment extranonce2 
+   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
+
+   // Assemble block header 
+   memset( g_work->data, 0, sizeof(g_work->data) );
+   g_work->data[0] = le32dec( sctx->job.version );
+   for ( i = 0; i < 8; i++ )
+      g_work->data[1 + i] = swab32(
+                              le32dec( (uint32_t *) sctx->job.prevhash + i ) );
+   for ( i = 0; i < 8; i++ )
+      g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
+
+   // NOTE(review): headersize can be up to sizeof(extraheader), i.e. 32
+   // words, which reaches data[48]; confirm g_work->data is large enough.
+   for ( i = 0; i < headersize/4; i++ ) // header
+      g_work->data[17 + i] = extraheader[i];
+   // extradata
+
+   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
+      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
+   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
+      g_work->data[i] = 0;
+   // multiply as unsigned: rand()*4 can overflow a signed int
+   g_work->data[37] = ( (uint32_t)rand() * 4u ) << 8;
+   // block header suffix from coinb2 (stake version)
+   if ( cb_size >= 4 )
+      memcpy( &g_work->data[44], &sctx->job.coinbase[ cb_size - 4 ], 4 );
+   sctx->bloc_height = g_work->data[32];
+   //applog_hex(work->data, 180);
+   //applog_hex(&work->data[36], 36);
+}
+
+#undef min
+
+// Decide whether a thread may start hashing the given work item.
+//
+// Returns false when stratum is in use and either the work belongs to a
+// different job than the current one, or (outside benchmark mode) the first
+// header word is still zero; the latter case also sleeps for one second.
+// Otherwise the extranonce words are modified and true is returned.
+bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
+                           int thr_id )
+{
+   if ( have_stratum )
+   {
+      // job id mismatch: need to regen g_work..
+      if ( strcmp( stratum->job.job_id, work->job_id ) != 0 )
+         return false;
+
+      // first header word still zero: wait a second before retrying
+      if ( work->data[0] == 0 && !opt_benchmark )
+      {
+         sleep(1);
+         return false;
+      }
+   }
+
+   // extradata: prevent duplicates
+   work->data[ DECRED_XNONCE_INDEX     ] += 1;
+   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
+   return true;
+}
+
+
+// Fill in the algo gate for Decred: the Decred-specific work handling
+// callbacks and header indexes, plus either the 4-way or the scalar
+// scanhash/hash pair depending on DECRED_4WAY. Also clears the
+// allow_mininginfo and have_gbt globals. Always returns true.
+bool register_decred_algo( algo_gate_t* gate )
+{
+  // Work handling callbacks shared by both hash implementations.
+  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
+  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
+  gate->display_extra_data    = (void*)&decred_decode_extradata;
+  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
+  gate->build_extraheader     = (void*)&decred_build_extraheader;
+  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
+
+  // Header layout.
+  gate->nbits_index           = DECRED_NBITS_INDEX;
+  gate->ntime_index           = DECRED_NTIME_INDEX;
+  gate->nonce_index           = DECRED_NONCE_INDEX;
+  gate->work_data_size        = DECRED_DATA_SIZE;
+  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE;
+
+  // Hash implementation selected at compile time.
+#if defined(DECRED_4WAY)
+  gate->optimizations = SSE2_OPT | AVX_OPT;
+  gate->scanhash  = (void*)&scanhash_decred_4way;
+  gate->hash      = (void*)&decred_hash_4way;
+#else
+  gate->optimizations = SSE2_OPT;
+  gate->scanhash  = (void*)&scanhash_decred;
+  gate->hash      = (void*)&decred_hash;
+#endif
+
+  allow_mininginfo            = false;
+  have_gbt                    = false;
+  return true;
+}
+
+
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -0,0 +1,36 @@
+#ifndef __DECRED_GATE_H__
+#define __DECRED_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// Positions of header fields, as indexes into the 32-bit work->data array.
+#define DECRED_NBITS_INDEX 29
+#define DECRED_NTIME_INDEX 34
+#define DECRED_NONCE_INDEX 35
+#define DECRED_XNONCE_INDEX 36
+// Values registered in the algo gate as work_data_size and work_cmp_size
+// (presumably byte counts -- TODO confirm against the gate's users).
+#define DECRED_DATA_SIZE 192
+#define DECRED_WORK_COMPARE_SIZE 140
+// Number of leading header bytes hashed once into the reusable midstate;
+// the remaining 180 - DECRED_MIDSTATE_LEN bytes are hashed per nonce.
+#define DECRED_MIDSTATE_LEN 128
+
+// Placeholder: contains only commented-out 8-way blake prototypes.
+#if defined (__AVX2__) 
+//void blakehash_84way(void *state, const void *input);
+//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+//                         uint64_t *hashes_done );
+#endif
+
+// Enable the 4-way Decred code only when the build requests it (FOUR_WAY)
+// and the compiler targets AVX.
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define DECRED_4WAY
+#endif
+
+#if defined (DECRED_4WAY)
+void decred_hash_4way(void *state, const void *input);
+int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+#endif
+
+void decred_hash( void *state, const void *input );
+int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+#endif
+
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "decred-gate.h"
 #include "sph_blake.h"

 #include <string.h>
@@ -14,33 +14,33 @@
 #define max(a,b) (a<b ? b : a)
 #endif
 */
-
+/*
 #define DECRED_NBITS_INDEX 29
 #define DECRED_NTIME_INDEX 34
 #define DECRED_NONCE_INDEX 35
 #define DECRED_XNONCE_INDEX 36
 #define DECRED_DATA_SIZE 192
 #define DECRED_WORK_COMPARE_SIZE 140
-
+*/
 static __thread sph_blake256_context blake_mid;
 static __thread bool ctx_midstate_done = false;

 void decred_hash(void *state, const void *input)
 {
-        #define MIDSTATE_LEN 128
+//        #define MIDSTATE_LEN 128
        sph_blake256_context ctx __attribute__ ((aligned (64)));

        uint8_t *ending = (uint8_t*) input;
-        ending += MIDSTATE_LEN;
+        ending += DECRED_MIDSTATE_LEN;

        if (!ctx_midstate_done) {
                sph_blake256_init(&blake_mid);
-                sph_blake256(&blake_mid, input, MIDSTATE_LEN);
+                sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
                ctx_midstate_done = true;
        }
        memcpy(&ctx, &blake_mid, sizeof(blake_mid));

-        sph_blake256(&ctx, ending, (180 - MIDSTATE_LEN));
+        sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
        sph_blake256_close(&ctx, state);
 }

@@ -59,9 +59,9 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

-        #define DCR_NONCE_OFT32 35
+//        #define DCR_NONCE_OFT32 35

-        const uint32_t first_nonce = pdata[DCR_NONCE_OFT32];
+        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];

        uint32_t n = first_nonce;
@@ -81,7 +81,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t

        do {
                //be32enc(&endiandata[DCR_NONCE_OFT32], n);
-                endiandata[DCR_NONCE_OFT32] = n;
+                endiandata[DECRED_NONCE_INDEX] = n;
                decred_hash(hash32, endiandata);

                if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
@@ -92,7 +92,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
                        applog_hash(ptarget);
                        applog_compare_hash(hash32, ptarget);
 #endif
-                        pdata[DCR_NONCE_OFT32] = n;
+                        pdata[DECRED_NONCE_INDEX] = n;
                        return 1;
                }

@@ -101,10 +101,11 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
        } while (n < max_nonce && !work_restart[thr_id].restart);

        *hashes_done = n - first_nonce + 1;
-        pdata[DCR_NONCE_OFT32] = n;
+        pdata[DECRED_NONCE_INDEX] = n;
        return 0;
 }

+/*
 uint32_t *decred_get_nonceptr( uint32_t *work_data )
 {
   return &work_data[ DECRED_NONCE_INDEX ];
@@ -172,7 +173,7 @@ void decred_be_build_stratum_request( char *req, struct work *work,
         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
   free(xnonce2str);
 }
-
+*/
 /*
 // data shared between gen_merkle_root and build_extraheader.
 __thread uint32_t decred_extraheader[32] = { 0 };
@@ -188,7 +189,7 @@ void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
 }
 */

-
+/*
 #define min(a,b) (a>b ? (b) :(a))

 void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -282,4 +283,4 @@ bool register_decred_algo( algo_gate_t* gate )
  have_gbt                    = false;
  return true;
 }
-
+*/
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -0,0 +1,206 @@
+#include "pentablake-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake-hash-4way.h"
+#include "sph_blake.h"
+
+//#define DEBUG_ALGO
+
+#ifdef PENTABLAKE_4WAY
+
+extern void pentablakehash_4way( void *output, const void *input )
+{
+	unsigned char _ALIGN(32) hash[128];
+//	// same as uint32_t hashA[16], hashB[16];
+//	#define hashB hash+64
+
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake512_4way_context ctx;
+
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, input, 80 );
+     blake512_4way_close( &ctx, vhash );
+
+uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
+m256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
+sph_blake512_context ctx2_blake;
+sph_blake512_init(&ctx2_blake);
+sph_blake512(&ctx2_blake, sin0, 80);
+sph_blake512_close(&ctx2_blake, (void*) hash);
+
+m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+uint64_t* hash64 = (uint64_t*)hash;
+for( int i = 0; i < 8; i++ )
+{
+   if ( hash0[i] != hash64[i] )
+      printf("hash mismatch %u\n",i);
+}
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_close( &ctx, vhash );
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_close( &ctx, vhash );
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_close( &ctx, vhash );
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_close( &ctx, vhash );
+
+     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     memcpy( output,    hash0, 32 );
+     memcpy( output+32, hash1, 32 );
+     memcpy( output+64, hash2, 32 );
+     memcpy( output+96, hash3, 32 );
+
+/*
+     uint64_t sin0[10] __attribute__ ((aligned (64)));
+     uint64_t sin1[10] __attribute__ ((aligned (64)));
+     uint64_t sin2[10] __attribute__ ((aligned (64)));
+     uint64_t sin3[10] __attribute__ ((aligned (64)));
+
+	sph_blake512_context     ctx_blake;
+
+	sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, input, 80);
+	sph_blake512_close(&ctx_blake, hash);
+
+        sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, hash, 64);
+	sph_blake512_close(&ctx_blake, hash);
+
+        sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, hash, 64);
+	sph_blake512_close(&ctx_blake, hash);
+
+        sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, hash, 64);
+	sph_blake512_close(&ctx_blake, hash);
+
+        sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, hash, 64);
+	sph_blake512_close(&ctx_blake, hash);
+
+	memcpy(output, hash, 32);
+*/
+}
+
+// Scan nonces four at a time for a pentablake hash that meets the target.
+//
+// thr_id:      index into work_restart[] used to detect a restart request.
+// work:        supplies data/target; nonces[] and nfound[] receive the hit.
+// max_nonce:   scanning stops once n reaches this value.
+// hashes_done: set to n - first_nonce + 1 before every return.
+// Returns 1 as soon as any lane produces a valid hash, otherwise 0.
+//
+// Lane k hashes nonce n+k, so a hit in lane k is recorded as n+k. The
+// previous code stored n for every lane, reporting the wrong nonce for
+// lanes 1..3.
+int scanhash_pentablake_4way( int thr_id, struct work *work,
+                              uint32_t max_nonce, uint64_t *hashes_done )
+{
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[32] __attribute__ ((aligned (64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    // NOTE(review): scanning starts one below first_nonce and the nonce is
+    // not pre-incremented before hashing -- confirm this is intended.
+    uint32_t n = pdata[19] - 1;
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t Htarg = ptarget[7];
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    // Nonce slot of each lane: high half of 64-bit word 9, interleaved 4x64.
+    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+    uint32_t *noncep1 = vdata + 75;
+    uint32_t *noncep2 = vdata + 77;
+    uint32_t *noncep3 = vdata + 79;
+
+    // htmax[m] is the largest Htarg for which masks[m] is used as the quick
+    // reject mask on the top hash word.
+    uint64_t htmax[] = {
+        0,
+        0xF,
+        0xFF,
+        0xFFF,
+        0xFFFF,
+        0x10000000
+    };
+    uint32_t masks[] = {
+        0xFFFFFFFF,
+        0xFFFFFFF0,
+        0xFFFFFF00,
+        0xFFFFF000,
+        0xFFFF0000,
+        0
+    };
+
+    // we need bigendian data...
+    swab32_array( endiandata, pdata, 20 );
+
+    uint64_t *edata = (uint64_t*)endiandata;
+    m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+    for ( int m=0; m < 6; m++ )
+    {
+        if ( Htarg <= htmax[m] )
+        {
+           uint32_t mask = masks[m];
+           do {
+              found[0] = found[1] = found[2] = found[3] = false;
+              be32enc( noncep0, n   );
+              be32enc( noncep1, n+1 );
+              be32enc( noncep2, n+2 );
+              be32enc( noncep3, n+3 );
+
+              pentablakehash_4way( hash, vdata );
+
+              // return immediately on nonce found, only one submit
+              for ( int lane = 0; lane < 4; lane++ )
+              {
+                  uint32_t *lane_hash = hash + 8*lane;
+                  if ( ( !(lane_hash[7] & mask) )
+                       && fulltest( lane_hash, ptarget ) )
+                  {
+                      found[lane] = true;
+                      nonces[lane] = n + lane;
+                      // as before, only a lane 0 hit updates the work data
+                      if ( lane == 0 )
+                          pdata[19] = n;
+                      *hashes_done = n - first_nonce + 1;
+                      return 1;
+                  }
+              }
+              n += 4;
+
+           } while (n < max_nonce && !work_restart[thr_id].restart);
+           break;
+        }
+    }
+
+    *hashes_done = n - first_nonce + 1;
+    pdata[19] = n;
+    return 0;
+} 
+
+#endif
--- a/algo/blake/pentablake-gate.c
+++ b/algo/blake/pentablake-gate.c
@@ -0,0 +1,16 @@
+#include "pentablake-gate.h"
+// Install the pentablake scanhash/hash/get_max64 entry points into the gate.
+bool register_pentablake_algo( algo_gate_t* gate )
+{
+#if defined (PENTABLAKE_4WAY)    // 4-way build: vectorized scanhash/hash pair
+    gate->optimizations = SSE2_OPT | AVX2_OPT;
+    gate->scanhash  = (void*)&scanhash_pentablake_4way;
+    gate->hash      = (void*)&pentablakehash_4way;
+#else                            // scalar fallback
+    gate->scanhash  = (void*)&scanhash_pentablake;
+    gate->hash      = (void*)&pentablakehash;
+#endif
+    gate->get_max64 = (void*)&get_max64_0x3ffff;
+    return true;                 // registration cannot fail
+}
+
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -0,0 +1,21 @@
+#ifndef __PENTABLAKE_GATE_H__
+#define __PENTABLAKE_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define PENTABLAKE_4WAY
+#endif
+
+#if defined(PENTABLAKE_4WAY)
+void pentablakehash_4way( void *state, const void *input );
+int scanhash_pentablake_4way( int thr_id, struct work *work,
+                              uint32_t max_nonce, uint64_t *hashes_done );
+#endif
+
+void pentablakehash( void *state, const void *input );
+int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "pentablake-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -110,11 +110,3 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 } 

-bool register_pentablake_algo( algo_gate_t* gate )
-{
-    gate->scanhash  = (void*)&scanhash_pentablake;
-    gate->hash      = (void*)&pentablakehash;
-    gate->get_max64 = (void*)&get_max64_0x3ffff;
-    return true;
-};
-
--- a/algo/blake/sph_blake.c
+++ b/algo/blake/sph_blake.c
@@ -983,9 +983,11 @@ blake64_close(sph_blake_big_context *sc,
 			u.buf[111] |= 1;
 		sph_enc64be_aligned(u.buf + 112, th);
 		sph_enc64be_aligned(u.buf + 120, tl);
+
 		blake64(sc, u.buf + ptr, 128 - ptr);
 	} else {
 		memset(u.buf + ptr + 1, 0, 127 - ptr);
+
 		blake64(sc, u.buf + ptr, 128 - ptr);
 		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
 		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
@@ -994,6 +996,7 @@ blake64_close(sph_blake_big_context *sc,
 			u.buf[111] = 1;
 		sph_enc64be_aligned(u.buf + 112, th);
 		sph_enc64be_aligned(u.buf + 120, tl);
+
 		blake64(sc, u.buf, 128);
 	}
 	out = dst;
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -0,0 +1,639 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef __AVX2__
+
+#include <stddef.h>
+#include <string.h>
+
+#include "jh-hash-4way.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH   1
+#endif
+
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64   1
+#endif
+
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness; we use some
+ * byte-swapping macros for that.
+ */
+
+#if SPH_LITTLE_ENDIAN
+
+#if SPH_64
+#define C64e(x)     ((SPH_C64(x) >> 56) \
+                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
+                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
+                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
+                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
+                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
+                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
+                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
+#define dec64e_aligned   sph_dec64le_aligned
+#define enc64e           sph_enc64le
+#endif
+
+#else
+
+#if SPH_64
+#define C64e(x)     SPH_C64(x)
+#define dec64e_aligned   sph_dec64be_aligned
+#define enc64e           sph_enc64be
+#endif
+
+#endif
+/* Sb: bit-sliced S-box step on four state vectors (AVX2 form of the scalar macro kept in the comment below); c is a 64-bit constant broadcast to all lanes; needs `tmp` in scope. */
+#define Sb(x0, x1, x2, x3, c) \
+do { \
+   __m256i cc = _mm256_set_epi64x( c, c, c, c ); \
+    x3 = mm256_bitnot( x3 ); \
+    x0 = _mm256_xor_si256( x0, _mm256_and_si256( cc, mm256_bitnot( x2 ) ) ); \
+    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_and_si256( mm256_bitnot( x1 ), x2 ) ); \
+    x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
+    x2 = _mm256_xor_si256( x2, _mm256_and_si256( x0, mm256_bitnot( x3 ) ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
+    x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
+    x2 = _mm256_xor_si256( x2, tmp ); \
+} while (0)
+
+/*
+#define Sb(x0, x1, x2, x3, c)   do { \
+		x3 = ~x3; \
+		x0 ^= (c) & ~x2; \
+		tmp = (c) ^ (x0 & x1); \
+		x0 ^= x2 & x3; \
+		x3 ^= ~x1 & x2; \
+		x1 ^= x0 & x2; \
+		x2 ^= x0 & ~x3; \
+		x0 ^= x1 | x3; \
+		x3 ^= x1 & x2; \
+		x1 ^= tmp & x0; \
+		x2 ^= tmp; \
+	} while (0)
+*/
+/* Lb: XOR-only mixing of eight state vectors (AVX2 form of the scalar macro kept in the comment below); x4..x7 are updated first, then x0..x3 from the new values. */
+#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
+do { \
+    x4 = _mm256_xor_si256( x4, x1 ); \
+    x5 = _mm256_xor_si256( x5, x2 ); \
+    x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
+    x7 = _mm256_xor_si256( x7, x0 ); \
+    x0 = _mm256_xor_si256( x0, x5 ); \
+    x1 = _mm256_xor_si256( x1, x6 ); \
+    x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
+    x3 = _mm256_xor_si256( x3, x4 ); \
+} while (0)
+
+
+/*
+#define Lb(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		x4 ^= x1; \
+		x5 ^= x2; \
+		x6 ^= x3 ^ x0; \
+		x7 ^= x0; \
+		x0 ^= x5; \
+		x1 ^= x6; \
+		x2 ^= x7 ^ x4; \
+		x3 ^= x4; \
+	} while (0)
+*/
+
+#if SPH_JH_64
+
+static const sph_u64 C[] = {
+	C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
+	C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
+	C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a),
+	C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231),
+	C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410),
+	C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc),
+	C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0),
+	C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3),
+	C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce),
+	C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23),
+	C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8),
+	C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197),
+	C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95),
+	C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214),
+	C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80),
+	C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4),
+	C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989),
+	C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36),
+	C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7),
+	C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f),
+	C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727),
+	C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b),
+	C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e),
+	C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062),
+	C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984),
+	C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5),
+	C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2),
+	C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f),
+	C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465),
+	C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a),
+	C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1),
+	C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf),
+	C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48),
+	C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0),
+	C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134),
+	C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a),
+	C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff),
+	C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6),
+	C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae),
+	C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567),
+	C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a),
+	C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518),
+	C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446),
+	C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e),
+	C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee),
+	C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001),
+	C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779),
+	C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83),
+	C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a),
+	C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef),
+	C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d),
+	C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65),
+	C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a),
+	C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c),
+	C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d),
+	C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71),
+	C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc),
+	C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0),
+	C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c),
+	C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f),
+	C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751),
+	C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad),
+	C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56),
+	C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6),
+	C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a),
+	C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163),
+	C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826),
+	C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f),
+	C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30),
+	C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a),
+	C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3),
+	C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505),
+	C64e(0xb17681d913326cce), C64e(0x3c175284f805a262),
+	C64e(0xf42bcbb378471547), C64e(0xff46548223936a48),
+	C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e),
+	C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e),
+	C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd),
+	C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7),
+	C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be),
+	C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de),
+	C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9),
+	C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a),
+	C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
+	C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
+};
+
+#define Ceven_hi(r)   (C[((r) << 2) + 0])
+#define Ceven_lo(r)   (C[((r) << 2) + 1])
+#define Codd_hi(r)    (C[((r) << 2) + 2])
+#define Codd_lo(r)    (C[((r) << 2) + 3])
+
+#define S(x0, x1, x2, x3, cb, r)   do { \
+		Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
+		Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
+	} while (0)
+
+#define L(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
+			x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
+		Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
+			x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
+	} while (0)
+
+/* Wz: in x##h and x##l, swap adjacent n-bit groups of every 64-bit element; c masks the low group of each pair. Both halves use the logical-shift intrinsic. */
+#define Wz(x, c, n) \
+do { \
+   __m256i t = _mm256_slli_epi64( _mm256_and_si256(x ## h, (c)), (n) ); \
+   x ## h = _mm256_or_si256( _mm256_and_si256( \
+                                _mm256_srli_epi64(x ## h, (n)), (c)), t ); \
+   t = _mm256_slli_epi64( _mm256_and_si256(x ## l, (c)), (n) ); \
+   x ## l = _mm256_or_si256( _mm256_and_si256( _mm256_srli_epi64(x ## l, (n)), (c)), t ); \
+} while (0)
+
+
+/*
+#define Wz(x, c, n)   do { \
+		sph_u64 t = (x ## h & (c)) << (n); \
+		x ## h = ((x ## h >> (n)) & (c)) | t; \
+		t = (x ## l & (c)) << (n); \
+		x ## l = ((x ## l >> (n)) & (c)) | t; \
+	} while (0)
+*/
+
+#define W0(x)   Wz(x, _mm256_set_epi64x( 0x5555555555555555, \
+       0x5555555555555555, 0x5555555555555555, 0x5555555555555555 ), 1 )
+#define W1(x)   Wz(x, _mm256_set_epi64x( 0x3333333333333333, \
+       0x3333333333333333, 0x3333333333333333, 0x3333333333333333 ), 2 )
+#define W2(x)   Wz(x, _mm256_set_epi64x( 0x0F0F0F0F0F0F0F0F, \
+       0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ), 4 )
+#define W3(x)   Wz(x, _mm256_set_epi64x( 0x00FF00FF00FF00FF, \
+       0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ), 8 ) 
+#define W4(x)   Wz(x, _mm256_set_epi64x( 0x0000FFFF0000FFFF, \
+       0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ), 16 )
+#define W5(x)   Wz(x, _mm256_set_epi64x( 0x00000000FFFFFFFF, \
+       0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF ), 32 )
+#define W6(x) \
+do { \
+   __m256i t = x ## h; \
+   x ## h = x ## l; \
+   x ## l = t; \
+} while (0)
+
+/*
+#define W0(x)   Wz(x, SPH_C64(0x5555555555555555),  1)
+#define W1(x)   Wz(x, SPH_C64(0x3333333333333333),  2)
+#define W2(x)   Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F),  4)
+#define W3(x)   Wz(x, SPH_C64(0x00FF00FF00FF00FF),  8)
+#define W4(x)   Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
+#define W5(x)   Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
+#define W6(x)   do { \
+		sph_u64 t = x ## h; \
+		x ## h = x ## l; \
+		x ## l = t; \
+	} while (0)
+*/
+
+#define DECL_STATE \
+	__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
+	__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
+	__m256i tmp;
+
+#define READ_STATE(state)   do { \
+		h0h = (state)->H[ 0]; \
+		h0l = (state)->H[ 1]; \
+		h1h = (state)->H[ 2]; \
+		h1l = (state)->H[ 3]; \
+		h2h = (state)->H[ 4]; \
+		h2l = (state)->H[ 5]; \
+		h3h = (state)->H[ 6]; \
+		h3l = (state)->H[ 7]; \
+		h4h = (state)->H[ 8]; \
+		h4l = (state)->H[ 9]; \
+		h5h = (state)->H[10]; \
+		h5l = (state)->H[11]; \
+		h6h = (state)->H[12]; \
+		h6l = (state)->H[13]; \
+		h7h = (state)->H[14]; \
+		h7l = (state)->H[15]; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { \
+		(state)->H[ 0] = h0h; \
+		(state)->H[ 1] = h0l; \
+		(state)->H[ 2] = h1h; \
+		(state)->H[ 3] = h1l; \
+		(state)->H[ 4] = h2h; \
+		(state)->H[ 5] = h2l; \
+		(state)->H[ 6] = h3h; \
+		(state)->H[ 7] = h3l; \
+		(state)->H[ 8] = h4h; \
+		(state)->H[ 9] = h4l; \
+		(state)->H[10] = h5h; \
+		(state)->H[11] = h5l; \
+		(state)->H[12] = h6h; \
+		(state)->H[13] = h6l; \
+		(state)->H[14] = h7h; \
+		(state)->H[15] = h7l; \
+	} while (0)
+/* INPUT_BUF1: load the 8 buffered block vectors into locals m0h..m3l and XOR them into h0..h3. Declares variables, so use once per scope. */
+#define INPUT_BUF1 \
+	__m256i m0h = buf[0]; \
+	__m256i m0l = buf[1]; \
+	__m256i m1h = buf[2]; \
+	__m256i m1l = buf[3]; \
+	__m256i m2h = buf[4]; \
+	__m256i m2l = buf[5]; \
+	__m256i m3h = buf[6]; \
+	__m256i m3l = buf[7]; \
+        h0h = _mm256_xor_si256( h0h, m0h ); \
+        h0l = _mm256_xor_si256( h0l, m0l ); \
+        h1h = _mm256_xor_si256( h1h, m1h ); \
+        h1l = _mm256_xor_si256( h1l, m1l ); \
+        h2h = _mm256_xor_si256( h2h, m2h ); \
+        h2l = _mm256_xor_si256( h2l, m2l ); \
+        h3h = _mm256_xor_si256( h3h, m3h ); \
+        h3l = _mm256_xor_si256( h3l, m3l );
+
+#define INPUT_BUF2 /* XOR m0h..m3l (declared by INPUT_BUF1 in the same scope) into h4..h7 */ \
+   h4h = _mm256_xor_si256( h4h, m0h ); \
+   h4l = _mm256_xor_si256( h4l, m0l ); \
+   h5h = _mm256_xor_si256( h5h, m1h ); \
+   h5l = _mm256_xor_si256( h5l, m1l ); \
+   h6h = _mm256_xor_si256( h6h, m2h ); \
+   h6l = _mm256_xor_si256( h6l, m2l ); \
+   h7h = _mm256_xor_si256( h7h, m3h ); \
+   h7l = _mm256_xor_si256( h7l, m3l );
+
+static const sph_u64 IV256[] = {
+	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
+	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
+	C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477),
+	C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8),
+	C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262),
+	C64e(0x277695f776248f94), C64e(0x87d5b6574780296c),
+	C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f),
+	C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769)
+};
+
+
+static const sph_u64 IV512[] = {
+	C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543),
+	C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361),
+	C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80),
+	C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7),
+	C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a),
+	C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199),
+	C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
+	C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
+};
+
+#else
+
+
+#endif
+
+#define SL(ro)   SLu(r + ro, ro)
+
+#define SLu(r, ro)   do { \
+		S(h0, h2, h4, h6, Ceven_, r); \
+		S(h1, h3, h5, h7, Codd_, r); \
+		L(h0, h2, h4, h6, h1, h3, h5, h7); \
+		W ## ro(h1); \
+		W ## ro(h3); \
+		W ## ro(h5); \
+		W ## ro(h7); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_JH
+
+#if SPH_JH_64
+
+/*
+ * The "small footprint" 64-bit version just uses a partially unrolled
+ * loop.
+ */
+
+#define E8   do { \
+		unsigned r; \
+		for (r = 0; r < 42; r += 7) { \
+			SL(0); \
+			SL(1); \
+			SL(2); \
+			SL(3); \
+			SL(4); \
+			SL(5); \
+			SL(6); \
+		} \
+	} while (0)
+
+#else
+
+
+#endif
+
+#else
+
+#if SPH_JH_64
+
+/*
+ * On a "true 64-bit" architecture, we can unroll at will.
+ */
+
+#define E8   do { \
+		SLu( 0, 0); \
+		SLu( 1, 1); \
+		SLu( 2, 2); \
+		SLu( 3, 3); \
+		SLu( 4, 4); \
+		SLu( 5, 5); \
+		SLu( 6, 6); \
+		SLu( 7, 0); \
+		SLu( 8, 1); \
+		SLu( 9, 2); \
+		SLu(10, 3); \
+		SLu(11, 4); \
+		SLu(12, 5); \
+		SLu(13, 6); \
+		SLu(14, 0); \
+		SLu(15, 1); \
+		SLu(16, 2); \
+		SLu(17, 3); \
+		SLu(18, 4); \
+		SLu(19, 5); \
+		SLu(20, 6); \
+		SLu(21, 0); \
+		SLu(22, 1); \
+		SLu(23, 2); \
+		SLu(24, 3); \
+		SLu(25, 4); \
+		SLu(26, 5); \
+		SLu(27, 6); \
+		SLu(28, 0); \
+		SLu(29, 1); \
+		SLu(30, 2); \
+		SLu(31, 3); \
+		SLu(32, 4); \
+		SLu(33, 5); \
+		SLu(34, 6); \
+		SLu(35, 0); \
+		SLu(36, 1); \
+		SLu(37, 2); \
+		SLu(38, 3); \
+		SLu(39, 4); \
+		SLu(40, 5); \
+		SLu(41, 6); \
+	} while (0)
+
+#else
+
+
+#endif
+
+#endif
+
+static void
+jh_4way_init( jh_4way_context *sc, const void *iv )
+{
+    const uint64_t *ivw = (const uint64_t*)iv;   // 16 initial 64-bit state words
+    // start with an empty block buffer and a zero block counter
+    sc->block_count = 0;
+    sc->ptr = 0;
+    for ( int w = 0; w < 16; w++ )   // copy each IV word into all 4 lanes
+        sc->H[w] = _mm256_set_epi64x( ivw[w], ivw[w], ivw[w], ivw[w] );
+}
+
+static void
+jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
+{
+    __m256i *buf;
+    __m256i *vdata = (__m256i*)data;
+   const int buf_size = 64;   // block size in bytes per lane (sc->buf holds 8 __m256i)
+   size_t ptr;
+   DECL_STATE
+   // Absorb len bytes per lane of input; each __m256i carries 8 bytes for each of 4 lanes.
+   buf = sc->buf;
+   ptr = sc->ptr;
+   // NOTE(review): ptr and len are shifted by 3 below, so multiples of 8 are assumed - confirm.
+   if ( len < (buf_size - ptr) )
+   {
+       // not enough to complete a block: buffer it and return without touching H
+       memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
+       ptr += len;
+       sc->ptr = ptr;
+       return;
+   }
+
+   READ_STATE(sc);
+   while ( len > 0 )
+   {
+       size_t clen;
+       clen = buf_size - ptr;       // room left in the current block
+       if ( clen > len )
+          clen = len;
+
+       memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
+       ptr += clen;
+       vdata += (clen>>3);
+       len -= clen;
+       if ( ptr == buf_size )
+       {
+          // full block: XOR into h0..h3, run E8, XOR the same block into h4..h7
+          INPUT_BUF1;
+          E8;
+          INPUT_BUF2;
+          sc->block_count ++;
+          ptr = 0;
+       }
+   }
+   WRITE_STATE(sc);
+   sc->ptr = ptr;
+}
+
+static void
+jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
+               size_t out_size_w32, const void *iv )
+{
+   __m256i buf[16*4];
+   __m256i *dst256 = (__m256i*)dst;
+   size_t numz, out_vecs;
+   sph_u64 l0, l1, l0e, l1e;
+   // Pad, absorb the padding, then write the digest of each lane; ub, n and iv are unused.
+   buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 );   // padding marker byte
+   // numz = per-lane byte offset of the 128-bit length field inside the padding
+   if ( sc->ptr == 0 )
+       numz = 48;
+   else
+       numz = 112 - sc->ptr;
+
+   memset_zero_m256i( buf+1, (numz>>3) - 1 );
+   // message length in bits (blocks*512 + buffered bytes*8), stored big-endian
+   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
+   l1 = SPH_T64(sc->block_count >> 55);
+   sph_enc64be( &l0e, l0 );
+   sph_enc64be( &l1e, l1 );
+   *(buf + (numz>>3)    ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e );
+   *(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e );
+
+   jh_4way_core( sc, buf, numz + 16 );
+
+   // The digest is the tail of the state: out_size_w32 32-bit words per lane,
+   // i.e. out_size_w32/2 vectors ending at H[15] (H[8..15] for 512, H[12..15] for 256).
+   out_vecs = out_size_w32 >> 1;
+   memcpy_m256i( dst256, sc->H + 16 - out_vecs, out_vecs );
+}
+/* Initialize the context cc from IV256. */
+void
+jh256_4way_init(void *cc)
+{
+	jh_4way_init(cc, IV256);
+}
+/* Absorb len bytes (per lane) of data into cc; forwards to jh_4way_core. */
+void
+jh256_4way(void *cc, const void *data, size_t len)
+{
+	jh_4way_core(cc, data, len);
+}
+/* Finalize cc into dst via jh_4way_close, requesting 8 32-bit words (256 bits) per lane. */
+void
+jh256_4way_close(void *cc, void *dst)
+{
+	jh_4way_close(cc, 0, 0, dst, 8, IV256);
+}
+/* Initialize the context cc from IV512. */
+void
+jh512_4way_init(void *cc)
+{
+	jh_4way_init(cc, IV512);
+}
+/* Absorb len bytes (per lane) of data into cc; forwards to jh_4way_core. */
+void
+jh512_4way(void *cc, const void *data, size_t len)
+{
+	jh_4way_core(cc, data, len);
+}
+/* Finalize cc into dst via jh_4way_close, requesting 16 32-bit words (512 bits) per lane. */
+void
+jh512_4way_close(void *cc, void *dst)
+{
+	jh_4way_close(cc, 0, 0, dst, 16, IV512);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -0,0 +1,100 @@
+/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * 4-way (AVX2) JH interface. JH is a family of functions which differ by
+ * their output size; this implementation defines JH for output
+ * sizes 256 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     jh-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef JH_HASH_4WAY_H__
+#define JH_HASH_4WAY_H__
+
+#ifdef __AVX2__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_jh256   256
+
+#define SPH_SIZE_jh512   512
+
+/**
+ * This structure is a context for JH computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a JH computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running JH computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+    __m256i buf[8] __attribute__ ((aligned (64)));   // pending input block: 8 vectors = 64 bytes per lane
+    __m256i H[16];                                   // state: 16 words, one 64-bit word per lane in each vector
+    size_t ptr;                                      // bytes per lane currently buffered in buf
+    uint64_t block_count;                            // number of full blocks processed so far
+/* Scalar (sph_jh) layout, kept for reference:
+	unsigned char buf[64];
+	size_t ptr;
+	union {
+		sph_u64 wide[16];
+	} H;
+	sph_u64 block_count;
+*/
+} jh_4way_context;
+
+typedef jh_4way_context jh256_4way_context;
+
+typedef jh_4way_context jh512_4way_context;
+
+void jh256_4way_init(void *cc);
+
+void jh256_4way(void *cc, const void *data, size_t len);
+
+void jh256_4way_close(void *cc, void *dst);
+
+void jh512_4way_init(void *cc);
+
+void jh512_4way(void *cc, const void *data, size_t len);
+
+void jh512_4way_close(void *cc, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -0,0 +1,228 @@
+#if defined(JHA_4WAY)
+
+#include "jha-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "avxdefs.h"
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+//static __thread keccak512_4way_context jha_kec_mid
+//                                   __attribute__ ((aligned (64)));
+// 4-way JHA: hashes four interleaved 80-byte inputs and writes 32 bytes per lane to output (lane k at output + 32*k).
+void jha_hash_4way( void *output, const void *input )
+{
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhasha[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashb[8*4] __attribute__ ((aligned (64)));
+    __m256i mask;
+    __m256i* vh256 = (__m256i*)vhash;
+    __m256i* vha256 = (__m256i*)vhasha;
+    __m256i* vhb256 = (__m256i*)vhashb;
+
+    blake512_4way_context  ctx_blake;
+    hashState_groestl      ctx_groestl;
+    jh512_4way_context     ctx_jh;
+    skein512_4way_context  ctx_skein;
+    keccak512_4way_context ctx_keccak;
+
+    keccak512_4way_init( &ctx_keccak );
+    keccak512_4way( &ctx_keccak, input, 80 );
+    keccak512_4way_close( &ctx_keccak, vhash );
+
+//    memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
+//    keccak512_4way( &ctx_keccak, input+64, 16 );
+//    keccak512_4way_close( &ctx_keccak, vhash );
+
+    // Heavy & Light Pair Loop
+    for ( int round = 0; round < 3; round++ )
+    {
+       memset_zero_m256i( vha256, 8 );   // vhasha holds exactly 8 vectors
+       memset_zero_m256i( vhb256, 8 );   // vhashb holds exactly 8 vectors
+       // per lane: mask is all ones when bit 0 of the first hash word is clear, else zero
+       mask = _mm256_sub_epi64( _mm256_and_si256( vh256[0],
+                        mm256_vec_epi64( 0x1 ) ), mm256_vec_epi64( 0x1 ) );
+
+       // groestl (serial) v skein
+
+       m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+       init_groestl( &ctx_groestl, 64 );
+       update_and_final_groestl( &ctx_groestl, (char*)hash0,
+                     (char*)hash0, 512 );
+
+       init_groestl( &ctx_groestl, 64 );
+       update_and_final_groestl( &ctx_groestl, (char*)hash1,
+                                          (char*)hash1, 512 );
+
+       init_groestl( &ctx_groestl, 64 );
+       update_and_final_groestl( &ctx_groestl, (char*)hash2,
+                                          (char*)hash2, 512 );
+       init_groestl( &ctx_groestl, 64 );
+       update_and_final_groestl( &ctx_groestl, (char*)hash3,
+                                          (char*)hash3, 512 );
+
+       m256_interleave_4x64( vhasha, hash0, hash1, hash2, hash3, 512 );
+
+       // skein
+
+       skein512_4way_init( &ctx_skein );
+       skein512_4way( &ctx_skein, vhash, 64 );
+       skein512_4way_close( &ctx_skein, vhashb );
+
+       // merge vectored hash: lanes with mask clear take vhasha, lanes with mask set take vhashb
+       for ( int i = 0; i < 8; i++ )
+       {
+          vha256[i] = _mm256_maskload_epi64(
+                          (const long long*)( vhasha + i*4 ), mm256_bitnot(mask ) );
+          vhb256[i] = _mm256_maskload_epi64(
+                          (const long long*)( vhashb + i*4 ), mask );
+          vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
+       }
+
+       // blake v jh   NOTE(review): reuses the mask computed before groestl/skein - confirm against scalar jha_hash
+
+       blake512_4way_init( &ctx_blake );
+       blake512_4way( &ctx_blake, vhash, 64 );
+       blake512_4way_close( &ctx_blake, vhasha );
+
+       jh512_4way_init( &ctx_jh );
+       jh512_4way( &ctx_jh, vhash, 64 );
+       jh512_4way_close( &ctx_jh, vhashb );
+
+       // merge vectored hash
+       for ( int i = 0; i < 8; i++ )
+       {
+          vha256[i] = _mm256_maskload_epi64(
+                          (const long long*)( vhasha + i*4 ), mm256_bitnot(mask ) );
+          vhb256[i] = _mm256_maskload_epi64(
+                          (const long long*)( vhashb + i*4 ), mask );
+          vh256[i]  = _mm256_or_si256( vha256[i], vhb256[i] );
+       }
+    }
+
+    m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+    memcpy( output,       hash0, 32 );
+    memcpy( output+32,    hash1, 32 );
+    memcpy( output+64,    hash2, 32 );
+    memcpy( output+96,    hash3, 32 );
+
+}
+// Scan nonces four at a time starting at pdata[19]; returns how many of the four lanes met the target in the last batch.
+int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+	uint32_t n = pdata[19];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 73;   // lane 0 nonce slot in the interleaved data (9*8 + 1)
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+	// htmax[m] pairs with masks[m]: the first m with Htarg <= htmax[m] selects the pre-filter mask
+	uint64_t htmax[] = {
+		0,
+		0xF,
+		0xFF,
+		0xFFF,
+		0xFFFF,
+		0x10000000
+	};
+	uint32_t masks[] = {
+		0xFFFFFFFF,
+		0xFFFFFFF0,
+		0xFFFFFF00,
+		0xFFFFF000,
+		0xFFFF0000,
+		0
+	};
+
+   // we need bigendian data... (word 19, the nonce, is written per lane inside the loop)
+   for ( int i=0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   // precalc midstate for keccak
+//   keccak512_4way_init( &jha_kec_mid );
+//   keccak512_4way( &jha_kec_mid, vdata, 64 );
+
+   for ( int m = 0; m < 6; m++ )
+   {
+      if ( Htarg <= htmax[m] )
+      {
+         uint32_t mask = masks[m];
+         do {
+              found[0] = found[1] = found[2] = found[3] = false;
+              be32enc( noncep0, n   );
+              be32enc( noncep1, n+1 );
+              be32enc( noncep2, n+2 );
+              be32enc( noncep3, n+3 );
+
+              jha_hash_4way( hash, vdata );
+
+              pdata[19] = n;   // base nonce of the batch just hashed
+
+              if ( ( !(hash[7] & mask) )
+                   && fulltest( hash, ptarget ) )
+              {
+                 found[0] = true;
+                 num_found++;
+                 nonces[0] = n;
+                 work_set_target_ratio( work, hash );
+              }
+              if ( ( !((hash+8)[7] & mask) )
+                   && fulltest( hash+8, ptarget ) )
+              {
+                 found[1] = true;
+                 num_found++;
+                 nonces[1] = n+1;
+                 work_set_target_ratio( work, hash+8 );
+              }
+              if ( ( !((hash+16)[7] & mask) )
+                 && fulltest( hash+16, ptarget ) )
+              {
+                 found[2] = true;
+                 num_found++;
+                 nonces[2] = n+2;
+                 work_set_target_ratio( work, hash+16 );
+              }
+              if ( ( !((hash+24)[7] & mask) )
+                   && fulltest( hash+24, ptarget ) )
+              {
+                 found[3] = true;
+                 num_found++;
+                 nonces[3] = n+3;
+                 work_set_target_ratio( work, hash+24 );
+              }
+              n += 4;
+         } while ( ( num_found == 0 ) && ( n < max_nonce )
+                     && !work_restart[thr_id].restart );
+         // stop after the first batch with a hit, at max_nonce, or on a work restart
+         break;
+      }
+   }
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+#endif
--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -0,0 +1,18 @@
+#include "jha-gate.h"
+
+// Register the scalar JHA functions in the algo gate (4-way path is disabled); returns true.
+bool register_jha_algo( algo_gate_t* gate )
+{
+//#if defined (JHA_4WAY)
+//  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+//  gate->scanhash         = (void*)&scanhash_jha_4way;
+//  gate->hash             = (void*)&jha_hash_4way;
+//#else
+  gate->optimizations = SSE2_OPT | AES_OPT;
+  gate->scanhash         = (void*)&scanhash_jha;
+  gate->hash             = (void*)&jha_hash;
+//#endif
+  gate->set_target       = (void*)&scrypt_set_target;
+  return true;
+}
+
--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -0,0 +1,27 @@
+#ifndef JHA_GATE_H__
+#define JHA_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// JHA_4WAY is defined only when FOUR_WAY and __AVX2__ are defined and NO_AES_NI is not.
+#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+  #define JHA_4WAY
+#endif
+
+//#if defined JHA_4WAY
+//void jha_hash_4way( void *state, const void *input );
+
+//int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+//                       uint64_t *hashes_done );
+//#else
+// Scalar (non-4way) JHA entry points; the 4-way prototypes above are commented out.
+void jha_hash( void *state, const void *input );
+
+int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+//#endif
+
+#endif
+
--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "jha-gate.h"

 #include <stdlib.h>
 #include <stdint.h>
@@ -38,7 +38,6 @@ void jha_hash(void *output, const void *input)
 	sph_keccak512_context ctx_keccak;
 	sph_skein512_context ctx_skein;

-	sph_keccak512_init(&ctx_keccak);
        memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
        sph_keccak512(&ctx_keccak, input+64, 16 );
 	sph_keccak512_close(&ctx_keccak, hash );
@@ -154,12 +153,3 @@ int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *ha
 	return 0;
 }

-bool register_jha_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->scanhash         = (void*)&scanhash_jha;
-  gate->hash             = (void*)&jha_hash;
-  gate->set_target       = (void*)&scrypt_set_target;
-  return true;
-};
-
--- a/algo/jh/sph_jh.c
+++ b/algo/jh/sph_jh.c
@@ -914,6 +914,7 @@ jh_core(sph_jh_context *sc, const void *data, size_t len)

 	buf = sc->buf;
 	ptr = sc->ptr;
+
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -6,7 +6,7 @@
 #include "sph_keccak.h"
 #include "keccak-hash-4way.h"

-#ifdef __AVX2__
+#ifdef KECCAK_4WAY

 void keccakhash_4way(void *state, const void *input)
 {
@@ -21,7 +21,7 @@ void keccakhash_4way(void *state, const void *input)
     keccak256_4way( &ctx, input, 80 );
     keccak256_4way_close( &ctx, vhash );

-     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );

     memcpy( state,    hash0, 32 );
     memcpy( state+32, hash1, 32 );
@@ -33,16 +33,16 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done)
 {
   uint32_t hash[4*8] __attribute__ ((aligned (64)));
-   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
   const uint32_t first_nonce = pdata[19];
-   const uint32_t Htarg = ptarget[7];
+//   const uint32_t Htarg = ptarget[7];
   uint32_t endiandata[20];
   uint32_t *nonces = work->nonces;
   bool *found = work->nfound;
-   int num_found;
+   int num_found = 0;
   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
   uint32_t *noncep1 = vdata + 75;
   uint32_t *noncep2 = vdata + 77;
@@ -52,11 +52,10 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
      be32enc( &endiandata[i], pdata[i] );

   uint64_t *edata = (uint64_t*)endiandata;
-   m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+   m256_interleave_4x64x( (uint64_t*)vdata, edata, edata, edata, edata, 640 );

   do {
      found[0] = found[1] = found[2] = found[3] = false;
-      num_found = 0;
      be32enc( noncep0, n   );
      be32enc( noncep1, n+1 );
      be32enc( noncep2, n+2 );
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -12,7 +12,7 @@ bool register_keccak_algo( algo_gate_t* gate )
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  gate->set_target      = (void*)&keccak_set_target;
  gate->get_max64       = (void*)&keccak_get_max64;
-#if defined (FOUR_WAY) && defined (__AVX2__)
+#if defined (KECCAK_4WAY)
  gate->optimizations = SSE2_OPT | AVX2_OPT;
  gate->scanhash  = (void*)&scanhash_keccak_4way;
  gate->hash      = (void*)&keccakhash_4way;
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -4,7 +4,11 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
+#if defined(FOUR_WAY) && defined(__AVX2__)
+  #define KECCAK_4WAY
+#endif
+
+#if defined(KECCAK_4WAY)

 void keccakhash_4way( void *state, const void *input );
 int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -372,7 +372,7 @@ static const sph_u64 RC[] = {
 } while (0)


-static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
+static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
 {
   int i;
   for (i = 0; i < 25; i ++)
@@ -386,7 +386,7 @@ static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
   kc->w[17] = mm256_neg1;
   kc->w[20] = mm256_neg1;
   kc->ptr = 0;
-   kc->lim = 200 - (lim >> 2);
+   kc->lim = 200 - (out_size >> 2);
 }

 static void
@@ -396,6 +396,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
    __m256i *buf;
    __m256i *vdata = (__m256i*)data;
    size_t ptr;
+    DECL_STATE

    buf = kc->buf;
    ptr = kc->ptr;
@@ -407,6 +408,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
        return;
    }

+    READ_STATE( kc );
    while ( len > 0 )
    {
        size_t clen;
@@ -425,6 +427,7 @@ keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
            ptr = 0;
        }
    }
+    WRITE_STATE( kc );
    kc->ptr = ptr;
 }

@@ -440,12 +443,11 @@ static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
    size_t m256_len = byte_len >> 3;

    eb = 0x100  >> 8;
-
-    if ( kc->ptr == (lim - 1) )
+    if ( kc->ptr == (lim - 8) )
    {
-        uint64_t t = eb | 0x80;
+        uint64_t t = eb | 0x8000000000000000;
        u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
-        j = 1;
+        j = 8;
    }
    else
    {
--- a/algo/keccak/keccak-hash-4way.c.bak
+++ b/algo/keccak/keccak-hash-4way.c.bak
@@ -1,581 +0,0 @@
-#include <stddef.h>
-#include "keccak-hash-4way.h"
-
-static const sph_u64 RC[] = {
-        SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
-        SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
-        SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
-        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
-        SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
-        SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
-        SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
-        SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
-        SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
-        SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
-        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
-        SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
-};
-
-
-// change u.wide and u.narrow to just w, ie kc->w
-
-#define a00   (kc->w[ 0])
-#define a10   (kc->w[ 1])
-#define a20   (kc->w[ 2])
-#define a30   (kc->w[ 3])
-#define a40   (kc->w[ 4])
-#define a01   (kc->w[ 5])
-#define a11   (kc->w[ 6])
-#define a21   (kc->w[ 7])
-#define a31   (kc->w[ 8])
-#define a41   (kc->w[ 9])
-#define a02   (kc->w[10])
-#define a12   (kc->w[11])
-#define a22   (kc->w[12])
-#define a32   (kc->w[13])
-#define a42   (kc->w[14])
-#define a03   (kc->w[15])
-#define a13   (kc->w[16])
-#define a23   (kc->w[17])
-#define a33   (kc->w[18])
-#define a43   (kc->w[19])
-#define a04   (kc->w[20])
-#define a14   (kc->w[21])
-#define a24   (kc->w[22])
-#define a34   (kc->w[23])
-#define a44   (kc->w[24])
-
-// null when no copy
-#define DECL_STATE
-#define READ_STATE(sc)
-#define WRITE_STATE(sc)
-
-#define INPUT_BUF(size)   do { \
-    size_t j; \
-    for (j = 0; j < (size>>3); j++ ) \
-        kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
-} while (0)
-
-// keccak256 4way uses 136 with 32 bit size
-// keccak256 8way and keccak512 4 way use 72 with 64 bit size
-//#define INPUT_BUF144   INPUT_BUF(144)
-//#define INPUT_BUF136   INPUT_BUF(136) 
-//#define INPUT_BUF104   INPUT_BUF(104)
-//#define INPUT_BUF72    INPUT_BUF(72)   
-
-//simply redefine these macros to do simd
-#define mm256_neg1 \
-        (_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
-                            0xffffffffffffffff, 0xffffffffffffffff ) )
-
-#define DECL64(x)        __m256i x
-#define MOV64(d, s)      (d = s)
-#define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
-#define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
-#define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
-#define NOT64(d, s)      (d = _mm256_xor_si256(s,mm256_neg1))
-#define ROL64(d, v, n)   (d = mm256_rotl_64(v, n))
-#define XOR64_IOTA       XOR64
-
-/*
-#define DECL64(x)        sph_u64 x
-#define MOV64(d, s)      (d = s)
-#define XOR64(d, a, b)   (d = a ^ b)
-#define AND64(d, a, b)   (d = a & b)
-#define OR64(d, a, b)    (d = a | b)
-#define NOT64(d, s)      (d = SPH_T64(~s))
-#define ROL64(d, v, n)   (d = SPH_ROTL64(v, n))
-#define XOR64_IOTA       XOR64
-*/
-
-#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
-                DECL64(tt0); \
-                DECL64(tt1); \
-                DECL64(tt2); \
-                DECL64(tt3); \
-uint64_t *ttx = (uint64_t*)&tt0; \
-uint64_t *d0x = (uint64_t*)&d0; \
-uint64_t *d1x = (uint64_t*)&d1; \
-uint64_t *d2x = (uint64_t*)&d2; \
-uint64_t *d3x = (uint64_t*)&d3; \
-uint64_t *d4x = (uint64_t*)&d4; \
-                XOR64(tt0, d0, d1); \
-if (vtp) {printf("Velt0\n"); \
-printf("d0=  %016llx\n",*d0x ); \
-printf("d1=  %016llx\n",*d1x ); \
-printf("d2=  %016llx\n",*d2x ); \
-printf("d3=  %016llx\n",*d3x ); \
-printf("d4=  %016llx\n",*d4x ); \
-printf("tt0= %016llx\n",*ttx );} \
-                XOR64(tt1, d2, d3); \
-                XOR64(tt0, tt0, d4); \
-                XOR64(tt0, tt0, tt1); \
-if(vtp){\
-printf("tt0= %016llx\n",*ttx );} \
-                ROL64(tt0, tt0, 1); \
-if(vtp){\
-printf("tt0= %016llx\n",*ttx );} \
-                XOR64(tt2, c0, c1); \
-                XOR64(tt3, c2, c3); \
-                XOR64(tt0, tt0, c4); \
-                XOR64(tt2, tt2, tt3); \
-                XOR64(t, tt0, tt2); \
-        } while (0)
-
-int vtp = 0;
-
-#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                DECL64(t0); \
-                DECL64(t1); \
-                DECL64(t2); \
-                DECL64(t3); \
-                DECL64(t4); \
-                TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
-if(vtp){printf("Velt0\n");\
-uint64_t *tx = (uint64_t*)&t0; \
-printf("t0= %016llx\n",tx );} \
-vtp=0; \
-        TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
-                TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
-                TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
-                TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
-                XOR64(b00, b00, t0); \
-                XOR64(b01, b01, t0); \
-                XOR64(b02, b02, t0); \
-                XOR64(b03, b03, t0); \
-                XOR64(b04, b04, t0); \
-                XOR64(b10, b10, t1); \
-                XOR64(b11, b11, t1); \
-                XOR64(b12, b12, t1); \
-                XOR64(b13, b13, t1); \
-                XOR64(b14, b14, t1); \
-                XOR64(b20, b20, t2); \
-                XOR64(b21, b21, t2); \
-                XOR64(b22, b22, t2); \
-                XOR64(b23, b23, t2); \
-                XOR64(b24, b24, t2); \
-                XOR64(b30, b30, t3); \
-                XOR64(b31, b31, t3); \
-                XOR64(b32, b32, t3); \
-                XOR64(b33, b33, t3); \
-                XOR64(b34, b34, t3); \
-                XOR64(b40, b40, t4); \
-                XOR64(b41, b41, t4); \
-                XOR64(b42, b42, t4); \
-                XOR64(b43, b43, t4); \
-                XOR64(b44, b44, t4); \
-        } while (0)
-
-#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                /* ROL64(b00, b00,  0); */ \
-                ROL64(b01, b01, 36); \
-                ROL64(b02, b02,  3); \
-                ROL64(b03, b03, 41); \
-                ROL64(b04, b04, 18); \
-                ROL64(b10, b10,  1); \
-                ROL64(b11, b11, 44); \
-                ROL64(b12, b12, 10); \
-                ROL64(b13, b13, 45); \
-                ROL64(b14, b14,  2); \
-                ROL64(b20, b20, 62); \
-                ROL64(b21, b21,  6); \
-                ROL64(b22, b22, 43); \
-                ROL64(b23, b23, 15); \
-                ROL64(b24, b24, 61); \
-                ROL64(b30, b30, 28); \
-                ROL64(b31, b31, 55); \
-                ROL64(b32, b32, 25); \
-                ROL64(b33, b33, 21); \
-                ROL64(b34, b34, 56); \
-                ROL64(b40, b40, 27); \
-                ROL64(b41, b41, 20); \
-                ROL64(b42, b42, 39); \
-                ROL64(b43, b43,  8); \
-                ROL64(b44, b44, 14); \
-        } while (0)
-
-/*
- * The KHI macro integrates the "lane complement" optimization. On input,
- * some words are complemented:
- *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
- * On output, the following words are complemented:
- *    a04 a10 a20 a22 a23 a31
- *
- * The (implicit) permutation and the theta expansion will bring back
- * the input mask for the next round.
- */
-
-#define KHI_XO(d, a, b, c)   do { \
-                DECL64(kt); \
-                OR64(kt, b, c); \
-                XOR64(d, a, kt); \
-        } while (0)
-
-#define KHI_XA(d, a, b, c)   do { \
-                DECL64(kt); \
-                AND64(kt, b, c); \
-                XOR64(d, a, kt); \
-        } while (0)
-
-#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
-        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
-        b40, b41, b42, b43, b44) \
-        do { \
-                DECL64(c0); \
-                DECL64(c1); \
-                DECL64(c2); \
-                DECL64(c3); \
-                DECL64(c4); \
-                DECL64(bnn); \
-                NOT64(bnn, b20); \
-                KHI_XO(c0, b00, b10, b20); \
-                KHI_XO(c1, b10, bnn, b30); \
-                KHI_XA(c2, b20, b30, b40); \
-                KHI_XO(c3, b30, b40, b00); \
-                KHI_XA(c4, b40, b00, b10); \
-                MOV64(b00, c0); \
-                MOV64(b10, c1); \
-                MOV64(b20, c2); \
-                MOV64(b30, c3); \
-                MOV64(b40, c4); \
-                NOT64(bnn, b41); \
-                KHI_XO(c0, b01, b11, b21); \
-                KHI_XA(c1, b11, b21, b31); \
-                KHI_XO(c2, b21, b31, bnn); \
-                KHI_XO(c3, b31, b41, b01); \
-                KHI_XA(c4, b41, b01, b11); \
-                MOV64(b01, c0); \
-                MOV64(b11, c1); \
-                MOV64(b21, c2); \
-                MOV64(b31, c3); \
-                MOV64(b41, c4); \
-                NOT64(bnn, b32); \
-                KHI_XO(c0, b02, b12, b22); \
-                KHI_XA(c1, b12, b22, b32); \
-                KHI_XA(c2, b22, bnn, b42); \
-                KHI_XO(c3, bnn, b42, b02); \
-                KHI_XA(c4, b42, b02, b12); \
-                MOV64(b02, c0); \
-                MOV64(b12, c1); \
-                MOV64(b22, c2); \
-                MOV64(b32, c3); \
-                MOV64(b42, c4); \
-                NOT64(bnn, b33); \
-                KHI_XA(c0, b03, b13, b23); \
-                KHI_XO(c1, b13, b23, b33); \
-                KHI_XO(c2, b23, bnn, b43); \
-                KHI_XA(c3, bnn, b43, b03); \
-                KHI_XO(c4, b43, b03, b13); \
-                MOV64(b03, c0); \
-                MOV64(b13, c1); \
-                MOV64(b23, c2); \
-                MOV64(b33, c3); \
-                MOV64(b43, c4); \
-                NOT64(bnn, b14); \
-                KHI_XA(c0, b04, bnn, b24); \
-                KHI_XO(c1, bnn, b24, b34); \
-                KHI_XA(c2, b24, b34, b44); \
-                KHI_XO(c3, b34, b44, b04); \
-                KHI_XA(c4, b44, b04, b14); \
-                MOV64(b04, c0); \
-                MOV64(b14, c1); \
-                MOV64(b24, c2); \
-                MOV64(b34, c3); \
-                MOV64(b44, c4); \
-        } while (0)
-
-#define IOTA(r)   XOR64_IOTA(a00, a00, r)
-
-#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
-              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
-#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
-              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
-#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
-              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
-#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
-              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
-#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
-              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
-#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
-              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
-#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
-              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
-#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
-              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
-#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
-              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
-#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
-              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
-#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
-              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
-#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
-              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
-#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
-              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
-#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
-              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
-#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
-              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
-#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
-              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
-#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
-              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
-#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
-              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
-#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
-              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
-#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
-              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
-#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
-              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
-#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
-              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
-#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
-              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
-#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
-              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
-
-#define P8_TO_P0   do { \
-                DECL64(t); \
-                MOV64(t, a01); \
-                MOV64(a01, a11); \
-                MOV64(a11, a43); \
-                MOV64(a43, t); \
-                MOV64(t, a02); \
-                MOV64(a02, a22); \
-                MOV64(a22, a31); \
-                MOV64(a31, t); \
-                MOV64(t, a03); \
-                MOV64(a03, a33); \
-                MOV64(a33, a24); \
-                MOV64(a24, t); \
-                MOV64(t, a04); \
-                MOV64(a04, a44); \
-                MOV64(a44, a12); \
-                MOV64(a12, t); \
-                MOV64(t, a10); \
-                MOV64(a10, a32); \
-                MOV64(a32, a13); \
-                MOV64(a13, t); \
-                MOV64(t, a14); \
-                MOV64(a14, a21); \
-                MOV64(a21, a20); \
-                MOV64(a20, t); \
-                MOV64(t, a23); \
-                MOV64(a23, a42); \
-                MOV64(a42, a40); \
-                MOV64(a40, t); \
-                MOV64(t, a30); \
-                MOV64(a30, a41); \
-                MOV64(a41, a34); \
-                MOV64(a34, t); \
-        } while (0)
-
-#define LPAR   (
-#define RPAR   )
-
-#define KF_ELT(r, s, k)   do { \
-if(r==0){ vtp=1; printf("Vtheo0\n");}\
-                THETA LPAR P ## r RPAR; \
-if(vtp=1){ \
-uint64_t *W = (uint64_t*)(kc->w); \
-printf("w:   %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
-printf("     %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
-printf("     %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
-printf("     %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
-vtp=0; \
-                RHO LPAR P ## r RPAR; \
-if(r==0){ printf("Vrho0\n");\
-uint64_t *W = (uint64_t*)(kc->w); \
-printf("w:   %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
-printf("     %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
-printf("     %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
-printf("     %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
-                KHI LPAR P ## s RPAR; \
-if(r==0){ printf("Vkhi0\n");\
-uint64_t *W = (uint64_t*)(kc->w); \
-printf("w:   %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
-printf("     %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
-printf("     %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
-printf("     %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
-                IOTA(k); \
-        } while (0)
-
-#define DO(x)   x
-
-#define KECCAK_F_1600   DO(KECCAK_F_1600_)
-
-#define KECCAK_F_1600_   do { \
-    int j; \
-    for (j = 0; j < 24; j += 8) \
-    { \
-       KF_ELT( 0,  1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
-                                       RC[j + 0], RC[j + 0])) ); \
-       KF_ELT( 1,  2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
-                                       RC[j + 1], RC[j + 1])) ); \
-       KF_ELT( 2,  3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
-                                       RC[j + 2], RC[j + 2])) ); \
-       KF_ELT( 3,  4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
-                                       RC[j + 3], RC[j + 3])) ); \
-       KF_ELT( 4,  5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
-                                       RC[j + 4], RC[j + 4])) ); \
-       KF_ELT( 5,  6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
-                                       RC[j + 5], RC[j + 5])) ); \
-       KF_ELT( 6,  7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
-                                       RC[j + 6], RC[j + 6])) ); \
-       KF_ELT( 7,  8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
-                                       RC[j + 7], RC[j + 7])) ); \
-       P8_TO_P0; \
-    } \
-} while (0)
-
-
-static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
-{
-   int i;
-   for (i = 0; i < 25; i ++)
-          kc->w[i] = _mm256_setzero_si256();
-
-   // Initialization for the "lane complement".
-   kc->w[ 1] = mm256_neg1;
-   kc->w[ 2] = mm256_neg1;
-   kc->w[ 8] = mm256_neg1;
-   kc->w[12] = mm256_neg1;
-   kc->w[17] = mm256_neg1;
-   kc->w[20] = mm256_neg1;
-   kc->ptr = 0;
-   kc->lim = 200 - (lim >> 2);
-}
-
-static void
-keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
-               size_t lim )
-{
-    __m256i *buf;
-    __m256i *vdata = (__m256i*)data;
-    size_t ptr;
-
-    buf = kc->buf;
-    ptr = kc->ptr;
-uint64_t *W = (uint64_t*)(kc->w);
-
-    if ( len < (lim - ptr) )
-    {
-        memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
-        kc->ptr = ptr + len;
-        return;
-    }
-
-    while ( len > 0 )
-    {
-        size_t clen;
-
-        clen = (lim - ptr);
-        if ( clen > len )
-             clen = len;
-        memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
-        ptr += clen;
-        vdata = vdata + (clen>>3);
-        len -= clen;
-        if ( ptr == lim )
-        {
-             INPUT_BUF( lim );
-printf("Vtransform before ptr= %u, len= %u, lim= %u\n",ptr, len, lim);
-printf("w:   %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] );
-printf("     %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] );
-printf("     %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] );
-printf("     %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );
-            KECCAK_F_1600;
-//printf("Vtransform after  ptr= %u, len= %u, lim= %u\n",ptr, len, lim);
-//printf("w:   %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] );
-//printf("     %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] );
-            ptr = 0;
-        }
-    }
-    kc->ptr = ptr;
-}
-
-// keccak512 4way d=64  lim=72,  keccak256 8way d=32 lim=136
-// keccak256 4way d=32 lim=136 
-
-// keccak512 d=64, lim=72,  keccak256 d=32, lim=136 
-static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t d,
-            size_t lim )
-{
-    unsigned eb;
-    union {
-       __m256i tmp[lim + 1];
-       sph_u64 dummy;   /* for alignment */
-    } u;
-    size_t j;
-//    int d = 64;
-
-    eb = 0x100  >> 8;
-
-    if ( kc->ptr == (lim - 1) )
-    {
-        uint64_t t = eb | 0x80;
-        u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
-        j = 1;
-    }
-    else
-    {
-        j = lim - kc->ptr;
-        u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
-        memset_zero_m256i( u.tmp + 1, (j>>3) - 1 );
-        u.tmp[j - 1] = _mm256_set_epi64x( 0x8000000000000000,
-                0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
-    }
-    keccak64_core( kc, u.tmp, j, lim );
-    /* Finalize the "lane complement" */
-    NOT64( kc->w[ 1], kc->w[ 1] );
-    NOT64( kc->w[ 2], kc->w[ 2] );
-    NOT64( kc->w[ 8], kc->w[ 8] );
-    NOT64( kc->w[12], kc->w[12] );
-    NOT64( kc->w[17], kc->w[17] );
-    NOT64( kc->w[20], kc->w[20] );
-    for ( j = 0; j < d; j += 8 )
-         u.tmp[j] =  kc->w[j>>3]; 
-    memcpy_m256i( dst, u.tmp, d>>3 );
-}
-
-void keccak256_4way_init( void *kc )
-{
-   keccak64_init( kc, 256 );
-}
-
-void
-keccak256_4way(void *cc, const void *data, size_t len)
-{
-    keccak64_core(cc, data, len, 136);
-}
-
-void
-keccak256_4way_close(void *cc, void *dst)
-{
-    keccak64_close(cc, dst, 32, 136);
-}
-
-void keccak512_4way_init( void *kc )
-{
-   keccak64_init( kc, 512 );
-}
-
-void
-keccak512_4way(void *cc, const void *data, size_t len)
-{
-        keccak64_core(cc, data, len, 72);
-}
-
-void
-keccak512_4way_close(void *cc, void *dst)
-{
-        keccak64_close(cc, dst, 64, 72);
-}
-
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -40,6 +40,8 @@
 extern "C"{
 #endif

+#ifdef  __AVX2__
+
 #include <stddef.h>
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"
@@ -62,8 +64,6 @@ extern "C"{
 * <code>memcpy()</code>).
 */

-#ifdef  __AVX2__
-
 typedef struct {
        __m256i buf[144*8];    /* first field, for alignment */
        __m256i w[25];
--- a/algo/keccak/sph_keccak.c.bak
+++ b/algo/keccak/sph_keccak.c.bak
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -197,6 +197,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
    sph_tiger(      &ctx1.tiger,     data, M7_MIDSTATE_LEN );
    sph_ripemd160(  &ctx1.ripemd,    data, M7_MIDSTATE_LEN );

+// the following calculations can be performed once and the results shared
    mpz_t magipi, magisw, product, bns0, bns1;
    mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
    
@@ -221,6 +222,9 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

        memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );

+// with 4 way can a single midstate be shared among lanes?
+// do single round of midstate and interleave for final
+
 #ifndef USE_SPH_SHA
        SHA256_Update(  &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
        SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 );
@@ -249,6 +253,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );

+// 4 way serial
 	mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
        mpz_set(bns1, bns0);
 	mpz_set(product, bns0);
@@ -274,6 +279,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_sha256_close( &ctxf_sha256, (void*)(hash) );
 #endif

+// do once and share
        digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
        mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
 	mpf_set_prec_raw(magifpi, prec);
@@ -296,7 +302,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
 	    mpz_set_f(magipi, magifpi);
            mpz_add(magipi,magipi,magisw);
            mpz_add(product,product,magipi);
-			
+// share magipi, product and do serial			
 	    mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
            mpz_add(bns1, bns1, bns0);
            mpz_mul(product,product,bns1);
@@ -317,6 +323,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
 #endif
 	}

+// this is the scanhash part
 	const unsigned char *hash_ = (const unsigned char *)hash;
 	const unsigned char *target_ = (const unsigned char *)ptarget;
 	for ( i = 31; i >= 0; i-- )
@@ -346,6 +353,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

     pdata[19] = n;

+// do this in hashm7m
 out:
     mpf_set_prec_raw(magifpi, prec0);
     mpf_set_prec_raw(magifpi0, prec0);
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -0,0 +1,178 @@
+#include "nist5-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(NIST5_4WAY)
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+// nist5hash_4way: blake512 -> groestl -> jh512 -> keccak512 -> skein512 on a
+// 4-lane input; writes 4 x 32 bytes to output. Midstate gave no improvement:
+//static __thread blake512_4way_context ctx_mid;
+void nist5hash_4way( void *output, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake512_4way_context  ctx_blake;
+     hashState_groestl      ctx_groestl;
+     jh512_4way_context     ctx_jh;
+     skein512_4way_context  ctx_skein;
+     keccak512_4way_context ctx_keccak;
+     // Disabled midstate variant, kept for reference:
+//     memcpy( &ctx_blake, &ctx_mid, sizeof(ctx_mid) );
+//     blake512_4way( &ctx_blake, input + (64<<2), 16 );
+     // blake512 over the 4-way input, length argument 80.
+     blake512_4way_init( &ctx_blake );
+     blake512_4way( &ctx_blake, input, 80 );
+     blake512_4way_close( &ctx_blake, vhash );
+
+     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
+     // groestl runs serially, once per lane, hashing each buffer in place.
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash0,
+                               (const char*)hash0, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash1,
+                               (const char*)hash1, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash2,
+                               (const char*)hash2, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash3,
+                               (const char*)hash3, 512 );
+
+     m256_interleave_4x64x( vhash, hash0, hash1, hash2, hash3, 512 );
+     // jh512, keccak512 and skein512 each run 4-way over vhash, in place.
+     jh512_4way_init( &ctx_jh );
+     jh512_4way( &ctx_jh, vhash, 64 );
+     jh512_4way_close( &ctx_jh, vhash );
+
+     keccak512_4way_init( &ctx_keccak );
+     keccak512_4way( &ctx_keccak, vhash, 64 );
+     keccak512_4way_close( &ctx_keccak, vhash );
+
+     skein512_4way_init( &ctx_skein );
+     skein512_4way( &ctx_skein, vhash, 64 );
+     skein512_4way_close( &ctx_skein, vhash );
+
+     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
+     // Only the first 32 bytes per lane are returned; cast avoids void* math.
+     memcpy( output,             hash0, 32 );
+     memcpy( (char*)output+32,   hash1, 32 );
+     memcpy( (char*)output+64,   hash2, 32 );
+     memcpy( (char*)output+96,   hash3, 32 );
+}
+
+int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{    // Hashes nonces n..n+3 per pass from pdata[19]; returns shares found.
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t Htarg = ptarget[7];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+     // First htmax[m] >= Htarg picks masks[m]; last entry covers any Htarg.
+     uint64_t htmax[] = {          0,
+                                 0xF,
+                                0xFF,
+                               0xFFF,
+                              0xFFFF,
+                          0xFFFFFFFF };
+     // Bits of hash[7] that must be clear before fulltest() is attempted.
+     uint32_t masks[] = { 0xFFFFFFFF,
+                          0xFFFFFFF0,
+                          0xFFFFFF00,
+                          0xFFFFF000,
+                          0xFFFF0000,
+                                   0 };
+
+     // we need bigendian data...
+     swab32_array( endiandata, pdata, 20 );
+
+     uint64_t *edata = (uint64_t*)endiandata;
+     m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+     // precalc midstate (disabled)
+//     blake512_4way_init( &ctx_mid );
+//     blake512_4way( &ctx_mid, vdata, 64 );
+
+     for ( int m=0; m < 6; m++ )
+     {
+        if (Htarg <= htmax[m])
+        {
+           uint32_t mask = masks[m];
+
+           do {
+              found[0] = found[1] = found[2] = found[3] = false;
+              be32enc( noncep0, n   );
+              be32enc( noncep1, n+1 );
+              be32enc( noncep2, n+2 );
+              be32enc( noncep3, n+3 );
+
+              nist5hash_4way( hash, vdata );
+
+              pdata[19] = n;
+              // Each lane's 8-word result is checked against the target.
+              if ( ( !(hash[7] & mask) )
+                   && fulltest( hash, ptarget ) )
+              {
+                 found[0] = true;
+                 num_found++;
+                 nonces[0] = n;
+                 work_set_target_ratio( work, hash );
+              }
+              if ( ( !((hash+8)[7] & mask) )
+                   && fulltest( hash+8, ptarget ) )
+              {
+                 found[1] = true;
+                 num_found++;
+                 nonces[1] = n+1;
+                 work_set_target_ratio( work, hash+8 );
+              }
+              if ( ( !((hash+16)[7] & mask) )
+                   && fulltest( hash+16, ptarget ) )
+              {
+                 found[2] = true;
+                 num_found++;
+                 nonces[2] = n+2;
+                 work_set_target_ratio( work, hash+16 );
+              }
+              if ( ( !((hash+24)[7] & mask) )
+                   && fulltest( hash+24, ptarget ) )
+              {
+                 found[3] = true;
+                 num_found++;
+                 nonces[3] = n+3;
+                 work_set_target_ratio( work, hash+24 );
+              }
+              n += 4;
+           } while ( ( num_found == 0 ) && ( n < max_nonce )
+                     && !work_restart[thr_id].restart );
+           break;
+        }
+     }
+
+     *hashes_done = n - first_nonce + 1;
+     return num_found;
+}
+
+#endif
--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -0,0 +1,17 @@
+#include "nist5-gate.h"
+
+bool register_nist5_algo( algo_gate_t* gate )   // fills gate; returns true
+{
+#if defined (NIST5_4WAY)
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+    gate->scanhash = (void*)&scanhash_nist5_4way;
+    gate->hash     = (void*)&nist5hash_4way;
+#else
+    gate->optimizations = SSE2_OPT | AES_OPT;
+    init_nist5_ctx();   // NOTE(review): no prototype in nist5-gate.h - confirm
+    gate->scanhash = (void*)&scanhash_nist5;
+    gate->hash     = (void*)&nist5hash;
+#endif
+    return true;
+}
+
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -0,0 +1,26 @@
+#ifndef __NIST5_GATE_H__
+#define __NIST5_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+  #define NIST5_4WAY
+#endif
+
+#if defined(NIST5_4WAY)
+
+void nist5hash_4way( void *state, const void *input );
+
+int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#else
+
+void nist5hash( void *state, const void *input );
+
+int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+#endif
+
+#endif
--- a/algo/nist5/nist5.c
+++ b/algo/nist5/nist5.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "nist5-gate.h"

 #include <stdlib.h>
 #include <stdint.h>
@@ -147,7 +147,7 @@ int scanhash_nist5(int thr_id, struct work *work,
 	pdata[19] = n;
 	return 0;
 }
-
+/*
 bool register_nist5_algo( algo_gate_t* gate )
 {
    gate->optimizations = SSE2_OPT | AES_OPT;
@@ -156,4 +156,4 @@ bool register_nist5_algo( algo_gate_t* gate )
    gate->hash     = (void*)&nist5hash;
    return true;
 };
-
+*/
--- a/algo/polytimos/polytimos.c.broke
+++ b/algo/polytimos/polytimos.c.broke
@@ -1,125 +0,0 @@
-#include "polytimos-gate.h"
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "algo/skein/sph_skein.h"
-#include "algo/echo/sph_echo.h"
-#include "algo/fugue/sph_fugue.h"
-#include "algo/luffa/sph_luffa.h"
-#include "algo/shabal/sph_shabal.h"
-#include "algo/gost/sph_gost.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
-#ifndef NO_AES_NI
-  #include "algo/echo/aes_ni/hash_api.h"
-#endif
-
-
-/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
-typedef struct {
-	sph_skein512_context    skein;
-        sph_luffa512_context    luffa;
-//        hashState_luffa         luffa;
-//#ifdef NO_AES_NI
-        sph_echo512_context      echo;
-//#else
-//        hashState_echo          echo;
-//#endif
-	sph_shabal512_context   shabal;
-	sph_fugue512_context    fugue;
-	sph_gost512_context     gost;
-} poly_context_holder;
-
-static __thread poly_context_holder poly_ctx __attribute__ ((aligned (64)));
-
-void init_polytimos_context()
-{
-    sph_skein512_init(&poly_ctx.skein);
-    sph_shabal512_init(&poly_ctx.shabal);
-//#ifdef NO_AES_NI
-    sph_echo512_init(&poly_ctx.echo);
-//#else
-//    init_echo( &poly_ctx.echo, 512 );
-//#endif
-//    init_luffa( &poly_ctx.luffa, 512 );
-    sph_luffa512_init(&poly_ctx.luffa);
-    sph_fugue512_init(&poly_ctx.fugue);
-    sph_gost512_init(&poly_ctx.gost);
-}
-
-void polytimos_hash(void *output, const void *input)
-{
-	poly_context_holder ctx __attribute__ ((aligned (64)));
-	uint32_t hashA[16]__attribute__ ((aligned (64)));
-
-        memcpy( &ctx, &poly_ctx, sizeof(poly_ctx) );
-
-	sph_skein512(&ctx.skein, input, 80);
-	sph_skein512_close(&ctx.skein, hashA);
-
-	sph_shabal512(&ctx.shabal, hashA, 64);
-	sph_shabal512_close(&ctx.shabal, hashA);
-//#ifdef NO_AES_NI
-	sph_echo512(&ctx.echo, hashA, 64);
-	sph_echo512_close(&ctx.echo, hashA);
-//#else
-//        update_final_echo ( &ctx.echo, (BitSequence *)hashA,
-//                            (const BitSequence *)hashA, 512 );
-//#endif
-
-//        update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
-//                                (const BitSequence*)hashA, 64 );
-
-        sph_luffa512(&ctx.luffa, hashA, 64);
-        sph_luffa512_close(&ctx.luffa, hashA);
-
-	sph_fugue512(&ctx.fugue, hashA, 64);
-	sph_fugue512_close(&ctx.fugue, hashA);
-
-	sph_gost512(&ctx.gost, hashA, 64);
-	sph_gost512_close(&ctx.gost, hashA);
-
-	memcpy(output, hashA, 32);
-}
-
-int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
-                        uint64_t *hashes_done )
-{
-	uint32_t _ALIGN(128) hash[8];
-	uint32_t _ALIGN(128) endiandata[20];
-	uint32_t *pdata = work->data;
-	uint32_t *ptarget = work->target;
-
-	const uint32_t Htarg = ptarget[7];
-	const uint32_t first_nonce = pdata[19];
-	uint32_t nonce = first_nonce;
-	volatile uint8_t *restart = &(work_restart[thr_id].restart);
-
-	if (opt_benchmark)
-		ptarget[7] = 0x0cff;
-
-	// we need bigendian data...
-	for (int i=0; i < 19; i++) {
-		be32enc(&endiandata[i], pdata[i]);
-	}
-	do {
-		be32enc(&endiandata[19], nonce);
-		polytimos_hash(hash, endiandata);
-
-		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
-			work_set_target_ratio(work, hash);
-			pdata[19] = nonce;
-			*hashes_done = pdata[19] - first_nonce;
-			return 1;
-		}
-		nonce++;
-
-	} while (nonce < max_nonce && !(*restart));
-
-	pdata[19] = nonce;
-	*hashes_done = pdata[19] - first_nonce + 1;
-	return 0;
-}
-
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -1,4 +1,3 @@
-#include "algo-gate-api.h"
 #include "skein-gate.h"
 #include <string.h>
 #include <stdint.h>
@@ -60,7 +59,7 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
    // hash is returned deinterleaved
    uint32_t *nonces = work->nonces;
    bool *found = work->nfound;
-    int num_found;
+    int num_found = 0;

 // data is 80 bytes, 20 u32 or 4 u64.
 	
@@ -76,7 +75,6 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
   do
   {
       found[0] = found[1] = found[2] = found[3] = false;
-       num_found = 0;      
       be32enc( noncep0, n   );
       be32enc( noncep1, n+1 );
       be32enc( noncep2, n+2 );
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -1,7 +1,4 @@
 #include "skein-gate.h"
-#include "algo-gate-api.h"
-//#include <string.h>
-//#include <stdint.h>
 #include "sph_skein.h"
 #include "skein-hash-4way.h"

@@ -9,7 +6,7 @@ int64_t skein_get_max64() { return 0x7ffffLL; }

 bool register_skein_algo( algo_gate_t* gate )
 {
-#if defined (FOUR_WAY) &&  defined (__AVX2__)
+#if defined (SKEIN_4WAY)
    gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash  = (void*)&scanhash_skein_4way;
    gate->hash      = (void*)&skeinhash_4way;
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -1,15 +1,22 @@
 #ifndef __SKEIN_GATE_H__
 #define __SKEIN_GATE_H__
 #include <stdint.h>
+#include "algo-gate-api.h"

-#if defined(__AVX2__)
+#if defined(FOUR_WAY) && defined(__AVX2__)
+  #define SKEIN_4WAY
+#endif
+
+#if defined(SKEIN_4WAY)

 void skeinhash_4way( void *output, const void *input );
+
 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done );
 #endif

 void skeinhash( void *output, const void *input );
+
 int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );

--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -296,7 +296,6 @@ do { \
  w5 = _mm256_add_epi64( w5, _mm256_add_epi64( SKBI(k,s,5), \
                           _mm256_set_epi64x( SKBT(t,s,0), SKBT(t,s,0), \
                                              SKBT(t,s,0), SKBT(t,s,0) ) ) ); \
-__m256i skbi6 = SKBI(k,s,6); \
  w6 = _mm256_add_epi64( w6, _mm256_add_epi64( SKBI(k,s,6), \
                           _mm256_set_epi64x( SKBT(t,s,1), SKBT(t,s,1), \
                                              SKBT(t,s,1), SKBT(t,s,1) ) ) ); \
@@ -458,20 +457,10 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,
   unsigned first;
   DECL_STATE_BIG_4WAY

-// len is the array size, of data, ie 64 bytes
-// data points to start of 4 element buf
-// ptr is a len offset in bytes
-// buff is an array of 4 elements
-// buff_size is size of one array element
-// One element is 8 bytes (64 bits) scalar but 32 bytes (256 bits) 4way
-// To index buf using ptr it has to be scaled 8 to 1. the amounrt of
-// data to copy is 32 bytes per element instead of 8, or one m256
-
   buf = sc->buf;
   ptr = sc->ptr;
   const int buf_size = 64;   // 64 * _m256i

-// 64 byte len, no part block
   if ( len <= buf_size - ptr )
   {
       memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
@@ -481,8 +470,6 @@ skein_big_core_4way( skein512_4way_context *sc, const void *data,

   READ_STATE_BIG( sc );
   first = ( bcount == 0 ) << 7;
-// 64 byte len, only one block, no transform here.
-// 80 byte len, transform first 64 bytes.
   do {
       size_t clen;

@@ -512,19 +499,7 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
 	__m256i *buf;
 	size_t ptr;
 	unsigned et;
-	int i;
 	DECL_STATE_BIG_4WAY
-	/*
-	 * Add bit padding if necessary.
-	 */
-//	if (n != 0) {
-//		unsigned z;
-//		unsigned char x;
-//
-//		z = 0x80 >> n;
-//		x = ((ub & -z) | z) & 0xFF;
-//		skein_big_core(sc, &x, 1);
-//	}

 	buf = sc->buf;
 	ptr = sc->ptr;
@@ -543,8 +518,6 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
 	 * the encoding of "0", over 8 bytes, then padded with zeros).
 	 */

-// 64 byte len, process only block
-// 80 byte len, process last part block (16 bytes) padded.
 	READ_STATE_BIG(sc);

        memset_zero_m256i( buf + (ptr>>3), (buf_size - ptr) >> 3 );
@@ -555,28 +528,6 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
        bcount = 0;
        UBI_BIG_4WAY( 510, 8 );

-//	for ( i = 0; i < 2; i ++ )
-//        {
-//		UBI_BIG_AVX2( et, ptr );
-//		if (i == 0)
-//                {
-//                        memset_zero_m256i( buf, buf_size >> 3 );
-//			bcount = 0;
-//			et = 510;
-//			ptr = 8;
-//		}
-//	}
-
-// Can LE be assumed? Should be ok SPH_LITTLE_ENDIAN is defined
-/*        _mm256_enc64le( buf, h0 );
-        _mm256_enc64le( buf + 32, h1 );
-        _mm256_enc64le( buf + 64, h2 );
-        _mm256_enc64le( buf + 96, h3 );
-        _mm256_enc64le( buf + 128, h4 );
-        _mm256_enc64le( buf + 160, h5 );
-        _mm256_enc64le( buf + 192, h6 );
-        _mm256_enc64le( buf + 224, h7 );
-*/
        buf[0] = h0;
        buf[1] = h1;
        buf[2] = h2;
@@ -587,7 +538,6 @@ skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
        buf[7] = h7;

        memcpy_m256i( dst, buf, out_len >> 3 );
-//	memcpy( dst, buf, out_len * 4 );
 }

 static const sph_u64 IV256[] = {
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -1,10 +1,9 @@
-#include "skein-gate.h"
-#include "algo-gate-api.h"
+#include "skein2-gate.h"
 #include <string.h>
 #include <stdint.h>
 #include "skein-hash-4way.h"

-#if defined(__AVX2__)
+#if defined(SKEIN2_4WAY)

 void skein2hash_4way( void *output, const void *input )
 {
@@ -38,7 +37,7 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
    // hash is returned deinterleaved
    uint32_t *nonces = work->nonces;
    bool *found = work->nfound;
-    int num_found;
+    int num_found = 0;

    swab32_array( endiandata, pdata, 20 );

@@ -52,7 +51,6 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
    do 
    {
       found[0] = found[1] = found[2] = found[3] = false;
-       num_found = 0;
       be32enc( noncep0, n   );
       be32enc( noncep1, n+1 );
       be32enc( noncep2, n+2 );
--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -1,9 +1,6 @@
 #include "skein2-gate.h"
-#include "algo-gate-api.h"
-//#include <string.h>
 #include <stdint.h>
 #include "sph_skein.h"
-//#include "skein-hash-avx2.h"

 int64_t skein2_get_max64 ()
 {
--- a/algo/skein/skein2-gate.h
+++ b/algo/skein/skein2-gate.h
@@ -1,8 +1,13 @@
 #ifndef __SKEIN2GATE_H__
 #define __SKEIN2_GATE_H__
+#include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(__AVX2__)
+#if defined(FOUR_WAY) && defined(__AVX2__)
+  #define SKEIN2_4WAY
+#endif
+
+#if defined(SKEIN2_4WAY)
 void skein2hash_4way( void *output, const void *input );
 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t* hashes_done );
--- a/algo/skein/sph_skein.c
+++ b/algo/skein/sph_skein.c
@@ -39,25 +39,6 @@
 extern "C"{
 #endif

-void dump_sph_context( sph_u64 ptr, sph_u64 bcount, uint64_t* buf, 
-  sph_u64 h0, sph_u64 h1, sph_u64 h2, sph_u64 h3, sph_u64 h4, sph_u64 h5,
-  sph_u64 h6, sph_u64 h7 )
-{
-//scalar
-printf("sptr= %llu, bcount= %llu\n", ptr, bcount );
-
-printf("sbuf: %016llx %016llx %016llx %016llx\n", *((uint64_t*)buf),
-       *((uint64_t*)buf+1), *((uint64_t*)buf+2), *((uint64_t*)buf+3) );
-
-printf("      %016llx %016llx %016llx %016llx\n", *((uint64_t*)buf+4),
-       *((uint64_t*)buf+5), *((uint64_t*)buf+6), *((uint64_t*)buf+7) );
-
-printf("sh:%016llx %016llx %016llx %016llx\n", h0, h1, h2, h3 );
-
-printf("   %016llx %016llx %016llx %016llx\n", h4, h5, h6, h7 );
-
-}
-
 #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN
 #define SPH_SMALL_FOOTPRINT_SKEIN   1
 #endif
--- a/algo/tribus/tribus-4way.c
+++ b/algo/tribus/tribus-4way.c
@@ -0,0 +1,162 @@
+#include "tribus-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(TRIBUS_4WAY)   // from tribus-gate.h: FOUR_WAY, AVX2 and AES-NI
+
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/echo/aes_ni/hash_api.h"
+
+static __thread jh512_4way_context ctx_mid;
+
+void tribus_hash_4way(void *state, const void *input)
+{    // 4-way tribus: jh512 -> keccak512 -> echo; writes 4 x 32 bytes to state.
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     jh512_4way_context     ctx_jh;
+     keccak512_4way_context ctx_keccak;
+     hashState_echo         ctx_echo;
+     // Resume jh512 from ctx_mid; (64<<2) skips 64 bytes for each of 4 lanes.
+     memcpy( &ctx_jh, &ctx_mid, sizeof(ctx_mid) );
+     jh512_4way( &ctx_jh, (const char*)input + (64<<2), 16 );
+     jh512_4way_close( &ctx_jh, vhash );
+
+     keccak512_4way_init( &ctx_keccak );
+     keccak512_4way( &ctx_keccak, vhash, 64 );
+     keccak512_4way_close( &ctx_keccak, vhash );
+
+     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // hash echo serially, one lane at a time, each buffer hashed in place
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash0,
+                        (const BitSequence *) hash0, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash1,
+                        (const BitSequence *) hash1, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash2,
+                        (const BitSequence *) hash2, 512 );
+     init_echo( &ctx_echo, 512 );
+     update_final_echo( &ctx_echo, (BitSequence *) hash3,
+                        (const BitSequence *) hash3, 512 );
+     // Only the first 32 bytes per lane are returned; cast avoids void* math.
+     memcpy( state,             hash0, 32 );
+     memcpy( (char*)state+32,   hash1, 32 );
+     memcpy( (char*)state+64,   hash2, 32 );
+     memcpy( (char*)state+96,   hash3, 32 );
+}
+
+int scanhash_tribus_4way(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+{  // Hashes nonces n..n+3 per pass from pdata[19]; returns shares found.
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t n = pdata[19];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+   uint32_t *noncep1 = vdata + 75;
+   uint32_t *noncep2 = vdata + 77;
+   uint32_t *noncep3 = vdata + 79;
+   // First htmax[m] >= Htarg picks masks[m]; last entry covers any Htarg.
+   uint64_t htmax[] = {          0,
+                               0xF,
+                              0xFF,
+                             0xFFF,
+                            0xFFFF,
+                        0xFFFFFFFF };
+   // Bits of hash[7] that must be clear before fulltest() is attempted.
+   uint32_t masks[] = { 0xFFFFFFFF,
+                        0xFFFFFFF0,
+                        0xFFFFFF00,
+                        0xFFFFF000,
+                        0xFFFF0000,
+                                 0 };
+
+   // we need bigendian data...
+   for ( int i = 0; i < 20; i++ )
+   {
+      be32enc( &endiandata[i], pdata[i] );
+   }
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   // precalc midstate
+   // doing it one way then interleaving would be faster but too
+   // complicated to interleave context.
+   jh512_4way_init( &ctx_mid );
+   jh512_4way( &ctx_mid, vdata, 64 );
+
+   for ( int m = 0; m < 6; m++ )
+   {
+      if ( Htarg <= htmax[m] )
+      {
+         uint32_t mask = masks[m];
+         do {
+            found[0] = found[1] = found[2] = found[3] = false;
+            be32enc( noncep0, n   );
+            be32enc( noncep1, n+1 );
+            be32enc( noncep2, n+2 );
+            be32enc( noncep3, n+3 );
+
+            tribus_hash_4way( hash, vdata );
+
+            pdata[19] = n;
+            // Each lane's 8-word result is checked against the target.
+            if ( ( !(hash[7] & mask) )
+                 && fulltest( hash, ptarget ) )
+            {
+               found[0] = true;
+               num_found++;
+               nonces[0] = n;
+               work_set_target_ratio(work, hash);
+            }
+            if ( ( !((hash+8)[7] & mask) )
+                 && fulltest (hash+8, ptarget ) )
+            {
+               found[1] = true;
+               num_found++;
+               nonces[1] = n+1;
+               work_set_target_ratio(work, hash+8);
+            }
+            if ( ( !((hash+16)[7] & mask) )
+                 && fulltest( hash+16, ptarget ) )
+            {
+               found[2] = true;
+               num_found++;
+               nonces[2] = n+2;
+               work_set_target_ratio(work, hash+16);
+            }
+            if ( ( !((hash+24)[7] & mask) )
+                 && fulltest( hash+24, ptarget ) )
+            {
+               found[3] = true;
+               num_found++;
+               nonces[3] = n+3;
+               work_set_target_ratio(work, hash+24);
+            }
+            n += 4;
+         } while ( (num_found == 0) && ( n < max_nonce )
+                    && !work_restart[thr_id].restart);
+         break;
+      }
+   }
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/tribus/tribus-gate.c
+++ b/algo/tribus/tribus-gate.c
@@ -0,0 +1,31 @@
+#include "tribus-gate.h"
+/*
+bool tribus_thread_init()
+{
+   sph_jh512_init( &tribus_ctx.jh );
+   sph_keccak512_init( &tribus_ctx.keccak );
+#ifdef NO_AES_NI
+   sph_echo512_init( &tribus_ctx.echo );
+#else
+   init_echo( &tribus_ctx.echo, 512 );
+#endif
+  return true;
+}
+*/
+bool register_tribus_algo( algo_gate_t* gate )   // fills gate; returns true
+{
+  // Only the scalar path installs miner_thread_init; the 4-way path does not.
+  gate->get_max64     = (void*)&get_max64_0x1ffff;
+#if defined (TRIBUS_4WAY)
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->scanhash      = (void*)&scanhash_tribus_4way;
+  gate->hash          = (void*)&tribus_hash_4way;
+#else
+  gate->miner_thread_init = (void*)&tribus_thread_init;
+  gate->optimizations = SSE2_OPT | AES_OPT;
+  gate->scanhash      = (void*)&scanhash_tribus;
+  gate->hash          = (void*)&tribus_hash;
+#endif
+  return true;
+}
+
--- a/algo/tribus/tribus-gate.h
+++ b/algo/tribus/tribus-gate.h
@@ -0,0 +1,29 @@
+#ifndef TRIBUS_GATE_H__
+#define TRIBUS_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+  #define TRIBUS_4WAY
+#endif
+
+#if defined(TRIBUS_4WAY)
+
+void tribus_hash_4way( void *state, const void *input );
+
+int scanhash_tribus_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+
+#else
+
+void tribus_hash( void *state, const void *input );
+
+int scanhash_tribus( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+bool tribus_thread_init();
+
+#endif
+
+#endif
--- a/algo/tribus/tribus.c
+++ b/algo/tribus/tribus.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "tribus-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -25,6 +25,18 @@ typedef struct {

 static __thread tribus_ctx_holder tribus_ctx;

+bool tribus_thread_init()   // sets up the thread-local tribus_ctx; returns true
+{
+   sph_jh512_init( &tribus_ctx.jh );
+   sph_keccak512_init( &tribus_ctx.keccak );
+#ifdef NO_AES_NI   // echo context: sph version here, init_echo otherwise
+   sph_echo512_init( &tribus_ctx.echo );
+#else
+   init_echo( &tribus_ctx.echo, 512 );
+#endif
+  return true;
+}
+
 void tribus_hash(void *state, const void *input)
 {
     unsigned char hash[128] __attribute__ ((aligned (32)));
@@ -122,25 +134,4 @@ int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
 	return 0;
 }

-bool tribus_thread_init()
-{
-   sph_jh512_init( &tribus_ctx.jh );
-   sph_keccak512_init( &tribus_ctx.keccak );
-#ifdef NO_AES_NI
-   sph_echo512_init( &tribus_ctx.echo );
-#else
-   init_echo( &tribus_ctx.echo, 512 );
-#endif
-  return true;
-}
-
-bool register_tribus_algo( algo_gate_t* gate )
-{
-  gate->miner_thread_init = (void*)&tribus_thread_init;
-  gate->optimizations = SSE2_OPT | AES_OPT;
-  gate->get_max64     = (void*)&get_max64_0x1ffff;
-  gate->scanhash      = (void*)&scanhash_tribus;
-  gate->hash          = (void*)&tribus_hash;
-  return true;
-};

--- a/algo/whirlpool/md-helper-4way.c
+++ b/algo/whirlpool/md-helper-4way.c
@@ -0,0 +1,291 @@
+/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * This file contains some functions which implement the external data
+ * handling and padding for Merkle-Damgard hash functions which follow
+ * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
+ *
+ * API: this file is meant to be included, not compiled as a stand-alone
+ * file. Some macros must be defined:
+ *   RFUN   name for the round function
+ *   HASH   "short name" for the hash function
+ *   BE32   defined for big-endian, 32-bit based (e.g. SHA-1)
+ *   LE32   defined for little-endian, 32-bit based (e.g. MD5)
+ *   BE64   defined for big-endian, 64-bit based (e.g. SHA-512)
+ *   LE64   defined for little-endian, 64-bit based (no example yet)
+ *   PW01   if defined, append 0x01 instead of 0x80 (for Tiger)
+ *   BLEN   if defined, length of a message block (in bytes)
+ *   PLW1   if defined, length is defined on one 64-bit word only (for Tiger)
+ *   PLW4   if defined, length is defined on four 64-bit words (for WHIRLPOOL)
+ *   SVAL   if defined, reference to the context state information
+ *
+ * BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
+ * this is used for instance for Tiger, which works on 64-bit words but
+ * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
+ * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
+ * set, then only one word (64 bits) will be used to encode the input
+ * message length (in bits), otherwise two words will be used (as in
+ * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
+ * not PLW1), four 64-bit words will be used to encode the message length
+ * (in bits). Note that regardless of those settings, only 64-bit message
+ * lengths are supported (in bits): messages longer than 2 Exabytes will be
+ * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
+ * 2 millions Terabytes, which is huge).
+ *
+ * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
+ * function. This is used for Tiger2, which is identical to Tiger except
+ * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
+ * of the 0x01 from original Tiger).
+ *
+ * The RFUN function is invoked with two arguments, the first pointing to
+ * aligned data (as a "const void *"), the second being state information
+ * from the context structure. By default, this state information is the
+ * "val" field from the context, and this field is assumed to be an array
+ * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
+ * The "val" field can have any type, except
+ * for the output encoding which assumes that it is an array of "sph_u32"
+ * values. By defining NO_OUTPUT, this last step is deactivated; the
+ * includer code is then responsible for writing out the hash result. When
+ * NO_OUTPUT is defined, the third parameter to the "close()" function is
+ * ignored.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)     SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)    a ## b
+
+#undef SPH_BLEN
+#undef SPH_WLEN
+#if defined BE64 || defined LE64
+#define SPH_BLEN    128U
+#define SPH_WLEN      8U
+#else
+#define SPH_BLEN     64U
+#define SPH_WLEN      4U
+#endif
+
+#ifdef BLEN
+#undef SPH_BLEN
+#define SPH_BLEN    BLEN
+#endif
+
+#undef SPH_MAXPAD
+#if defined PLW1
+#define SPH_MAXPAD   (SPH_BLEN - SPH_WLEN)
+#elif defined PLW4
+#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 2))
+#else
+#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 1))
+#endif
+
+#undef SPH_VAL
+#undef SPH_NO_OUTPUT
+#ifdef SVAL
+#define SPH_VAL         SVAL
+#define SPH_NO_OUTPUT   1
+#else
+#define SPH_VAL   sc->val
+#endif
+
+#ifndef CLOSE_ONLY
+
+/*
+ * Buffered update: append `len` bytes (per lane) from `data` to the
+ * context, running RFUN every time the block buffer fills up.
+ * sc->buf is indexed in __m256i units, one per 8 message bytes, so byte
+ * offsets and lengths are shifted right by 3; any length remainder below
+ * 8 bytes is not copied.
+ */
+#ifdef SPH_UPTR
+static void
+SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len )
+#else
+void
+HASH ( void *cc, const void *data, size_t len )
+#endif
+{
+   /* Must be named `sc`: SPH_VAL may expand to an expression using it. */
+   SPH_XCAT( HASH, _context ) *sc = cc;
+   __m256i *src = (__m256i*)data;
+   /* Bytes already pending in the block buffer. */
+   size_t fill = (unsigned)sc->count & (SPH_BLEN - 1U);
+
+   while ( len > 0 )
+   {
+      /* Take as much as fits in the remainder of the current block. */
+      size_t chunk = SPH_BLEN - fill;
+      if ( len < chunk )
+         chunk = len;
+
+      memcpy_m256i( sc->buf + (fill>>3), src, chunk>>3 );
+      src  += chunk>>3;
+      fill += chunk;
+      len  -= chunk;
+
+      if ( fill == SPH_BLEN )
+      {
+         /* Buffer complete: compress it and start a new block. */
+         RFUN( sc->buf, SPH_VAL );
+         fill = 0;
+      }
+      sc->count += chunk;
+   }
+}
+
+#ifdef SPH_UPTR
+/*
+ * Public update entry point (SPH_UPTR builds).
+ *
+ *   cc    context (SPH_XCAT(HASH, _context) *)
+ *   data  input, addressed as __m256i vectors (one per 8 message bytes)
+ *   len   input length in bytes per lane
+ *
+ * Inputs shorter than two blocks go straight to the buffered helper.
+ * Longer inputs first top up a partially filled buffer, then hand the
+ * remainder to the helper.
+ */
+void
+HASH (void *cc, const void *data, size_t len)
+{
+   SPH_XCAT(HASH, _context) *sc;
+   __m256i *vdata = (__m256i*)data;
+   unsigned ptr;
+
+   if ( len < (2 * SPH_BLEN) )
+   {
+      SPH_XCAT(HASH, _short)(cc, data, len);
+      return;
+   }
+   sc = cc;
+   ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
+   if ( ptr > 0 )
+   {
+      unsigned t;
+      /* Bytes needed to complete the block currently being buffered. */
+      t = SPH_BLEN - ptr;
+      SPH_XCAT( HASH, _short )( cc, vdata, t );
+      vdata = vdata + (t>>3);
+      len -= t;
+   }
+   /*
+    * Continue from the advanced position. Passing the original `data`
+    * here would hash the first `t` bytes a second time.
+    */
+   SPH_XCAT( HASH, _short )( cc, vdata, len );
+}
+#endif
+
+#endif
+
+/*
+ * Perform padding and produce result. The context is NOT reinitialized
+ * by this function.
+ *
+ * 4-way variant: sc->buf is addressed in __m256i units, one vector per
+ * 8 bytes of per-lane message, so byte offsets are shifted right by 3.
+ *
+ *   cc     context (SPH_XCAT(HASH, _context) *)
+ *   ub, n  kept for signature parity with the scalar md_helper; ignored
+ *          here, padding always starts on a whole 64-bit word
+ *   dst    receives rnum __m256i state vectors, unless SPH_NO_OUTPUT
+ *   rnum   number of state vectors to copy to dst
+ */
+static void
+SPH_XCAT( HASH, _addbits_and_close )( void *cc, unsigned ub, unsigned n,
+          void *dst, unsigned rnum )
+{
+    SPH_XCAT(HASH, _context) *sc;
+    unsigned ptr, u;
+
+    (void)ub;
+    (void)n;
+    sc = cc;
+    ptr = (unsigned)sc->count & (SPH_BLEN - 1U);
+
+    /*
+     * First padding word. mm256_vec_epi64 presumably replicates the value
+     * into every 64-bit lane -- confirm against its definition.
+     */
+#ifdef PW01
+    sc->buf[ptr>>3] = mm256_vec_epi64( 0x100 >> 8 );
+#else
+    sc->buf[ptr>>3] = mm256_vec_epi64( 0x80 );
+#endif
+    ptr += 8;
+
+    if ( ptr > SPH_MAXPAD )
+    {
+         /* No room left for the length field: flush an extra block. */
+         memset_zero_m256i( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 );
+         RFUN( sc->buf, SPH_VAL );
+         memset_zero_m256i( sc->buf, SPH_MAXPAD >> 3 );
+    }
+    else
+    {
+         memset_zero_m256i( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 );
+    }
+
+    /* Message length in bits, written at the end of the block. */
+#if defined BE64
+#if defined PLW1
+    sc->buf[ SPH_MAXPAD>>3 ] =
+                 mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
+#elif defined PLW4
+    memset_zero_m256i( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 );
+    sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] =
+                mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
+    sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] =
+                mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
+#else
+    /*
+     * Two-word length: only 2 * SPH_WLEN bytes are reserved after
+     * SPH_MAXPAD, so the words sit at +0 and +SPH_WLEN. Offsets of
+     * +2 and +3 words would land past the end of the block.
+     */
+    sc->buf[ SPH_MAXPAD >> 3 ] =
+               mm256_byteswap_epi64( mm256_vec_epi64( sc->count >> 61 ) );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+               mm256_byteswap_epi64( mm256_vec_epi64( sc->count << 3 ) );
+#endif  // PLW
+#else  // LE64
+#if defined PLW1
+    sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
+#elif defined PLW4
+    sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+                       mm256_vec_epi64( sc->count >> 61 );
+    /* Count is in vectors, not bytes, as in the BE64 PLW4 branch. */
+    memset_zero_m256i( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ),
+                       ( 2 * SPH_WLEN ) >> 3 );
+#else
+    sc->buf[ SPH_MAXPAD >> 3 ] = mm256_vec_epi64( sc->count << 3 );
+    sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] =
+                          mm256_vec_epi64( sc->count >> 61 );
+#endif // PLW
+
+#endif // LE64
+
+    RFUN( sc->buf, SPH_VAL );
+
+#ifdef SPH_NO_OUTPUT
+    /* The includer writes out the result itself. */
+    (void)dst;
+    (void)rnum;
+    (void)u;
+#else
+    for ( u = 0; u < rnum; u ++ )
+    {
+#if defined BE64
+       ((__m256i*)dst)[u] = mm256_byteswap_epi64( sc->val[u] );
+#else  // LE64
+       ((__m256i*)dst)[u] = sc->val[u];
+#endif
+    }
+#endif
+}
+
+/*
+ * Close with no extra bits: forwards to the _addbits_and_close helper
+ * with ub = 0 and n = 0. dst and rnum are passed through unchanged.
+ */
+static void
+SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum )
+{
+   SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum );
+}
--- a/algo/whirlpool/md_helper.c
+++ b/algo/whirlpool/md_helper.c
@@ -0,0 +1,369 @@
+/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * This file contains some functions which implement the external data
+ * handling and padding for Merkle-Damgard hash functions which follow
+ * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian).
+ *
+ * API: this file is meant to be included, not compiled as a stand-alone
+ * file. Some macros must be defined:
+ *   RFUN   name for the round function
+ *   HASH   "short name" for the hash function
+ *   BE32   defined for big-endian, 32-bit based (e.g. SHA-1)
+ *   LE32   defined for little-endian, 32-bit based (e.g. MD5)
+ *   BE64   defined for big-endian, 64-bit based (e.g. SHA-512)
+ *   LE64   defined for little-endian, 64-bit based (no example yet)
+ *   PW01   if defined, append 0x01 instead of 0x80 (for Tiger)
+ *   BLEN   if defined, length of a message block (in bytes)
+ *   PLW1   if defined, length is defined on one 64-bit word only (for Tiger)
+ *   PLW4   if defined, length is defined on four 64-bit words (for WHIRLPOOL)
+ *   SVAL   if defined, reference to the context state information
+ *
+ * BLEN is used when a message block is not 16 (32-bit or 64-bit) words:
+ * this is used for instance for Tiger, which works on 64-bit words but
+ * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are
+ * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is
+ * set, then only one word (64 bits) will be used to encode the input
+ * message length (in bits), otherwise two words will be used (as in
+ * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but
+ * not PLW1), four 64-bit words will be used to encode the message length
+ * (in bits). Note that regardless of those settings, only 64-bit message
+ * lengths are supported (in bits): messages longer than 2 Exabytes will be
+ * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about
+ * 2 millions Terabytes, which is huge).
+ *
+ * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close()
+ * function. This is used for Tiger2, which is identical to Tiger except
+ * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead
+ * of the 0x01 from original Tiger).
+ *
+ * The RFUN function is invoked with two arguments, the first pointing to
+ * aligned data (as a "const void *"), the second being state information
+ * from the context structure. By default, this state information is the
+ * "val" field from the context, and this field is assumed to be an array
+ * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
+ * The "val" field can have any type, except
+ * for the output encoding which assumes that it is an array of "sph_u32"
+ * values. By defining NO_OUTPUT, this last step is deactivated; the
+ * includer code is then responsible for writing out the hash result. When
+ * NO_OUTPUT is defined, the third parameter to the "close()" function is
+ * ignored.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)     SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)    a ## b
+
+#undef SPH_BLEN
+#undef SPH_WLEN
+#if defined BE64 || defined LE64
+#define SPH_BLEN    128U
+#define SPH_WLEN      8U
+#else
+#define SPH_BLEN     64U
+#define SPH_WLEN      4U
+#endif
+
+#ifdef BLEN
+#undef SPH_BLEN
+#define SPH_BLEN    BLEN
+#endif
+
+#undef SPH_MAXPAD
+#if defined PLW1
+#define SPH_MAXPAD   (SPH_BLEN - SPH_WLEN)
+#elif defined PLW4
+#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 2))
+#else
+#define SPH_MAXPAD   (SPH_BLEN - (SPH_WLEN << 1))
+#endif
+
+#undef SPH_VAL
+#undef SPH_NO_OUTPUT
+#ifdef SVAL
+#define SPH_VAL         SVAL
+#define SPH_NO_OUTPUT   1
+#else
+#define SPH_VAL   sc->val
+#endif
+
+#ifndef CLOSE_ONLY
+
+/*
+ * Buffered update: copy `len` bytes from `data` into sc->buf and invoke
+ * RFUN each time the buffer holds a full SPH_BLEN-byte block.
+ * With SPH_UPTR defined this is the static "_short" helper; otherwise it
+ * is the public sph_<HASH>() function itself.
+ * The running byte count is kept in sc->count (SPH_64) or in the
+ * count_low/count_high pair (no 64-bit type).
+ */
+#ifdef SPH_UPTR
+static void
+SPH_XCAT(HASH, _short)(void *cc, const void *data, size_t len)
+#else
+void
+SPH_XCAT(sph_, HASH)(void *cc, const void *data, size_t len)
+#endif
+{
+	SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
+	size_t current;
+
+	sc = cc;
+	/* Number of bytes already pending in the block buffer. */
+#if SPH_64
+	current = (unsigned)sc->count & (SPH_BLEN - 1U);
+#else
+	current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
+#endif
+	while (len > 0) {
+		size_t clen;
+#if !SPH_64
+		sph_u32 clow, clow2;
+#endif
+
+		/* Take at most what is needed to complete the current block. */
+		clen = SPH_BLEN - current;
+		if (clen > len)
+			clen = len;
+		memcpy(sc->buf + current, data, clen);
+		data = (const unsigned char *)data + clen;
+		current += clen;
+		len -= clen;
+		if (current == SPH_BLEN) {
+			RFUN(sc->buf, SPH_VAL);
+			current = 0;
+		}
+#if SPH_64
+		sc->count += clen;
+#else
+		/* 32-bit counter pair: carry into count_high on wrap-around. */
+		clow = sc->count_low;
+		clow2 = SPH_T32(clow + clen);
+		sc->count_low = clow2;
+		if (clow2 < clow)
+			sc->count_high ++;
+#endif
+	}
+}
+
+#ifdef SPH_UPTR
+/*
+ * Public update function for SPH_UPTR builds.
+ * Inputs shorter than two blocks are delegated to the buffered "_short"
+ * helper. Longer inputs first complete any partially filled buffer, then
+ * feed whole blocks to RFUN directly from the caller's memory; the tail
+ * shorter than one block is stored in sc->buf.
+ */
+void
+SPH_XCAT(sph_, HASH)(void *cc, const void *data, size_t len)
+{
+	SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
+	unsigned current;
+	size_t orig_len;
+#if !SPH_64
+	sph_u32 clow, clow2;
+#endif
+
+	if (len < (2 * SPH_BLEN)) {
+		SPH_XCAT(HASH, _short)(cc, data, len);
+		return;
+	}
+	sc = cc;
+#if SPH_64
+	current = (unsigned)sc->count & (SPH_BLEN - 1U);
+#else
+	current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
+#endif
+	if (current > 0) {
+		unsigned t;
+
+		/* Complete the pending block through the buffered path. */
+		t = SPH_BLEN - current;
+		SPH_XCAT(HASH, _short)(cc, data, t);
+		data = (const unsigned char *)data + t;
+		len -= t;
+	}
+#if !SPH_UNALIGNED
+	/* Data not aligned on a word boundary: use the copying path. */
+	if (((SPH_UPTR)data & (SPH_WLEN - 1U)) != 0) {
+		SPH_XCAT(HASH, _short)(cc, data, len);
+		return;
+	}
+#endif
+	orig_len = len;
+	/* Process whole blocks in place, without copying into sc->buf. */
+	while (len >= SPH_BLEN) {
+		RFUN(data, SPH_VAL);
+		len -= SPH_BLEN;
+		data = (const unsigned char *)data + SPH_BLEN;
+	}
+	/* Keep the remaining partial block for the next call. */
+	if (len > 0)
+		memcpy(sc->buf, data, len);
+#if SPH_64
+	sc->count += (sph_u64)orig_len;
+#else
+	clow = sc->count_low;
+	clow2 = SPH_T32(clow + orig_len);
+	sc->count_low = clow2;
+	if (clow2 < clow)
+		sc->count_high ++;
+	/*
+	 * This code handles the improbable situation where "size_t" is
+	 * greater than 32 bits, and yet we do not have a 64-bit type.
+	 * The three shifts add up to 32; presumably split so that no single
+	 * shift equals the full width of a 32-bit size_t.
+	 */
+	orig_len >>= 12;
+	orig_len >>= 10;
+	orig_len >>= 10;
+	sc->count_high += orig_len;
+#endif
+}
+#endif
+
+#endif
+
+/*
+ * Perform padding and produce result. The context is NOT reinitialized
+ * by this function.
+ *
+ *   cc    context (sph_<HASH>_context *)
+ *   ub    extra bits to append before the padding bit; only the top n
+ *         bits are used
+ *   n     number of extra bits (presumably 0 to 7 -- not checked here)
+ *   dst   output buffer; unused when SPH_NO_OUTPUT is defined
+ *   rnum  number of state words to encode into dst
+ */
+static void
+SPH_XCAT(HASH, _addbits_and_close)(void *cc,
+	unsigned ub, unsigned n, void *dst, unsigned rnum)
+{
+	SPH_XCAT(sph_, SPH_XCAT(HASH, _context)) *sc;
+	unsigned current, u;
+#if !SPH_64
+	sph_u32 low, high;
+#endif
+
+	sc = cc;
+#if SPH_64
+	current = (unsigned)sc->count & (SPH_BLEN - 1U);
+#else
+	current = (unsigned)sc->count_low & (SPH_BLEN - 1U);
+#endif
+
+	/* Append the extra bits followed by the padding marker bit. */
+#ifdef PW01
+	sc->buf[current ++] = (0x100 | (ub & 0xFF)) >> (8 - n);
+#else
+	{
+		unsigned z;
+
+		z = 0x80 >> n;
+		sc->buf[current ++] = ((ub & -z) | z) & 0xFF;
+	}
+#endif
+
+	if (current > SPH_MAXPAD) {
+		/* No room left for the length field: flush an extra block. */
+		memset(sc->buf + current, 0, SPH_BLEN - current);
+		RFUN(sc->buf, SPH_VAL);
+		memset(sc->buf, 0, SPH_MAXPAD);
+	} else {
+		memset(sc->buf + current, 0, SPH_MAXPAD - current);
+	}
+
+	/* Encode the message length in bits at the end of the block. */
+#if defined BE64
+#if defined PLW1
+	sph_enc64be_aligned(sc->buf + SPH_MAXPAD,
+		SPH_T64(sc->count << 3) + (sph_u64)n);
+#elif defined PLW4
+	memset(sc->buf + SPH_MAXPAD, 0, 2 * SPH_WLEN);
+	sph_enc64be_aligned(sc->buf + SPH_MAXPAD + 2 * SPH_WLEN,
+		sc->count >> 61);
+	sph_enc64be_aligned(sc->buf + SPH_MAXPAD + 3 * SPH_WLEN,
+		SPH_T64(sc->count << 3) + (sph_u64)n);
+#else
+	sph_enc64be_aligned(sc->buf + SPH_MAXPAD, sc->count >> 61);
+	sph_enc64be_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN,
+		SPH_T64(sc->count << 3) + (sph_u64)n);
+#endif
+#elif defined LE64
+#if defined PLW1
+	sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
+		SPH_T64(sc->count << 3) + (sph_u64)n);
+#elif defined PLW4
+	/*
+	 * Four-word length field. This branch previously tested PLW1 a
+	 * second time and could never be selected.
+	 */
+	sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
+		SPH_T64(sc->count << 3) + (sph_u64)n);
+	sph_enc64le_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, sc->count >> 61);
+	memset(sc->buf + SPH_MAXPAD + 2 * SPH_WLEN, 0, 2 * SPH_WLEN);
+#else
+	sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
+		SPH_T64(sc->count << 3) + (sph_u64)n);
+	sph_enc64le_aligned(sc->buf + SPH_MAXPAD + SPH_WLEN, sc->count >> 61);
+#endif
+#else
+#if SPH_64
+#ifdef BE32
+	sph_enc64be_aligned(sc->buf + SPH_MAXPAD,
+		SPH_T64(sc->count << 3) + (sph_u64)n);
+#else
+	sph_enc64le_aligned(sc->buf + SPH_MAXPAD,
+		SPH_T64(sc->count << 3) + (sph_u64)n);
+#endif
+#else
+	/* No 64-bit type: build the bit length from the 32-bit pair. */
+	low = sc->count_low;
+	high = SPH_T32((sc->count_high << 3) | (low >> 29));
+	low = SPH_T32(low << 3) + (sph_u32)n;
+#ifdef BE32
+	sph_enc32be(sc->buf + SPH_MAXPAD, high);
+	sph_enc32be(sc->buf + SPH_MAXPAD + SPH_WLEN, low);
+#else
+	sph_enc32le(sc->buf + SPH_MAXPAD, low);
+	sph_enc32le(sc->buf + SPH_MAXPAD + SPH_WLEN, high);
+#endif
+#endif
+#endif
+
+	RFUN(sc->buf, SPH_VAL);
+
+#ifdef SPH_NO_OUTPUT
+	/* The includer writes out the result itself. */
+	(void)dst;
+	(void)rnum;
+	(void)u;
+#else
+	for (u = 0; u < rnum; u ++) {
+#if defined BE64
+		sph_enc64be((unsigned char *)dst + 8 * u, sc->val[u]);
+#elif defined LE64
+		sph_enc64le((unsigned char *)dst + 8 * u, sc->val[u]);
+#elif defined BE32
+		sph_enc32be((unsigned char *)dst + 4 * u, sc->val[u]);
+#else
+		sph_enc32le((unsigned char *)dst + 4 * u, sc->val[u]);
+#endif
+	}
+#endif
+}
+
+/*
+ * Close with no extra bits: forwards to the _addbits_and_close helper
+ * with ub = 0 and n = 0. dst and rnum are passed through unchanged.
+ */
+static void
+SPH_XCAT(HASH, _close)(void *cc, void *dst, unsigned rnum)
+{
+	SPH_XCAT(HASH, _addbits_and_close)(cc, 0, 0, dst, rnum);
+}
--- a/algo/whirlpool/sph_whirlpool.c
+++ b/algo/whirlpool/sph_whirlpool.c
@@ -3441,19 +3441,19 @@ ROUND_FUN(whirlpool1, old1)

 #define RFUN   whirlpool_round
 #define HASH   whirlpool
-#include "algo/sha/md_helper.c"
+#include "md_helper.c"
 #undef RFUN
 #undef HASH

 #define RFUN   whirlpool0_round
 #define HASH   whirlpool0
-#include "algo/sha/md_helper.c"
+#include "md_helper.c"
 #undef RFUN
 #undef HASH

 #define RFUN   whirlpool1_round
 #define HASH   whirlpool1
-#include "algo/sha/md_helper.c"
+#include "md_helper.c"
 #undef RFUN
 #undef HASH

@@ -3463,7 +3463,6 @@ sph_ ## name ## _close(void *cc, void *dst) \
 { \
 	sph_ ## name ## _context *sc; \
 	int i; \
- \
 	name ## _close(cc, dst, 0); \
 	sc = cc; \
 	for (i = 0; i < 8; i ++) \
--- a/algo/whirlpool/sph_whirlpool.c.bak
+++ b/algo/whirlpool/sph_whirlpool.c.bak
--- a/algo/whirlpool/sph_whirlpool.h.bak
+++ b/algo/whirlpool/sph_whirlpool.h.bak
@@ -0,0 +1,209 @@
+/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * WHIRLPOOL interface.
+ *
+ * WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
+ * version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
+ * (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
+ * version, 2003, with a new diffusion matrix, also described as "plain
+ * WHIRLPOOL"). All three variants are implemented here.
+ *
+ * The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
+ * M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
+ * NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
+ *
+ * The current WHIRLPOOL specification and a reference implementation
+ * can be found on the WHIRLPOOL web page:
+ * http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_whirlpool.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_WHIRLPOOL_H__
+#define SPH_WHIRLPOOL_H__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for WHIRLPOOL.
+ */
+#define SPH_SIZE_whirlpool   512
+
+/**
+ * Output size (in bits) for WHIRLPOOL-0.
+ */
+#define SPH_SIZE_whirlpool0   512
+
+/**
+ * Output size (in bits) for WHIRLPOOL-1.
+ */
+#define SPH_SIZE_whirlpool1   512
+
+/**
+ * This structure is a context for WHIRLPOOL computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a WHIRLPOOL computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running WHIRLPOOL computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	sph_u64 state[8];
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_whirlpool_context;
+
+/**
+ * Initialize a WHIRLPOOL context. This process performs no memory allocation.
+ *
+ * @param cc   the WHIRLPOOL context (pointer to a
+ *             <code>sph_whirlpool_context</code>)
+ */
+void sph_whirlpool_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing). This function applies the
+ * plain WHIRLPOOL algorithm.
+ *
+ * @param cc     the WHIRLPOOL context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_whirlpool(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current WHIRLPOOL computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the WHIRLPOOL context
+ * @param dst   the destination buffer
+ */
+void sph_whirlpool_close(void *cc, void *dst);
+
+/**
+ * WHIRLPOOL-0 uses the same structure as plain WHIRLPOOL.
+ */
+typedef sph_whirlpool_context sph_whirlpool0_context;
+
+#ifdef DOXYGEN_IGNORE
+/**
+ * Initialize a WHIRLPOOL-0 context. This function is identical to
+ * <code>sph_whirlpool_init()</code>.
+ *
+ * @param cc   the WHIRLPOOL context (pointer to a
+ *             <code>sph_whirlpool0_context</code>)
+ */
+void sph_whirlpool0_init(void *cc);
+#endif
+
+#ifndef DOXYGEN_IGNORE
+#define sph_whirlpool0_init   sph_whirlpool_init
+#endif
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing). This function applies the
+ * WHIRLPOOL-0 algorithm.
+ *
+ * @param cc     the WHIRLPOOL context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_whirlpool0(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current WHIRLPOOL-0 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the WHIRLPOOL-0 context
+ * @param dst   the destination buffer
+ */
+void sph_whirlpool0_close(void *cc, void *dst);
+
+/**
+ * WHIRLPOOL-1 uses the same structure as plain WHIRLPOOL.
+ */
+typedef sph_whirlpool_context sph_whirlpool1_context;
+
+#ifdef DOXYGEN_IGNORE
+/**
+ * Initialize a WHIRLPOOL-1 context. This function is identical to
+ * <code>sph_whirlpool_init()</code>.
+ *
+ * @param cc   the WHIRLPOOL context (pointer to a
+ *             <code>sph_whirlpool1_context</code>)
+ */
+void sph_whirlpool1_init(void *cc);
+#endif
+
+#ifndef DOXYGEN_IGNORE
+#define sph_whirlpool1_init   sph_whirlpool_init
+#endif
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing). This function applies the
+ * WHIRLPOOL-1 algorithm.
+ *
+ * @param cc     the WHIRLPOOL context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_whirlpool1(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current WHIRLPOOL-1 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the WHIRLPOOL-1 context
+ * @param dst   the destination buffer
+ */
+void sph_whirlpool1_close(void *cc, void *dst);
+
+#endif
+
+#endif
--- a/algo/whirlpool/whirlpool-4way.c
+++ b/algo/whirlpool/whirlpool-4way.c
@@ -0,0 +1,131 @@
+#include "whirlpool-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "sph_whirlpool.h"
+#include "whirlpool-hash-4way.h"
+
+#if defined(__AVX2__)
+
+static __thread whirlpool_4way_context whirl_mid;
+
+// Hash four interleaved 80-byte inputs. The first whirlpool1_4way pass is
+// resumed from the cached midstate in whirl_mid (which already covers the
+// first 64 bytes of each lane), followed by three more init/update/close
+// passes over the 64-byte result. The first 32 bytes of each lane's final
+// hash are written to state at offsets 0, 32, 64 and 96.
+void whirlpool_hash_4way( void *state, const void *input )
+{
+     enum { MIDLEN = 64, TAIL = 80 - MIDLEN, EXTRA_PASSES = 3, LANES = 4 };
+     uint64_t lane_hash[LANES][8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     whirlpool_4way_context ctx;
+
+     // Resume from the midstate and absorb the remaining tail bytes; the
+     // interleaved input advances 4 bytes per message byte.
+     memcpy( &ctx, &whirl_mid, sizeof ctx );
+     whirlpool1_4way( &ctx, (const char*)input + (MIDLEN<<2), TAIL );
+     whirlpool1_4way_close( &ctx, vhash );
+
+     for ( int pass = 0; pass < EXTRA_PASSES; pass++ )
+     {
+        whirlpool1_4way_init( &ctx );
+        whirlpool1_4way( &ctx, vhash, 64 );
+        whirlpool1_4way_close( &ctx, vhash );
+     }
+
+     m256_deinterleave_4x64( lane_hash[0], lane_hash[1], lane_hash[2],
+                             lane_hash[3], vhash, 512 );
+
+     for ( int lane = 0; lane < LANES; lane++ )
+        memcpy( (char*)state + 32*lane, lane_hash[lane], 32 );
+}
+
+// Scan nonces four at a time starting at work->data[19].
+//
+//   thr_id       index into work_restart[] for this thread
+//   work         job; data/target are read, nonces[]/nfound[] are written
+//   max_nonce    scanning stops once the nonce reaches this value
+//   hashes_done  out: set to n - first_nonce + 1 on return
+//
+// Returns the number of lanes whose hash met the target in the final
+// batch (0 if the scan ended without a hit).
+//
+// hashes_done is uint64_t* to match the other scanhash prototypes; an
+// `unsigned long *` would store only 32 bits on LLP64 targets.
+int scanhash_whirlpool_4way( int thr_id, struct work* work, uint32_t max_nonce,
+                             uint64_t *hashes_done )
+{
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t* pdata = work->data;
+   uint32_t* ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   // Nonce word of lane 0 in the interleaved buffer (9*8 + 1); each
+   // following lane's nonce is 2 words further on.
+   uint32_t *noncep = vdata + 73;
+
+   for ( int i = 0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+   // Nonce slot: define it so the interleave below never reads an
+   // indeterminate value; it is overwritten per lane in the loop.
+   endiandata[19] = 0;
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   // midstate: absorb the first 64 bytes once, reused for every nonce
+   whirlpool1_4way_init( &whirl_mid );
+   whirlpool1_4way( &whirl_mid, vdata, 64 );
+
+   do {
+     const uint32_t Htarg = ptarget[7];
+
+     for ( int lane = 0; lane < 4; lane++ )
+     {
+        found[lane] = false;
+        be32enc( noncep + 2*lane, n + lane );
+     }
+
+     whirlpool_hash_4way( hash, vdata );
+
+     pdata[19] = n;
+     for ( int lane = 0; lane < 4; lane++ )
+     {
+        // Each lane's 8-word hash is stored consecutively in hash[].
+        uint32_t *lane_hash = hash + 8*lane;
+        if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+        {
+           found[lane] = true;
+           num_found++;
+           nonces[lane] = n + lane;
+           work_set_target_ratio( work, lane_hash );
+        }
+     }
+     n += 4;
+
+   } while ( ( num_found == 0 ) && ( n < max_nonce )
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/whirlpool/whirlpool-gate.c
+++ b/algo/whirlpool/whirlpool-gate.c
@@ -0,0 +1,15 @@
+#include "whirlpool-gate.h"
+// Install the scalar whirlpool scanhash/hash handlers in the gate; the 4-way ones stay disabled.
+bool register_whirlpool_algo( algo_gate_t* gate )
+{
+//#if defined (WHIRLPOOL_4WAY)
+//  gate->scanhash  = (void*)&scanhash_whirlpool_4way;
+//  gate->hash      = (void*)&whirlpool_hash_4way;
+//#else
+  gate->scanhash  = (void*)&scanhash_whirlpool;
+  gate->hash      = (void*)&whirlpool_hash;
+  init_whirlpool_ctx();
+//#endif
+  return true;
+}
+
--- a/algo/whirlpool/whirlpool-gate.h
+++ b/algo/whirlpool/whirlpool-gate.h
@@ -0,0 +1,24 @@
+#ifndef WHIRLPOOL_GATE_H__
+#define WHIRLPOOL_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+// WHIRLPOOL_4WAY is defined only when FOUR_WAY is requested and the target has AVX2.
+#if defined(FOUR_WAY) && defined(__AVX2__)
+  #define WHIRLPOOL_4WAY
+#endif
+// The 4-way declarations below are currently disabled (commented out).
+//#if defined (WHIRLPOOL_4WAY)
+
+//void whirlpool_hash_4way(void *state, const void *input);
+
+//int scanhash_whirlpool_4way( int thr_id, struct work *work, uint32_t max_nonce,
+//                              uint64_t *hashes_done );
+//#endif
+// Scalar whirlpool entry points, declared unconditionally.
+void whirlpool_hash( void *state, const void *input );
+
+int scanhash_whirlpool( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done );
+#endif  // WHIRLPOOL_GATE_H__
+
--- a/algo/whirlpool/whirlpool-hash-4way.c
+++ b/algo/whirlpool/whirlpool-hash-4way.c
--- a/algo/whirlpool/whirlpool-hash-4way.h
+++ b/algo/whirlpool/whirlpool-hash-4way.h
@@ -0,0 +1,108 @@
+/* $Id: sph_whirlpool.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * WHIRLPOOL interface.
+ *
+ * WHIRLPOOL knows three variants, dubbed "WHIRLPOOL-0" (original
+ * version, published in 2000, studied by NESSIE), "WHIRLPOOL-1"
+ * (first revision, 2001, with a new S-box) and "WHIRLPOOL" (current
+ * version, 2003, with a new diffusion matrix, also described as "plain
+ * WHIRLPOOL"). All three variants are implemented here.
+ *
+ * The original WHIRLPOOL (i.e. WHIRLPOOL-0) was published in: P. S. L.
+ * M. Barreto, V. Rijmen, "The Whirlpool Hashing Function", First open
+ * NESSIE Workshop, Leuven, Belgium, November 13--14, 2000.
+ *
+ * The current WHIRLPOOL specification and a reference implementation
+ * can be found on the WHIRLPOOL web page:
+ * http://paginas.terra.com.br/informatica/paulobarreto/WhirlpoolPage.html
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_whirlpool.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef WHIRLPOOL_HASH_4WAY_H__
+#define WHIRLPOOL_HASH_4WAY_H__
+
+#ifdef __AVX2__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+/**
+ * Output size (in bits) for WHIRLPOOL.
+ */
+#define SPH_SIZE_whirlpool   512
+
+/**
+ * Output size (in bits) for WHIRLPOOL-0.
+ */
+#define SPH_SIZE_whirlpool0   512
+
+/**
+ * Output size (in bits) for WHIRLPOOL-1.
+ */
+#define SPH_SIZE_whirlpool1   512
+// Context type shared by all 4-way WHIRLPOOL variants declared in this header.
+typedef struct {
+    __m256i buf[8] __attribute__ ((aligned (64)));   // presumably the pending input block - TODO confirm
+    __m256i state[8];   // presumably the running hash state - TODO confirm
+    sph_u64 count;      // presumably an input length counter - TODO confirm units
+} whirlpool_4way_context;
+// Set up a context; cc presumably points to a whirlpool_4way_context - TODO confirm.
+void whirlpool_4way_init( void *cc );
+// Feed data into the context; layout of data and units of len are not visible here - TODO confirm.
+void whirlpool_4way( void *cc, const void *data, size_t len );
+// Finish hashing and store the result at dst; output layout is not visible here - TODO confirm.
+void whirlpool_4way_close( void *cc, void *dst );
+
+/**
+ * WHIRLPOOL-0 uses the same structure as plain WHIRLPOOL.
+ */
+typedef whirlpool_4way_context whirlpool0_4way_context;
+// WHIRLPOOL-0 has no init of its own; the name is an alias for the plain init.
+#define whirlpool0_4way_init whirlpool_4way_init
+
+void whirlpool0_4way( void *cc, const void *data, size_t len );
+
+void whirlpool0_4way_close( void *cc, void *dst );
+
+/**
+ * WHIRLPOOL-1 uses the same structure as plain WHIRLPOOL.
+ */
+typedef whirlpool_4way_context whirlpool1_4way_context;
+// WHIRLPOOL-1 has no init of its own; the name is an alias for the plain init.
+#define whirlpool1_4way_init whirlpool_4way_init
+
+void whirlpool1_4way(void *cc, const void *data, size_t len);
+
+void whirlpool1_4way_close(void *cc, void *dst);
+
+#endif
+
+#endif
--- a/algo/whirlpool/whirlpool.c
+++ b/algo/whirlpool/whirlpool.c
@@ -1,5 +1,4 @@
-#include "algo-gate-api.h"
-
+#include "whirlpool-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -58,7 +57,8 @@ void whirlpool_midstate( const void* input )
 }


-int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+int scanhash_whirlpool( int thr_id, struct work* work, uint32_t max_nonce,
+                        uint64_t *hashes_done )
 {
 	uint32_t _ALIGN(128) endiandata[20];
 	uint32_t* pdata = work->data;
@@ -66,8 +66,8 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	const uint32_t first_nonce = pdata[19];
        uint32_t n = first_nonce - 1;

-//	if (opt_benchmark)
-//		((uint32_t*)ptarget)[7] = 0x0000ff;
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0000ff;

        for (int i=0; i < 19; i++)
                be32enc(&endiandata[i], pdata[i]);
@@ -83,7 +83,7 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign

 		if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
                {
-//			work_set_target_ratio(work, vhash);
+			work_set_target_ratio(work, vhash);
                       *hashes_done = n - first_nonce + 1;
 			return true;
 		}
@@ -95,11 +95,3 @@ int scanhash_whirlpool(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	return 0;
 }

-bool register_whirlpool_algo( algo_gate_t* gate )
-{
-  gate->scanhash  = (void*)&scanhash_whirlpool;
-  gate->hash      = (void*)&whirlpool_hash;
-  init_whirlpool_ctx();
-  return true;
-};
-
--- a/algo/whirlpool/whirlpoolx.c
+++ b/algo/whirlpool/whirlpoolx.c
@@ -24,7 +24,8 @@ void whirlpoolx_hash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }

-int scanhash_whirlpoolx(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+int scanhash_whirlpoolx( int thr_id, struct work* work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
 {
 	uint32_t _ALIGN(128) endiandata[20];
 	uint32_t* pdata = work->data;
@@ -32,8 +33,8 @@ int scanhash_whirlpoolx(int thr_id, struct work* work, uint32_t max_nonce, unsig
 	const uint32_t first_nonce = pdata[19];
        uint32_t n = first_nonce - 1;

-//	if (opt_benchmark)
-//		((uint32_t*)ptarget)[7] = 0x0000ff;
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0000ff;

        for (int i=0; i < 19; i++)
                be32enc(&endiandata[i], pdata[i]);
@@ -47,7 +48,7 @@ int scanhash_whirlpoolx(int thr_id, struct work* work, uint32_t max_nonce, unsig

 		if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
                {
-//			work_set_target_ratio(work, vhash);
+			work_set_target_ratio(work, vhash);
                       *hashes_done = n - first_nonce + 1;
 			return true;
 		}
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -1,3 +1,6 @@
+#ifndef AVXDEFS_H__
+#define AVXDEFS_H__
+
 // Some tools to help using AVX and AVX2
 // AVX support is required to include this header file, AVX2 optional.

@@ -39,7 +42,23 @@ uint8_t   v8 [16];

 #if defined (__AVX2__)

-// AVX2 replacements for vectorized data
+// AVX2 implementations of
+//   vector versions of common scalar functions
+//   vector handling, indexing, pointer arithmetic
+//   vector scalar conversion
+//   overlay (union) handling for all integer data types
+
+// Broadcast one 64 bit scalar into all four lanes of a 256 bit vector,
+// so callers do not have to spell the value out once per lane.
+inline __m256i mm256_vec_epi64( uint64_t val )
+{
+   return _mm256_set1_epi64x( val );
+}
+// Same idea for 32 bit data: replicate val into all eight lanes.
+inline __m256i mm256_vec_epi32( uint32_t val )
+{
+   return _mm256_set1_epi32( val );
+}

 // n = number of __m256i (32 bytes)
 inline void memset_zero_m256i( __m256i *dst, int n )
@@ -282,8 +301,8 @@ inline __m256i  mm256_byteswap_epi32( __m256i x )
          _mm256_set_epi32( 0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000,
                            0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000 ) );
  __m256i x0 = _mm256_slli_epi32( x, 24 );   // x0 = x << 24
-          x1 = _mm256_slli_epi32( x1, 8 );   // x1 = mask(x) << 8
-          x2 = _mm256_srli_epi32( x2, 8 );   // x2 = mask(x) >> 8
+          x1 = _mm256_slli_epi32( x1, 8 );   // x1 = mask1(x) << 8
+          x2 = _mm256_srli_epi32( x2, 8 );   // x2 = mask2(x) >> 8
  __m256i x3 = _mm256_srli_epi32( x, 24 );   // x3 = x >> 24
  return _mm256_or_si256( _mm256_or_si256( x0, x1 ),
                          _mm256_or_si256( x2, x3 ) );
@@ -318,6 +337,11 @@ inline __m256i mm256_byteswap_epi64( __m256i x )
  return x;
 }

+// vectorized version of the ~ operator: XOR with an all-ones vector flips every bit of x
+#define mm256_bitnot( x ) \
+   _mm256_xor_si256( (x), _mm256_set_epi64x( 0xffffffffffffffff, \
+             0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff ) )
+
 #endif  // AVX2

 // AVX replacements for vectorized data
@@ -614,12 +638,65 @@ inline void m256_deinterleave_4x64( uint64_t *dst0, uint64_t *dst1,
   for ( int i = 0; i < bit_len>>6; i++, s += 4 )
  {
     *(dst0+i) = *s;
-     *(dst1+i) = *(s+1);    
-     *(dst2+i) = *(s+2);   
-     *(dst3+i) = *(s+3);   
+     *(dst1+i) = *(s+1);
+     *(dst2+i) = *(s+2);
+     *(dst3+i) = *(s+3);
  }
 }

+// Optimized, loop-free 4x64 interleave written with AVX2 intrinsics.
+// Only 512 bit and 640 bit lengths are supported; any bit_len other than
+// 512 is handled as 640. All data must be aligned to 256 bits.
+// 640 bit data needs padding for overrun to 768.
+// d[i] receives word i of src0..src3, with src0 in the lowest 64 bits.
+inline void m256_interleave_4x64x( uint64_t *dst, uint64_t *src0,
+              uint64_t *src1, uint64_t *src2, uint64_t *src3, int bit_len )
+{
+   __m256i* d = (__m256i*)dst;
+   // words 0-7 are written for both supported lengths
+   d[0] = _mm256_set_epi64x( src3[0], src2[0], src1[0], src0[0] );
+   d[1] = _mm256_set_epi64x( src3[1], src2[1], src1[1], src0[1] );
+   d[2] = _mm256_set_epi64x( src3[2], src2[2], src1[2], src0[2] );
+   d[3] = _mm256_set_epi64x( src3[3], src2[3], src1[3], src0[3] );
+
+   d[4] = _mm256_set_epi64x( src3[4], src2[4], src1[4], src0[4] );
+   d[5] = _mm256_set_epi64x( src3[5], src2[5], src1[5], src0[5] );
+   d[6] = _mm256_set_epi64x( src3[6], src2[6], src1[6], src0[6] );
+   d[7] = _mm256_set_epi64x( src3[7], src2[7], src1[7], src0[7] );
+
+   if ( bit_len == 512 ) return;
+   // words 8-9 are written only when bit_len is not 512
+   d[8] = _mm256_set_epi64x( src3[8], src2[8], src1[8], src0[8] );
+   d[9] = _mm256_set_epi64x( src3[9], src2[9], src1[9], src0[9] );
+}
+// Split a 4x64 interleaved buffer (512 or 640 bit per lane) into dst0..dst3.
+inline void m256_deinterleave_4x64x( uint64_t *dst0, uint64_t *dst1,
+                uint64_t *dst2, uint64_t *dst3, uint64_t *src, int bit_len )
+{
+   __m256i* d0 = (__m256i*)dst0;
+   __m256i* d1 = (__m256i*)dst1;
+   __m256i* d2 = (__m256i*)dst2;
+   __m256i* d3 = (__m256i*)dst3;
+   // words 0-3 of each lane; lane k takes every 4th source word starting at k
+   d0[0] = _mm256_set_epi64x( src[12], src[ 8], src[ 4], src[ 0] );
+   d1[0] = _mm256_set_epi64x( src[13], src[ 9], src[ 5], src[ 1] );
+   d2[0] = _mm256_set_epi64x( src[14], src[10], src[ 6], src[ 2] );
+   d3[0] = _mm256_set_epi64x( src[15], src[11], src[ 7], src[ 3] );
+   // words 4-7 of each lane
+   d0[1] = _mm256_set_epi64x( src[28], src[24], src[20], src[16] );
+   d1[1] = _mm256_set_epi64x( src[29], src[25], src[21], src[17] );
+   d2[1] = _mm256_set_epi64x( src[30], src[26], src[22], src[18] );
+   d3[1] = _mm256_set_epi64x( src[31], src[27], src[23], src[19] );
+
+   if ( bit_len == 512 ) return;
+   // words 8-9 come from src; the 256 bit store also covers words 10-11 of each
+   // dst (the overrun padding), so those are read back and stored unchanged.
+   d0[2] = _mm256_set_epi64x( dst0[11], dst0[10], src[36], src[32] );
+   d1[2] = _mm256_set_epi64x( dst1[11], dst1[10], src[37], src[33] );
+   d2[2] = _mm256_set_epi64x( dst2[11], dst2[10], src[38], src[34] );
+   d3[2] = _mm256_set_epi64x( dst3[11], dst3[10], src[39], src[35] );
+}
+
 // interleave 8 arrays of 32 bit elements for AVX2 processing
 // bit_len must be multiple of 32
 inline void m256_interleave_8x32( uint32_t *dst, uint32_t *src0,
@@ -718,7 +795,8 @@ inline void m128_interleave_4x32( uint32_t *dst, uint32_t *src0,
 // deinterleave 4 arrays into individual buffers for scalarm processing
 // bit_len must be multiple of 32
 inline void m128_deinterleave_4x32( uint32_t *dst0, uint32_t *dst1,
-                uint32_t *dst2,uint32_t *dst3, uint32_t *src, int bit_len )
+                uint32_t *dst2,uint32_t *dst3, uint32_t *src,
+                int bit_len )
 {
  uint32_t *s = src;
  for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
@@ -730,4 +808,5 @@ inline void m128_deinterleave_4x32( uint32_t *dst0, uint32_t *dst1,
  }
 }

+#endif

--- a/build-4way.sh
+++ b/build-4way.sh
@@ -18,7 +18,8 @@ rm -f config.status
 # Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
 #extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"

-CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall -DFOUR_WAY"  ./configure --with-curl
+#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl

 make -j 4

--- a/build.sh
+++ b/build.sh
@@ -18,7 +18,8 @@ rm -f config.status
 # Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
 #extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"

-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
+#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl

 make -j 4

--- a/20
+++ b/20
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.3.
+# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.4.
 #
 #
 # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='cpuminer-opt'
 PACKAGE_TARNAME='cpuminer-opt'
-PACKAGE_VERSION='3.7.3'
-PACKAGE_STRING='cpuminer-opt 3.7.3'
+PACKAGE_VERSION='3.7.4'
+PACKAGE_STRING='cpuminer-opt 3.7.4'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures cpuminer-opt 3.7.3 to adapt to many kinds of systems.
+\`configure' configures cpuminer-opt 3.7.4 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@@ -1392,7 +1392,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of cpuminer-opt 3.7.3:";;
+     short | recursive ) echo "Configuration of cpuminer-opt 3.7.4:";;
   esac
  cat <<\_ACEOF

@@ -1497,7 +1497,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-cpuminer-opt configure 3.7.3
+cpuminer-opt configure 3.7.4
 generated by GNU Autoconf 2.69

 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by cpuminer-opt $as_me 3.7.3, which was
+It was created by cpuminer-opt $as_me 3.7.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  $ $0 $@
@@ -2981,7 +2981,7 @@ fi

 # Define the identity of the package.
 PACKAGE='cpuminer-opt'
- VERSION='3.7.3'
+ VERSION='3.7.4'


 cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by cpuminer-opt $as_me 3.7.3, which was
+This file was extended by cpuminer-opt $as_me 3.7.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-cpuminer-opt config.status 3.7.3
+cpuminer-opt config.status 3.7.4
 configured by $0, generated by GNU Autoconf 2.69,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cpuminer-opt], [3.7.3])
+AC_INIT([cpuminer-opt], [3.7.4])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -1815,14 +1815,30 @@ static void *miner_thread( void *userdata )
             {
                 *algo_gate.get_nonceptr( work.data ) = work.nonces[n]; 
                 if ( !submit_work(mythr, &work) )
+                 {
+                    applog(LOG_WARNING, "Failed to submit share." );
                    break;
+                 }
                 num_submitted++;
             }
-          // must be a ine way algo, nonce is already in work data
+#if defined(FOUR_WAY)
+if (num_submitted>1)
+ applog(LOG_NOTICE,  "4 WAY hash, %u nonces submitted," CL_MAG " BONUS!" CL_WHT, num_submitted);
+else if (num_submitted)
+ applog(LOG_NOTICE,  "4 WAY hash %u nonce submitted", num_submitted);
+#endif
+          // nothing submitted above: must be a one way algo, nonce is already in work data
          if ( !num_submitted )
          {
             if ( !submit_work(mythr, &work) )
+             {
+                applog(LOG_WARNING, "Failed to submit share.");
                break;
+             }
+#if defined(FOUR_WAY)
+applog(LOG_NOTICE,  "1 WAY hash 1 nonce submitted");
+#endif
+
          }

          // prevent stale work in solo
@@ -1836,8 +1852,6 @@ static void *miner_thread( void *userdata )
          }
       }
       // display hashrate
-
-
       if (!opt_quiet)
       {
          char hc[16];
@@ -1846,7 +1860,6 @@ static void *miner_thread( void *userdata )
          char hr_units[2] = {0,0};
          double hashcount = thr_hashcount[thr_id];
          double hashrate  = thr_hashrates[thr_id];
-//printf("display count= %.3f,  tcount= %.3f, rate= %03f trate= %03f\n", hashcount, thr_hashcount[thr_id], hashrate,thr_hashrates[thr_id] );
          if ( hashcount )
          {
             scale_hash_for_display( &hashcount, hc_units );
--- a/miner.h
+++ b/miner.h
@@ -673,19 +673,19 @@ Options:\n\
                          argon2\n\
                          axiom        Shabal-256 MemoHash\n\
                          bastion\n\
-                          blake        Blake-256 (SFR)\n\
+                          blake        blake256r14 (SFR)\n\
                          blakecoin    blake256r8\n\
                          blake2s      Blake-2 S\n\
                          bmw          BMW 256\n\
                          c11          Chaincoin\n\
                          cryptolight  Cryptonight-light\n\
                          cryptonight  cryptonote, Monero (XMR)\n\
-                          decred       Blake256r8dcr\n\
+                          decred       Blake256r14dcr\n\
                          deep         Deepcoin (DCN)\n\
                          dmd-gr       Diamond\n\
                          drop         Dropcoin\n\
                          fresh        Fresh\n\
-                          groestl      dmd-gr, Groestl coin\n\
+                          groestl      Groestl coin\n\
                          heavy        Heavy\n\
                          hmq1725      Espers\n\
                          hodl         Hodlcoin\n\
--- a/winbuild-allarch.sh
+++ b/winbuild-allarch.sh
@@ -3,7 +3,15 @@
 make distclean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA -DFOUR_WAY" ./configure --with-curl
+make -j 4
+strip -s cpuminer.exe
+mv cpuminer.exe cpuminer-4way.exe
+# next pass: same core-avx2 target, built without -DFOUR_WAY
+make clean || echo clean
+rm -f config.status
+./autogen.sh || echo done
+CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
 make -j 4
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-aes-avx2.exe
@@ -11,7 +19,7 @@ mv cpuminer.exe cpuminer-aes-avx2.exe
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" ./configure --with-curl
 make -j 4
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-aes-avx.exe
@@ -19,7 +27,7 @@ mv cpuminer.exe cpuminer-aes-avx.exe
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
 make -j 4
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-aes-sse42.exe
@@ -27,7 +35,7 @@ mv cpuminer.exe cpuminer-aes-sse42.exe
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" ./configure --with-curl
 make -j 4
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-sse42.exe
@@ -35,7 +43,7 @@ mv cpuminer.exe cpuminer-sse42.exe
 make clean || echo clean
 rm -f config.status
 ./autogen.sh || echo done
-CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" ./configure --with-curl
 make -j 4
 strip -s cpuminer.exe
 mv cpuminer.exe cpuminer-sse2.exe
--- a/winbuild.sh
+++ b/winbuild.sh
@@ -18,7 +18,8 @@ rm -f config.status
 # Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
 #extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"

-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
+#CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl

 make -j 4