Jay D Dee
2017-11-20 21:19:15 -05:00
parent ab39e88318
commit 6d1361c87f
46 changed files with 6314 additions and 141 deletions


@@ -5,19 +5,31 @@
 # ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3
 #
-FROM ubuntu:16.04
-RUN BUILD_DEPS="build-essential \
-    libssl-dev \
-    libgmp-dev \
-    libcurl4-openssl-dev \
-    libjansson-dev \
-    automake" && \
-    apt-get update && \
-    apt-get install -y ${BUILD_DEPS}
+# Build
+FROM ubuntu:16.04 as builder
+RUN apt-get update \
+    && apt-get install -y \
+        build-essential \
+        libssl-dev \
+        libgmp-dev \
+        libcurl4-openssl-dev \
+        libjansson-dev \
+        automake \
+    && rm -rf /var/lib/apt/lists/*
 COPY . /app/
 RUN cd /app/ && ./build.sh
-ENTRYPOINT ["/app/cpuminer"]
+# App
+FROM ubuntu:16.04
+RUN apt-get update \
+    && apt-get install -y \
+        libcurl3 \
+        libjansson4 \
+    && rm -rf /var/lib/apt/lists/*
+COPY --from=builder /app/cpuminer .
+ENTRYPOINT ["./cpuminer"]
 CMD ["-h"]


@@ -23,15 +23,11 @@ cpuminer_SOURCES = \
 sysinfos.c \
 algo-gate-api.c\
 algo/groestl/sph_groestl.c \
-algo/skein/sph_skein.c \
 algo/bmw/sph_bmw.c \
 algo/shavite/sph_shavite.c \
 algo/shavite/shavite.c \
 algo/echo/sph_echo.c \
-algo/blake/sph_blake.c \
-algo/blake/sph_blake2b.c \
 algo/heavy/sph_hefty1.c \
-algo/blake/mod_blakecoin.c \
 algo/luffa/sph_luffa.c \
 algo/cubehash/sph_cubehash.c \
 algo/simd/sph_simd.c \
@@ -39,8 +35,6 @@ cpuminer_SOURCES = \
 algo/fugue/sph_fugue.c \
 algo/gost/sph_gost.c \
 algo/jh/sph_jh.c \
-algo/keccak/sph_keccak.c \
-algo/keccak/keccak.c\
 algo/sha/sph_sha2.c \
 algo/sha/sph_sha2big.c \
 algo/shabal/sph_shabal.c \
@@ -63,9 +57,15 @@ cpuminer_SOURCES = \
 algo/argon2/ar2/ar2-scrypt-jane.c \
 algo/argon2/ar2/blake2b.c \
 algo/axiom.c \
+algo/blake/sph_blake.c \
+algo/blake/blake-hash-4way.c \
+algo/blake/blake-gate.c \
 algo/blake/blake.c \
+algo/blake/blake-4way.c \
+algo/blake/sph_blake2b.c \
 algo/blake/blake2b.c \
 algo/blake/blake2s.c \
+algo/blake/mod_blakecoin.c \
 algo/blake/blakecoin.c \
 algo/blake/decred.c \
 algo/blake/pentablake.c \
@@ -92,6 +92,12 @@ cpuminer_SOURCES = \
 algo/hodl/sha512_avx.c \
 algo/hodl/sha512_avx2.c \
 algo/jh/jha.c \
+algo/keccak/sph_keccak.c \
+algo/keccak/keccak.c\
+algo/keccak/keccak-hash-4way.c \
+algo/keccak/keccak-4way.c\
+algo/keccak/keccak-gate.c \
+algo/keccak/sse2/keccak.c \
 algo/lbry.c \
 algo/luffa/luffa.c \
 algo/luffa/sse2/luffa_for_sse2.c \
@@ -101,11 +107,12 @@ cpuminer_SOURCES = \
 algo/lyra2/lyra2re.c \
 algo/lyra2/zcoin.c \
 algo/lyra2/lyra2z330.c \
-algo/keccak/sse2/keccak.c \
 algo/m7m.c \
 algo/neoscrypt.c \
 algo/nist5.c \
 algo/pluck.c \
+algo/polytimos/polytimos-gate.c \
+algo/polytimos/polytimos.c \
 algo/quark/quark.c \
 algo/qubit/qubit.c \
 algo/qubit/deep.c \
@@ -116,8 +123,14 @@ cpuminer_SOURCES = \
 algo/sha/sha256t.c \
 algo/simd/sse2/nist.c \
 algo/simd/sse2/vector.c \
+algo/skein/sph_skein.c \
+algo/skein/skein-hash-4way.c \
 algo/skein/skein.c \
+algo/skein/skein-4way.c \
+algo/skein/skein-gate.c \
 algo/skein/skein2.c \
+algo/skein/skein2-4way.c \
+algo/skein/skein2-gate.c \
 algo/skunk.c \
 algo/tiger/sph_tiger.c \
 algo/timetravel.c \


@@ -50,6 +50,7 @@ Supported Algorithms
   pentablake     Pentablake
   phi1612        phi, LUX coin
   pluck          Pluck:128 (Supcoin)
+  polytimos
   quark          Quark
   qubit          Qubit
   scrypt         scrypt(1024, 1, 1) (default)


@@ -11,7 +11,11 @@ optimum speed using all the available features.
 Architecture names and compile options used are only provided for Intel
 Core series. Pentium and Celeron often have fewer features.
-AMD is YMMV, see previous paragraph.
+AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
+supported by cpuminer-opt due to an incompatible implementation of SSE2 on
+these CPUs. Some algos may crash the miner with an invalid instruction.
+Users are recommended to use an unoptimized miner such as cpuminer-multi.
 Exe name        Compile opts                Arch name


@@ -78,6 +78,20 @@ Run ./build.sh to build on Linux or execute the following commands.
 CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
 make
+Additional optional compile flags; add the following to CFLAGS to activate them:
+-DUSE_SPH_SHA
+SPH may give slightly better performance on algos that use sha256 when using
+openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA; both perform
+better than SPH.
+-DFOUR_WAY
+4-way hashing gives much better performance on supported algos and should only
+be used on CPUs with AVX2. 4-way algo support will be added incrementally; see
+the change log below for supported algos.
 Start mining.
 ./cpuminer -a algo -o url -u username -p password
@@ -140,6 +154,17 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
 Change Log
 ----------
+v3.7.3
+Added polytimos algo.
+Introducing 4-way AVX2 optimization giving up to 4x performance improvement
+on many compute bound algos. First supported algos: skein, skein2, blake &
+keccak. This feature is only available when compiled from source. See above
+for instructions on how to enable 4-way during compilation.
+Updated Dockerfile.
 v3.7.2
 Fixed yescryptr16
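The -DFOUR_WAY flag described above is consumed at compile time by the per-algo gate files added in this commit (blake-gate.c and keccak-gate.c below). The following is a minimal sketch of that dispatch pattern with hypothetical example_* names, not code from the tree:

#include <stdint.h>
#include <stdbool.h>
#include "algo-gate-api.h"

// hypothetical algo; the real registrations live in blake-gate.c / keccak-gate.c
void examplehash( void *state, const void *input );
int  scanhash_example( int thr_id, struct work *work, uint32_t max_nonce,
                       uint64_t *hashes_done );
#if defined(FOUR_WAY) && defined(__AVX2__)
void examplehash_4way( void *state, const void *input );
int  scanhash_example_4way( int thr_id, struct work *work, uint32_t max_nonce,
                            uint64_t *hashes_done );
#endif

bool register_example_algo( algo_gate_t* gate )
{
#if defined(FOUR_WAY) && defined(__AVX2__)
   // -DFOUR_WAY in CFLAGS plus an AVX2-capable -march selects the 4-way lane code
   gate->optimizations = SSE2_OPT | AVX2_OPT;
   gate->scanhash = (void*)&scanhash_example_4way;
   gate->hash     = (void*)&examplehash_4way;
#else
   // default: the existing scalar implementation
   gate->optimizations = SSE2_OPT;
   gate->scanhash = (void*)&scanhash_example;
   gate->hash     = (void*)&examplehash;
#endif
   return true;
}

Building without -DFOUR_WAY, or on a CPU without AVX2, leaves the existing scalar scanhash in place.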


@@ -184,6 +184,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
 case ALGO_PENTABLAKE:  register_pentablake_algo ( gate ); break;
 case ALGO_PHI1612:     register_phi1612_algo    ( gate ); break;
 case ALGO_PLUCK:       register_pluck_algo      ( gate ); break;
+case ALGO_POLYTIMOS:   register_polytimos_algo  ( gate ); break;
 case ALGO_QUARK:       register_quark_algo      ( gate ); break;
 case ALGO_QUBIT:       register_qubit_algo      ( gate ); break;
 case ALGO_SCRYPT:      register_scrypt_algo     ( gate ); break;

algo/blake/blake-4way.c (new file, 115 lines)

@@ -0,0 +1,115 @@
#include "algo-gate-api.h"
#include "sph_blake.h"
#include "blake-hash-4way.h"
#include <string.h>
#include <stdint.h>
#include <memory.h>
#if defined (__AVX__)
void blakehash_4way(void *state, const void *input)
{
uint32_t hash0[16] __attribute__ ((aligned (64)));
uint32_t hash1[16] __attribute__ ((aligned (64)));
uint32_t hash2[16] __attribute__ ((aligned (64)));
uint32_t hash3[16] __attribute__ ((aligned (64)));
uint32_t vhash[16*4] __attribute__ ((aligned (64)));
blake256_4way_context ctx;
blake256_4way_init( &ctx );
blake256_4way( &ctx, input, 80 );   // hash the full 80-byte block header in each lane
blake256_4way_close( &ctx, vhash );
m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state,     hash0, 32 );
memcpy( state+32,  hash1, 32 );
memcpy( state+64,  hash2, 32 );
memcpy( state+96,  hash3, 32 );
}
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
// uint32_t HTarget = ptarget[7];
uint32_t _ALIGN(32) endiandata[20];
uint32_t n = first_nonce;
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
// if (opt_benchmark)
// HTarget = 0x7f;
// we need big endian data...
swab32_array( endiandata, pdata, 20 );
m128_interleave_4x32( vdata, endiandata, endiandata, endiandata,
endiandata, 640 );
uint32_t *noncep = vdata + 76; // 19*4
do {
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep, n );
be32enc( noncep +2, n+1 );
be32enc( noncep +4, n+2 );
be32enc( noncep +6, n+3 );
blakehash_4way( hash, vdata );
if ( hash[7] == 0 )
{
if ( fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
}
if ( (hash+8)[7] == 0 )
{
   if ( fulltest( hash+8, ptarget ) )
   {
      found[1] = true;
      num_found++;
      nonces[1] = n+1;
   }
}
if ( (hash+16)[7] == 0 )
{
   if ( fulltest( hash+16, ptarget ) )
   {
      found[2] = true;
      num_found++;
      nonces[2] = n+2;
   }
}
if ( (hash+24)[7] == 0 )
{
   if ( fulltest( hash+24, ptarget ) )
   {
      found[3] = true;
      num_found++;
      nonces[3] = n+3;
   }
}
n += 4;
*hashes_done = n - first_nonce + 1;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/blake/blake-gate.c (new file, 26 lines)

@@ -0,0 +1,26 @@
#include "blake-gate.h"
int64_t blake_get_max64 ()
{
return 0x7ffffLL;
}
bool register_blake_algo( algo_gate_t* gate )
{
gate->get_max64 = (void*)&blake_get_max64;
#if defined (__AVX2__) && defined (FOUR_WAY)
// 8 way is not implemented yet; AVX2 builds fall through to the 4 way path below.
// gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
// gate->scanhash = (void*)&scanhash_blake_8way;
// gate->hash = (void*)&blakehash_8way;
#endif
#if defined(__AVX__) && defined (FOUR_WAY)
gate->optimizations = SSE2_OPT | AVX_OPT;
gate->scanhash = (void*)&scanhash_blake_4way;
gate->hash = (void*)&blakehash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_blake;
gate->hash = (void*)&blakehash;
#endif
return true;
}

algo/blake/blake-gate.h (new file, 23 lines)

@@ -0,0 +1,23 @@
#ifndef __BLAKE_GATE_H__
#define __BLAKE_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined (__AVX2__)
//void blakehash_8way(void *state, const void *input);
//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done );
#endif
#if defined (__AVX__)
void blakehash_4way(void *state, const void *input);
int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void blakehash( void *state, const void *input );
int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

algo/blake/blake-hash-4way.c (new file, 1157 lines)

File diff suppressed because it is too large.


@@ -0,0 +1,105 @@
/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
/**
* BLAKE interface. BLAKE is a family of functions which differ by their
* output size; this implementation defines BLAKE for output sizes 224,
* 256, 384 and 512 bits. This implementation conforms to the "third
* round" specification.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_blake.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef __BLAKE_HASH_4WAY__
#define __BLAKE_HASH_4WAY__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
/**
* Output size (in bits) for BLAKE-256.
*/
#define SPH_SIZE_blake256 256
#if SPH_64
/**
* Output size (in bits) for BLAKE-512.
*/
#define SPH_SIZE_blake512 512
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[16] __attribute__ ((aligned (64)));
size_t ptr;
__m128i H[8];
__m128i S[4];
sph_u32 T0, T1;
} blake_4way_small_context;
typedef blake_4way_small_context blake256_4way_context;
void blake256_4way_init(void *cc);
void blake256_4way(void *cc, const void *data, size_t len);
void blake256_4way_close(void *cc, void *dst);
void blake256_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __AVX2__
typedef struct {
__m256i buf[16] __attribute__ ((aligned (64)));
size_t ptr;
__m256i H[8];
__m256i S[4];
sph_u64 T0, T1;
} blake_4way_big_context;
typedef blake_4way_big_context blake512_avx2_context;
void blake512_4way_init(void *cc);
void blake512_4way(void *cc, const void *data, size_t len);
void blake512_4way_close(void *cc, void *dst);
void blake512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
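blake-4way.c below is the real call site for this API. As a quick orientation, here is a hedged usage sketch; it assumes the m128_interleave_4x32()/m128_deinterleave_4x32() helpers from avxdefs.h with the same length arguments blake-4way.c passes them, and the function name is a placeholder:

#if defined(__AVX__)
#include <string.h>
#include <stdint.h>
#include "blake-hash-4way.h"   // also pulls in avxdefs.h

// Hash four independent 80-byte inputs in one 4-way BLAKE-256 pass (sketch).
void blake256_4x_example( void *out0, void *out1, void *out2, void *out3,
                          uint32_t data[4][20] )
{
   uint32_t vin [20*4] __attribute__ ((aligned (64)));  // interleaved input lanes
   uint32_t vout[16*4] __attribute__ ((aligned (64)));  // interleaved output lanes
   uint32_t h[4][16]   __attribute__ ((aligned (64)));
   blake256_4way_context ctx;

   // pack the four 640-bit messages into 4x32-bit lanes
   m128_interleave_4x32( vin, data[0], data[1], data[2], data[3], 640 );

   blake256_4way_init( &ctx );
   blake256_4way( &ctx, vin, 80 );          // 80 bytes per lane
   blake256_4way_close( &ctx, vout );

   // unpack the lanes; only the first 32 bytes of each are the 256-bit digest
   m128_deinterleave_4x32( h[0], h[1], h[2], h[3], vout, 512 );
   memcpy( out0, h[0], 32 );
   memcpy( out1, h[1], 32 );
   memcpy( out2, h[2], 32 );
   memcpy( out3, h[3], 32 );
}
#endif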


@@ -89,18 +89,3 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }
-// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
-int64_t blake_get_max64 ()
-{
-  return 0x7ffffLL;
-}
-bool register_blake_algo( algo_gate_t* gate )
-{
-  gate->scanhash  = (void*)&scanhash_blake;
-  gate->hash      = (void*)&blakehash;
-  gate->get_max64 = (void*)&blake_get_max64;
-  return true;
-}


@@ -6,12 +6,10 @@
 #include "algo-gate-api.h"
 #include <string.h>
 #include <stdint.h>
 #include "algo/blake/sph_blake2b.h"
-static __thread sph_blake2b_ctx s_midstate;
-static __thread sph_blake2b_ctx s_ctx;
+//static __thread sph_blake2b_ctx s_midstate;
+//static __thread sph_blake2b_ctx s_ctx;
 #define MIDLEN 76
 #define A 64


@@ -813,6 +813,7 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)
 	buf = sc->buf;
 	ptr = sc->ptr;
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
@@ -890,9 +891,9 @@ blake32_close(sph_blake_small_context *sc,
 		sph_enc32be_aligned(u.buf + 60, tl);
 		blake32(sc, u.buf, 64);
 	}
 	out = dst;
 	for (k = 0; k < out_size_w32; k ++)
 		sph_enc32be(out + (k << 2), sc->H[k]);
 }
 #if SPH_64

algo/keccak/keccak-4way.c (new file, 105 lines)

@@ -0,0 +1,105 @@
#include "keccak-gate.h"
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sph_keccak.h"
#include "keccak-hash-4way.h"
#ifdef __AVX2__
void keccakhash_4way(void *state, const void *input)
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
keccak256_4way_context ctx;
keccak256_4way_init( &ctx );
keccak256_4way( &ctx, input, 80 );
keccak256_4way_close( &ctx, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
memcpy( state, hash0, 32 );
memcpy( state+32, hash1, 32 );
memcpy( state+64, hash2, 32 );
memcpy( state+96, hash3, 32 );
}
int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done)
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint32_t n = pdata[19];
const uint32_t first_nonce = pdata[19];
const uint32_t Htarg = ptarget[7];
uint32_t endiandata[20];
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
for ( int i=0; i < 19; i++ )
be32enc( &endiandata[i], pdata[i] );
uint64_t *edata = (uint64_t*)endiandata;
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
do {
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
keccakhash_4way( hash, vdata );
if ( ( ( hash[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash, ptarget) )
{
found[0] = true;
num_found++;
nonces[0] = n;
pdata[19] = n;
}
if ( ( ( (hash+8)[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+8, ptarget) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
if ( ( ( (hash+16) [7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+16, ptarget) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
if ( ( ( (hash+24)[7] & 0xFFFFFF00 ) == 0 )
&& fulltest( hash+24, ptarget) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce-4)
&& !work_restart[thr_id].restart);
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

algo/keccak/keccak-gate.c (new file, 27 lines)

@@ -0,0 +1,27 @@
#include "keccak-gate.h"
void keccak_set_target( struct work* work, double job_diff )
{
work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
}
int64_t keccak_get_max64() { return 0x7ffffLL; }
bool register_keccak_algo( algo_gate_t* gate )
{
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
gate->set_target = (void*)&keccak_set_target;
gate->get_max64 = (void*)&keccak_get_max64;
#if defined (FOUR_WAY) && defined (__AVX2__)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_keccak_4way;
gate->hash = (void*)&keccakhash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_keccak;
gate->hash = (void*)&keccakhash;
#endif
return true;
};

algo/keccak/keccak-gate.h (new file, 19 lines)

@@ -0,0 +1,19 @@
#ifndef __KECCAK_GATE_H__
#define __KECCAK_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
#if defined(__AVX2__)
void keccakhash_4way( void *state, const void *input );
int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void keccakhash( void *state, const void *input );
int scanhash_keccak( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif


@@ -0,0 +1,505 @@
#include <stddef.h>
#include "keccak-hash-4way.h"
#if defined(__AVX2__)
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
#define a00 (kc->w[ 0])
#define a10 (kc->w[ 1])
#define a20 (kc->w[ 2])
#define a30 (kc->w[ 3])
#define a40 (kc->w[ 4])
#define a01 (kc->w[ 5])
#define a11 (kc->w[ 6])
#define a21 (kc->w[ 7])
#define a31 (kc->w[ 8])
#define a41 (kc->w[ 9])
#define a02 (kc->w[10])
#define a12 (kc->w[11])
#define a22 (kc->w[12])
#define a32 (kc->w[13])
#define a42 (kc->w[14])
#define a03 (kc->w[15])
#define a13 (kc->w[16])
#define a23 (kc->w[17])
#define a33 (kc->w[18])
#define a43 (kc->w[19])
#define a04 (kc->w[20])
#define a14 (kc->w[21])
#define a24 (kc->w[22])
#define a34 (kc->w[23])
#define a44 (kc->w[24])
#define DECL_STATE
#define READ_STATE(sc)
#define WRITE_STATE(sc)
#define INPUT_BUF(size) do { \
size_t j; \
for (j = 0; j < (size>>3); j++ ) \
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
#define mm256_neg1 \
(_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
0xffffffffffffffff, 0xffffffffffffffff ) )
#define DECL64(x) __m256i x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
DECL64(tt2); \
DECL64(tt3); \
XOR64(tt0, d0, d1); \
XOR64(tt1, d2, d3); \
XOR64(tt0, tt0, d4); \
XOR64(tt0, tt0, tt1); \
ROL64(tt0, tt0, 1); \
XOR64(tt2, c0, c1); \
XOR64(tt3, c2, c3); \
XOR64(tt0, tt0, c4); \
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(t0); \
DECL64(t1); \
DECL64(t2); \
DECL64(t3); \
DECL64(t4); \
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
XOR64(b00, b00, t0); \
XOR64(b01, b01, t0); \
XOR64(b02, b02, t0); \
XOR64(b03, b03, t0); \
XOR64(b04, b04, t0); \
XOR64(b10, b10, t1); \
XOR64(b11, b11, t1); \
XOR64(b12, b12, t1); \
XOR64(b13, b13, t1); \
XOR64(b14, b14, t1); \
XOR64(b20, b20, t2); \
XOR64(b21, b21, t2); \
XOR64(b22, b22, t2); \
XOR64(b23, b23, t2); \
XOR64(b24, b24, t2); \
XOR64(b30, b30, t3); \
XOR64(b31, b31, t3); \
XOR64(b32, b32, t3); \
XOR64(b33, b33, t3); \
XOR64(b34, b34, t3); \
XOR64(b40, b40, t4); \
XOR64(b41, b41, t4); \
XOR64(b42, b42, t4); \
XOR64(b43, b43, t4); \
XOR64(b44, b44, t4); \
} while (0)
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
/* ROL64(b00, b00, 0); */ \
ROL64(b01, b01, 36); \
ROL64(b02, b02, 3); \
ROL64(b03, b03, 41); \
ROL64(b04, b04, 18); \
ROL64(b10, b10, 1); \
ROL64(b11, b11, 44); \
ROL64(b12, b12, 10); \
ROL64(b13, b13, 45); \
ROL64(b14, b14, 2); \
ROL64(b20, b20, 62); \
ROL64(b21, b21, 6); \
ROL64(b22, b22, 43); \
ROL64(b23, b23, 15); \
ROL64(b24, b24, 61); \
ROL64(b30, b30, 28); \
ROL64(b31, b31, 55); \
ROL64(b32, b32, 25); \
ROL64(b33, b33, 21); \
ROL64(b34, b34, 56); \
ROL64(b40, b40, 27); \
ROL64(b41, b41, 20); \
ROL64(b42, b42, 39); \
ROL64(b43, b43, 8); \
ROL64(b44, b44, 14); \
} while (0)
/*
* The KHI macro integrates the "lane complement" optimization. On input,
* some words are complemented:
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
* On output, the following words are complemented:
* a04 a10 a20 a22 a23 a31
*
* The (implicit) permutation and the theta expansion will bring back
* the input mask for the next round.
*/
#define KHI_XO(d, a, b, c) do { \
DECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI_XA(d, a, b, c) do { \
DECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(c0); \
DECL64(c1); \
DECL64(c2); \
DECL64(c3); \
DECL64(c4); \
DECL64(bnn); \
NOT64(bnn, b20); \
KHI_XO(c0, b00, b10, b20); \
KHI_XO(c1, b10, bnn, b30); \
KHI_XA(c2, b20, b30, b40); \
KHI_XO(c3, b30, b40, b00); \
KHI_XA(c4, b40, b00, b10); \
MOV64(b00, c0); \
MOV64(b10, c1); \
MOV64(b20, c2); \
MOV64(b30, c3); \
MOV64(b40, c4); \
NOT64(bnn, b41); \
KHI_XO(c0, b01, b11, b21); \
KHI_XA(c1, b11, b21, b31); \
KHI_XO(c2, b21, b31, bnn); \
KHI_XO(c3, b31, b41, b01); \
KHI_XA(c4, b41, b01, b11); \
MOV64(b01, c0); \
MOV64(b11, c1); \
MOV64(b21, c2); \
MOV64(b31, c3); \
MOV64(b41, c4); \
NOT64(bnn, b32); \
KHI_XO(c0, b02, b12, b22); \
KHI_XA(c1, b12, b22, b32); \
KHI_XA(c2, b22, bnn, b42); \
KHI_XO(c3, bnn, b42, b02); \
KHI_XA(c4, b42, b02, b12); \
MOV64(b02, c0); \
MOV64(b12, c1); \
MOV64(b22, c2); \
MOV64(b32, c3); \
MOV64(b42, c4); \
NOT64(bnn, b33); \
KHI_XA(c0, b03, b13, b23); \
KHI_XO(c1, b13, b23, b33); \
KHI_XO(c2, b23, bnn, b43); \
KHI_XA(c3, bnn, b43, b03); \
KHI_XO(c4, b43, b03, b13); \
MOV64(b03, c0); \
MOV64(b13, c1); \
MOV64(b23, c2); \
MOV64(b33, c3); \
MOV64(b43, c4); \
NOT64(bnn, b14); \
KHI_XA(c0, b04, bnn, b24); \
KHI_XO(c1, bnn, b24, b34); \
KHI_XA(c2, b24, b34, b44); \
KHI_XO(c3, b34, b44, b04); \
KHI_XA(c4, b44, b04, b14); \
MOV64(b04, c0); \
MOV64(b14, c1); \
MOV64(b24, c2); \
MOV64(b34, c3); \
MOV64(b44, c4); \
} while (0)
#define IOTA(r) XOR64_IOTA(a00, a00, r)
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
#define P8_TO_P0 do { \
DECL64(t); \
MOV64(t, a01); \
MOV64(a01, a11); \
MOV64(a11, a43); \
MOV64(a43, t); \
MOV64(t, a02); \
MOV64(a02, a22); \
MOV64(a22, a31); \
MOV64(a31, t); \
MOV64(t, a03); \
MOV64(a03, a33); \
MOV64(a33, a24); \
MOV64(a24, t); \
MOV64(t, a04); \
MOV64(a04, a44); \
MOV64(a44, a12); \
MOV64(a12, t); \
MOV64(t, a10); \
MOV64(a10, a32); \
MOV64(a32, a13); \
MOV64(a13, t); \
MOV64(t, a14); \
MOV64(a14, a21); \
MOV64(a21, a20); \
MOV64(a20, t); \
MOV64(t, a23); \
MOV64(a23, a42); \
MOV64(a42, a40); \
MOV64(a40, t); \
MOV64(t, a30); \
MOV64(a30, a41); \
MOV64(a41, a34); \
MOV64(a34, t); \
} while (0)
#define LPAR (
#define RPAR )
#define KF_ELT(r, s, k) do { \
THETA LPAR P ## r RPAR; \
RHO LPAR P ## r RPAR; \
KHI LPAR P ## s RPAR; \
IOTA(k); \
} while (0)
#define DO(x) x
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
#define KECCAK_F_1600_ do { \
int j; \
for (j = 0; j < 24; j += 8) \
{ \
KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
RC[j + 0], RC[j + 0])) ); \
KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
RC[j + 1], RC[j + 1])) ); \
KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
RC[j + 2], RC[j + 2])) ); \
KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
RC[j + 3], RC[j + 3])) ); \
KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
RC[j + 4], RC[j + 4])) ); \
KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
RC[j + 5], RC[j + 5])) ); \
KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
RC[j + 6], RC[j + 6])) ); \
KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
RC[j + 7], RC[j + 7])) ); \
P8_TO_P0; \
} \
} while (0)
static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
{
int i;
for (i = 0; i < 25; i ++)
kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1;
kc->w[ 2] = mm256_neg1;
kc->w[ 8] = mm256_neg1;
kc->w[12] = mm256_neg1;
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->ptr = 0;
kc->lim = 200 - (lim >> 2);
}
static void
keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
size_t lim )
{
__m256i *buf;
__m256i *vdata = (__m256i*)data;
size_t ptr;
buf = kc->buf;
ptr = kc->ptr;
if ( len < (lim - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
kc->ptr = ptr + len;
return;
}
while ( len > 0 )
{
size_t clen;
clen = (lim - ptr);
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == lim )
{
INPUT_BUF( lim );
KECCAK_F_1600;
ptr = 0;
}
}
kc->ptr = ptr;
}
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
sph_u64 dummy; /* for alignment */
} u;
size_t j;
size_t m256_len = byte_len >> 3;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 1) )
{
uint64_t t = eb | 0x80;
u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
j = 1;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
memset_zero_m256i( u.tmp + 1, (j>>3) - 2 );
u.tmp[ (j>>3) - 1] = _mm256_set_epi64x( 0x8000000000000000,
0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
}
keccak64_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
NOT64( kc->w[ 8], kc->w[ 8] );
NOT64( kc->w[12], kc->w[12] );
NOT64( kc->w[17], kc->w[17] );
NOT64( kc->w[20], kc->w[20] );
for ( j = 0; j < m256_len; j++ )
u.tmp[j] = kc->w[j];
memcpy_m256i( dst, u.tmp, m256_len );
}
void keccak256_4way_init( void *kc )
{
keccak64_init( kc, 256 );
}
void
keccak256_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 136);
}
void
keccak256_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 32, 136);
}
void keccak512_4way_init( void *kc )
{
keccak64_init( kc, 512 );
}
void
keccak512_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 72);
}
void
keccak512_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 64, 72);
}
#endif
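A note on the block-size constants used above: keccak64_init() stores kc->lim = 200 - (lim >> 2), and the keccak256/keccak512 wrappers pass a rate of 136 and 72 bytes to keccak64_core(). Both values follow from the sponge rule rate = state size minus capacity, with a 200-byte (1600-bit) state and a capacity of twice the digest length. A small stand-alone self-check of that arithmetic, not part of the tree:

#include <assert.h>
#include <stddef.h>

// rate in bytes for a Keccak sponge with a 200-byte state and capacity = 2 * digest
static size_t keccak_rate_bytes( size_t digest_bits )
{
   return 200 - ( 2 * digest_bits ) / 8;   // same as 200 - digest_bits/4
}

int main(void)
{
   assert( keccak_rate_bytes( 256 ) == 136 );  // keccak256_4way uses lim = 136
   assert( keccak_rate_bytes( 512 ) ==  72 );  // keccak512_4way uses lim =  72
   return 0;
}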


@@ -0,0 +1,581 @@
#include <stddef.h>
#include "keccak-hash-4way.h"
static const sph_u64 RC[] = {
SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
};
// change u.wide and u.narrow to just w, ie kc->w
#define a00 (kc->w[ 0])
#define a10 (kc->w[ 1])
#define a20 (kc->w[ 2])
#define a30 (kc->w[ 3])
#define a40 (kc->w[ 4])
#define a01 (kc->w[ 5])
#define a11 (kc->w[ 6])
#define a21 (kc->w[ 7])
#define a31 (kc->w[ 8])
#define a41 (kc->w[ 9])
#define a02 (kc->w[10])
#define a12 (kc->w[11])
#define a22 (kc->w[12])
#define a32 (kc->w[13])
#define a42 (kc->w[14])
#define a03 (kc->w[15])
#define a13 (kc->w[16])
#define a23 (kc->w[17])
#define a33 (kc->w[18])
#define a43 (kc->w[19])
#define a04 (kc->w[20])
#define a14 (kc->w[21])
#define a24 (kc->w[22])
#define a34 (kc->w[23])
#define a44 (kc->w[24])
// null when no copy
#define DECL_STATE
#define READ_STATE(sc)
#define WRITE_STATE(sc)
#define INPUT_BUF(size) do { \
size_t j; \
for (j = 0; j < (size>>3); j++ ) \
kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
} while (0)
// keccak256 4way uses 136 with 32 bit size
// keccak256 8way and keccak512 4 way use 72 with 64 bit size
//#define INPUT_BUF144 INPUT_BUF(144)
//#define INPUT_BUF136 INPUT_BUF(136)
//#define INPUT_BUF104 INPUT_BUF(104)
//#define INPUT_BUF72 INPUT_BUF(72)
//simply redefine these macros to do simd
#define mm256_neg1 \
(_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
0xffffffffffffffff, 0xffffffffffffffff ) )
#define DECL64(x) __m256i x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = _mm256_xor_si256(a,b))
#define AND64(d, a, b) (d = _mm256_and_si256(a,b))
#define OR64(d, a, b) (d = _mm256_or_si256(a,b))
#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1))
#define ROL64(d, v, n) (d = mm256_rotl_64(v, n))
#define XOR64_IOTA XOR64
/*
#define DECL64(x) sph_u64 x
#define MOV64(d, s) (d = s)
#define XOR64(d, a, b) (d = a ^ b)
#define AND64(d, a, b) (d = a & b)
#define OR64(d, a, b) (d = a | b)
#define NOT64(d, s) (d = SPH_T64(~s))
#define ROL64(d, v, n) (d = SPH_ROTL64(v, n))
#define XOR64_IOTA XOR64
*/
#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \
DECL64(tt0); \
DECL64(tt1); \
DECL64(tt2); \
DECL64(tt3); \
uint64_t *ttx = (uint64_t*)&tt0; \
uint64_t *d0x = (uint64_t*)&d0; \
uint64_t *d1x = (uint64_t*)&d1; \
uint64_t *d2x = (uint64_t*)&d2; \
uint64_t *d3x = (uint64_t*)&d3; \
uint64_t *d4x = (uint64_t*)&d4; \
XOR64(tt0, d0, d1); \
if (vtp) {printf("Velt0\n"); \
printf("d0= %016llx\n",*d0x ); \
printf("d1= %016llx\n",*d1x ); \
printf("d2= %016llx\n",*d2x ); \
printf("d3= %016llx\n",*d3x ); \
printf("d4= %016llx\n",*d4x ); \
printf("tt0= %016llx\n",*ttx );} \
XOR64(tt1, d2, d3); \
XOR64(tt0, tt0, d4); \
XOR64(tt0, tt0, tt1); \
if(vtp){\
printf("tt0= %016llx\n",*ttx );} \
ROL64(tt0, tt0, 1); \
if(vtp){\
printf("tt0= %016llx\n",*ttx );} \
XOR64(tt2, c0, c1); \
XOR64(tt3, c2, c3); \
XOR64(tt0, tt0, c4); \
XOR64(tt2, tt2, tt3); \
XOR64(t, tt0, tt2); \
} while (0)
int vtp = 0;
#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(t0); \
DECL64(t1); \
DECL64(t2); \
DECL64(t3); \
DECL64(t4); \
TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
if(vtp){printf("Velt0\n");\
uint64_t *tx = (uint64_t*)&t0; \
printf("t0= %016llx\n",tx );} \
vtp=0; \
TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
XOR64(b00, b00, t0); \
XOR64(b01, b01, t0); \
XOR64(b02, b02, t0); \
XOR64(b03, b03, t0); \
XOR64(b04, b04, t0); \
XOR64(b10, b10, t1); \
XOR64(b11, b11, t1); \
XOR64(b12, b12, t1); \
XOR64(b13, b13, t1); \
XOR64(b14, b14, t1); \
XOR64(b20, b20, t2); \
XOR64(b21, b21, t2); \
XOR64(b22, b22, t2); \
XOR64(b23, b23, t2); \
XOR64(b24, b24, t2); \
XOR64(b30, b30, t3); \
XOR64(b31, b31, t3); \
XOR64(b32, b32, t3); \
XOR64(b33, b33, t3); \
XOR64(b34, b34, t3); \
XOR64(b40, b40, t4); \
XOR64(b41, b41, t4); \
XOR64(b42, b42, t4); \
XOR64(b43, b43, t4); \
XOR64(b44, b44, t4); \
} while (0)
#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
/* ROL64(b00, b00, 0); */ \
ROL64(b01, b01, 36); \
ROL64(b02, b02, 3); \
ROL64(b03, b03, 41); \
ROL64(b04, b04, 18); \
ROL64(b10, b10, 1); \
ROL64(b11, b11, 44); \
ROL64(b12, b12, 10); \
ROL64(b13, b13, 45); \
ROL64(b14, b14, 2); \
ROL64(b20, b20, 62); \
ROL64(b21, b21, 6); \
ROL64(b22, b22, 43); \
ROL64(b23, b23, 15); \
ROL64(b24, b24, 61); \
ROL64(b30, b30, 28); \
ROL64(b31, b31, 55); \
ROL64(b32, b32, 25); \
ROL64(b33, b33, 21); \
ROL64(b34, b34, 56); \
ROL64(b40, b40, 27); \
ROL64(b41, b41, 20); \
ROL64(b42, b42, 39); \
ROL64(b43, b43, 8); \
ROL64(b44, b44, 14); \
} while (0)
/*
* The KHI macro integrates the "lane complement" optimization. On input,
* some words are complemented:
* a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
* On output, the following words are complemented:
* a04 a10 a20 a22 a23 a31
*
* The (implicit) permutation and the theta expansion will bring back
* the input mask for the next round.
*/
#define KHI_XO(d, a, b, c) do { \
DECL64(kt); \
OR64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI_XA(d, a, b, c) do { \
DECL64(kt); \
AND64(kt, b, c); \
XOR64(d, a, kt); \
} while (0)
#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
b40, b41, b42, b43, b44) \
do { \
DECL64(c0); \
DECL64(c1); \
DECL64(c2); \
DECL64(c3); \
DECL64(c4); \
DECL64(bnn); \
NOT64(bnn, b20); \
KHI_XO(c0, b00, b10, b20); \
KHI_XO(c1, b10, bnn, b30); \
KHI_XA(c2, b20, b30, b40); \
KHI_XO(c3, b30, b40, b00); \
KHI_XA(c4, b40, b00, b10); \
MOV64(b00, c0); \
MOV64(b10, c1); \
MOV64(b20, c2); \
MOV64(b30, c3); \
MOV64(b40, c4); \
NOT64(bnn, b41); \
KHI_XO(c0, b01, b11, b21); \
KHI_XA(c1, b11, b21, b31); \
KHI_XO(c2, b21, b31, bnn); \
KHI_XO(c3, b31, b41, b01); \
KHI_XA(c4, b41, b01, b11); \
MOV64(b01, c0); \
MOV64(b11, c1); \
MOV64(b21, c2); \
MOV64(b31, c3); \
MOV64(b41, c4); \
NOT64(bnn, b32); \
KHI_XO(c0, b02, b12, b22); \
KHI_XA(c1, b12, b22, b32); \
KHI_XA(c2, b22, bnn, b42); \
KHI_XO(c3, bnn, b42, b02); \
KHI_XA(c4, b42, b02, b12); \
MOV64(b02, c0); \
MOV64(b12, c1); \
MOV64(b22, c2); \
MOV64(b32, c3); \
MOV64(b42, c4); \
NOT64(bnn, b33); \
KHI_XA(c0, b03, b13, b23); \
KHI_XO(c1, b13, b23, b33); \
KHI_XO(c2, b23, bnn, b43); \
KHI_XA(c3, bnn, b43, b03); \
KHI_XO(c4, b43, b03, b13); \
MOV64(b03, c0); \
MOV64(b13, c1); \
MOV64(b23, c2); \
MOV64(b33, c3); \
MOV64(b43, c4); \
NOT64(bnn, b14); \
KHI_XA(c0, b04, bnn, b24); \
KHI_XO(c1, bnn, b24, b34); \
KHI_XA(c2, b24, b34, b44); \
KHI_XO(c3, b34, b44, b04); \
KHI_XA(c4, b44, b04, b14); \
MOV64(b04, c0); \
MOV64(b14, c1); \
MOV64(b24, c2); \
MOV64(b34, c3); \
MOV64(b44, c4); \
} while (0)
#define IOTA(r) XOR64_IOTA(a00, a00, r)
#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
#define P8_TO_P0 do { \
DECL64(t); \
MOV64(t, a01); \
MOV64(a01, a11); \
MOV64(a11, a43); \
MOV64(a43, t); \
MOV64(t, a02); \
MOV64(a02, a22); \
MOV64(a22, a31); \
MOV64(a31, t); \
MOV64(t, a03); \
MOV64(a03, a33); \
MOV64(a33, a24); \
MOV64(a24, t); \
MOV64(t, a04); \
MOV64(a04, a44); \
MOV64(a44, a12); \
MOV64(a12, t); \
MOV64(t, a10); \
MOV64(a10, a32); \
MOV64(a32, a13); \
MOV64(a13, t); \
MOV64(t, a14); \
MOV64(a14, a21); \
MOV64(a21, a20); \
MOV64(a20, t); \
MOV64(t, a23); \
MOV64(a23, a42); \
MOV64(a42, a40); \
MOV64(a40, t); \
MOV64(t, a30); \
MOV64(a30, a41); \
MOV64(a41, a34); \
MOV64(a34, t); \
} while (0)
#define LPAR (
#define RPAR )
#define KF_ELT(r, s, k) do { \
if(r==0){ vtp=1; printf("Vtheo0\n");}\
THETA LPAR P ## r RPAR; \
if(vtp==1){ \
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
vtp=0; \
RHO LPAR P ## r RPAR; \
if(r==0){ printf("Vrho0\n");\
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
KHI LPAR P ## s RPAR; \
if(r==0){ printf("Vkhi0\n");\
uint64_t *W = (uint64_t*)(kc->w); \
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] ); \
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] ); \
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] ); \
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );} \
IOTA(k); \
} while (0)
#define DO(x) x
#define KECCAK_F_1600 DO(KECCAK_F_1600_)
#define KECCAK_F_1600_ do { \
int j; \
for (j = 0; j < 24; j += 8) \
{ \
KF_ELT( 0, 1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
RC[j + 0], RC[j + 0])) ); \
KF_ELT( 1, 2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
RC[j + 1], RC[j + 1])) ); \
KF_ELT( 2, 3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
RC[j + 2], RC[j + 2])) ); \
KF_ELT( 3, 4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
RC[j + 3], RC[j + 3])) ); \
KF_ELT( 4, 5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
RC[j + 4], RC[j + 4])) ); \
KF_ELT( 5, 6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
RC[j + 5], RC[j + 5])) ); \
KF_ELT( 6, 7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
RC[j + 6], RC[j + 6])) ); \
KF_ELT( 7, 8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
RC[j + 7], RC[j + 7])) ); \
P8_TO_P0; \
} \
} while (0)
static void keccak64_init( keccak64_ctx_m256i *kc, size_t lim )
{
int i;
for (i = 0; i < 25; i ++)
kc->w[i] = _mm256_setzero_si256();
// Initialization for the "lane complement".
kc->w[ 1] = mm256_neg1;
kc->w[ 2] = mm256_neg1;
kc->w[ 8] = mm256_neg1;
kc->w[12] = mm256_neg1;
kc->w[17] = mm256_neg1;
kc->w[20] = mm256_neg1;
kc->ptr = 0;
kc->lim = 200 - (lim >> 2);
}
static void
keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
size_t lim )
{
__m256i *buf;
__m256i *vdata = (__m256i*)data;
size_t ptr;
buf = kc->buf;
ptr = kc->ptr;
uint64_t *W = (uint64_t*)(kc->w);
if ( len < (lim - ptr) )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
kc->ptr = ptr + len;
return;
}
while ( len > 0 )
{
size_t clen;
clen = (lim - ptr);
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata = vdata + (clen>>3);
len -= clen;
if ( ptr == lim )
{
INPUT_BUF( lim );
printf("Vtransform before ptr= %u, len= %u, lim= %u\n",ptr, len, lim);
printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] );
printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] );
printf(" %016x %016x %016x %016x\n", W[32], W[36], W[40], W[44] );
printf(" %016x %016x %016x %016x\n", W[48], W[52], W[56], W[60] );
KECCAK_F_1600;
//printf("Vtransform after ptr= %u, len= %u, lim= %u\n",ptr, len, lim);
//printf("w: %016x %016x %016x %016x\n", W[0], W[4], W[8], W[12] );
//printf(" %016x %016x %016x %016x\n", W[16], W[20], W[24], W[28] );
ptr = 0;
}
}
kc->ptr = ptr;
}
// keccak512 4way d=64 lim=72, keccak256 8way d=32 lim=136
// keccak256 4way d=32 lim=136
// keccak512 d=64, lim=72, keccak256 d=32, lim=136
static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t d,
size_t lim )
{
unsigned eb;
union {
__m256i tmp[lim + 1];
sph_u64 dummy; /* for alignment */
} u;
size_t j;
// int d = 64;
eb = 0x100 >> 8;
if ( kc->ptr == (lim - 1) )
{
uint64_t t = eb | 0x80;
u.tmp[0] = _mm256_set_epi64x( t, t, t, t );
j = 1;
}
else
{
j = lim - kc->ptr;
u.tmp[0] = _mm256_set_epi64x( eb, eb, eb, eb );
memset_zero_m256i( u.tmp + 1, (j>>3) - 1 );
u.tmp[j - 1] = _mm256_set_epi64x( 0x8000000000000000,
0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
}
keccak64_core( kc, u.tmp, j, lim );
/* Finalize the "lane complement" */
NOT64( kc->w[ 1], kc->w[ 1] );
NOT64( kc->w[ 2], kc->w[ 2] );
NOT64( kc->w[ 8], kc->w[ 8] );
NOT64( kc->w[12], kc->w[12] );
NOT64( kc->w[17], kc->w[17] );
NOT64( kc->w[20], kc->w[20] );
for ( j = 0; j < d; j += 8 )
u.tmp[j] = kc->w[j>>3];
memcpy_m256i( dst, u.tmp, d>>3 );
}
void keccak256_4way_init( void *kc )
{
keccak64_init( kc, 256 );
}
void
keccak256_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 136);
}
void
keccak256_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 32, 136);
}
void keccak512_4way_init( void *kc )
{
keccak64_init( kc, 512 );
}
void
keccak512_4way(void *cc, const void *data, size_t len)
{
keccak64_core(cc, data, len, 72);
}
void
keccak512_4way_close(void *cc, void *dst)
{
keccak64_close(cc, dst, 64, 72);
}


@@ -0,0 +1,94 @@
/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
/**
* Keccak interface. This is the interface for Keccak with the
* recommended parameters for SHA-3, with output lengths 224, 256,
* 384 and 512 bits.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_keccak.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef KECCAK_HASH_4WAY_H__
#define KECCAK_HASH_4WAY_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
#define SPH_SIZE_keccak256 256
/**
* Output size (in bits) for Keccak-512.
*/
#define SPH_SIZE_keccak512 512
/**
* This structure is a context for Keccak computations: it contains the
* intermediate values and some data from the last entered block. Once a
* Keccak computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running Keccak computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
#ifdef __AVX2__
typedef struct {
__m256i buf[144*8]; /* first field, for alignment */
__m256i w[25];
size_t ptr, lim;
// sph_u64 wide[25];
} keccak64_ctx_m256i;
typedef keccak64_ctx_m256i keccak256_4way_context;
typedef keccak64_ctx_m256i keccak512_4way_context;
void keccak256_4way_init(void *cc);
void keccak256_4way(void *cc, const void *data, size_t len);
void keccak256_4way_close(void *cc, void *dst);
void keccak512_4way_init(void *cc);
void keccak512_4way(void *cc, const void *data, size_t len);
void keccak512_4way_close(void *cc, void *dst);
void keccak512_4way_addbits_and_close(
void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif
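As the comment above says, these contexts are plain data, so a running computation can be cloned with an ordinary memcpy(). A hedged usage sketch follows, mirroring how keccak-4way.c earlier in this commit drives the API on interleaved lanes; the function name is a placeholder:

#if defined(__AVX2__)
#include <string.h>
#include <stdint.h>
#include "keccak-hash-4way.h"

// Hash four 80-byte block headers in one keccak256_4way pass (sketch).
// vin holds the four headers interleaved 64 bits at a time, as produced by
// m256_interleave_4x64() in keccak-4way.c; vout receives the four 32-byte
// digests in the same layout (keccak-4way.c splits them with
// m256_deinterleave_4x64()).
void keccak256_4x_example( uint64_t *vout, const uint64_t *vin )
{
   keccak256_4way_context ctx, clone;

   keccak256_4way_init( &ctx );
   memcpy( &clone, &ctx, sizeof ctx );   // clone a running context if needed
   keccak256_4way( &ctx, vin, 80 );      // 80 bytes of header per lane
   keccak256_4way_close( &ctx, vout );

   (void)clone;   // the clone could continue hashing different tail data per branch
}
#endif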


@@ -50,17 +50,3 @@ int scanhash_keccak(int thr_id, struct work *work,
 	return 0;
 }
-void keccak_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
-}
-bool register_keccak_algo( algo_gate_t* gate )
-{
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
-  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&keccak_set_target;
-  return true;
-};


@@ -955,6 +955,7 @@ static const struct {
 #endif
 #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
 		DECL64(tt0); \
 		DECL64(tt1); \
@@ -1643,8 +1644,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
 		for (j = 0; j < d; j += 8) \
 			sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \
 		memcpy(dst, u.tmp, d); \
-		keccak_init(kc, (unsigned)d << 3); \
-	} \
+	}
 #else

algo/keccak/sph_keccak.c.bak (new file, 1877 lines)

File diff suppressed because it is too large.


@@ -0,0 +1,12 @@
#include "polytimos-gate.h"
bool register_polytimos_algo( algo_gate_t* gate )
{
gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
init_polytimos_context();
gate->scanhash = (void*)&scanhash_polytimos;
gate->hash = (void*)&polytimos_hash;
gate->get_max64 = (void*)&get_max64_0x3ffff;
return true;
};


@@ -0,0 +1,12 @@
#ifndef __POLYTIMOS_GATE_H__
#define __POLYTIMOS_GATE_H__
#include "algo-gate-api.h"
#include <stdint.h>
void polytimos_hash( void *state, const void *input );
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
void init_polytimos_context();
#endif

algo/polytimos/polytimos.c (new file, 115 lines)

@@ -0,0 +1,115 @@
#include "polytimos-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#endif
typedef struct {
sph_skein512_context skein;
sph_shabal512_context shabal;
#ifdef NO_AES_NI
sph_echo512_context echo;
#else
hashState_echo echo;
#endif
hashState_luffa luffa;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_ctx_holder;
poly_ctx_holder poly_ctx;
void init_polytimos_context()
{
sph_skein512_init(&poly_ctx.skein);
sph_shabal512_init(&poly_ctx.shabal);
#ifdef NO_AES_NI
sph_echo512_init(&poly_ctx.echo);
#else
init_echo( &poly_ctx.echo, 512 );
#endif
init_luffa( &poly_ctx.luffa, 512 );
sph_fugue512_init(&poly_ctx.fugue);
sph_gost512_init(&poly_ctx.gost);
}
void polytimos_hash(void *output, const void *input)
{
uint32_t hashA[16] __attribute__ ((aligned (64)));
poly_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &poly_ctx, sizeof(poly_ctx) );
sph_skein512(&ctx.skein, input, 80);
sph_skein512_close(&ctx.skein, hashA);
sph_shabal512(&ctx.shabal, hashA, 64);
sph_shabal512_close(&ctx.shabal, hashA);
#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hashA, 64);
sph_echo512_close(&ctx.echo, hashA);
#else
update_final_echo ( &ctx.echo, (BitSequence *)hashA,
(const BitSequence *)hashA, 512 );
#endif
update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
(const BitSequence*)hashA, 64 );
sph_fugue512(&ctx.fugue, hashA, 64);
sph_fugue512_close(&ctx.fugue, hashA);
sph_gost512(&ctx.gost, hashA, 64);
sph_gost512_close(&ctx.gost, hashA);
memcpy(output, hashA, 32);
}
int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if (opt_benchmark)
ptarget[7] = 0x0cff;
// we need bigendian data...
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
do {
be32enc(&endiandata[19], nonce);
polytimos_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

View File

@@ -0,0 +1,125 @@
#include "polytimos-gate.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algo/skein/sph_skein.h"
#include "algo/echo/sph_echo.h"
#include "algo/fugue/sph_fugue.h"
#include "algo/luffa/sph_luffa.h"
#include "algo/shabal/sph_shabal.h"
#include "algo/gost/sph_gost.h"
#include "algo/luffa/sse2/luffa_for_sse2.h"
#ifndef NO_AES_NI
#include "algo/echo/aes_ni/hash_api.h"
#endif
/* Init is moved out of the hash loop: the contexts are initialized once externally, then restored on each call with a single memcpy of the whole block. */
typedef struct {
sph_skein512_context skein;
sph_luffa512_context luffa;
// hashState_luffa luffa;
//#ifdef NO_AES_NI
sph_echo512_context echo;
//#else
// hashState_echo echo;
//#endif
sph_shabal512_context shabal;
sph_fugue512_context fugue;
sph_gost512_context gost;
} poly_context_holder;
static __thread poly_context_holder poly_ctx __attribute__ ((aligned (64)));
void init_polytimos_context()
{
sph_skein512_init(&poly_ctx.skein);
sph_shabal512_init(&poly_ctx.shabal);
//#ifdef NO_AES_NI
sph_echo512_init(&poly_ctx.echo);
//#else
// init_echo( &poly_ctx.echo, 512 );
//#endif
// init_luffa( &poly_ctx.luffa, 512 );
sph_luffa512_init(&poly_ctx.luffa);
sph_fugue512_init(&poly_ctx.fugue);
sph_gost512_init(&poly_ctx.gost);
}
void polytimos_hash(void *output, const void *input)
{
poly_context_holder ctx __attribute__ ((aligned (64)));
uint32_t hashA[16]__attribute__ ((aligned (64)));
memcpy( &ctx, &poly_ctx, sizeof(poly_ctx) );
sph_skein512(&ctx.skein, input, 80);
sph_skein512_close(&ctx.skein, hashA);
sph_shabal512(&ctx.shabal, hashA, 64);
sph_shabal512_close(&ctx.shabal, hashA);
//#ifdef NO_AES_NI
sph_echo512(&ctx.echo, hashA, 64);
sph_echo512_close(&ctx.echo, hashA);
//#else
// update_final_echo ( &ctx.echo, (BitSequence *)hashA,
// (const BitSequence *)hashA, 512 );
//#endif
// update_and_final_luffa( &ctx.luffa, (BitSequence*)hashA,
// (const BitSequence*)hashA, 64 );
sph_luffa512(&ctx.luffa, hashA, 64);
sph_luffa512_close(&ctx.luffa, hashA);
sph_fugue512(&ctx.fugue, hashA, 64);
sph_fugue512_close(&ctx.fugue, hashA);
sph_gost512(&ctx.gost, hashA, 64);
sph_gost512_close(&ctx.gost, hashA);
memcpy(output, hashA, 32);
}
int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t _ALIGN(128) hash[8];
uint32_t _ALIGN(128) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t nonce = first_nonce;
volatile uint8_t *restart = &(work_restart[thr_id].restart);
if (opt_benchmark)
ptarget[7] = 0x0cff;
// we need bigendian data...
for (int i=0; i < 19; i++) {
be32enc(&endiandata[i], pdata[i]);
}
do {
be32enc(&endiandata[19], nonce);
polytimos_hash(hash, endiandata);
if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
work_set_target_ratio(work, hash);
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce;
return 1;
}
nonce++;
} while (nonce < max_nonce && !(*restart));
pdata[19] = nonce;
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}

122
algo/skein/skein-4way.c Normal file
View File

@@ -0,0 +1,122 @@
#include "algo-gate-api.h"
#include "skein-gate.h"
#include <string.h>
#include <stdint.h>
#include <openssl/sha.h>
#include "skein-hash-4way.h"
#if defined (__AVX2__)
void skeinhash_4way( void *state, const void *input )
{
uint64_t hash0[8] __attribute__ ((aligned (64)));
uint64_t hash1[8] __attribute__ ((aligned (64)));
uint64_t hash2[8] __attribute__ ((aligned (64)));
uint64_t hash3[8] __attribute__ ((aligned (64)));
uint64_t vhash[8*4] __attribute__ ((aligned (64)));
skein512_4way_context ctx_skein;
SHA256_CTX ctx_sha256;
skein512_4way_init( &ctx_skein );
skein512_4way( &ctx_skein, input, 80 );
skein512_4way_close( &ctx_skein, vhash );
m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash0, 64 );
SHA256_Final( (unsigned char*)hash0, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash1, 64 );
SHA256_Final( (unsigned char*)hash1, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash2, 64 );
SHA256_Final( (unsigned char*)hash2, &ctx_sha256 );
SHA256_Init( &ctx_sha256 );
SHA256_Update( &ctx_sha256, (unsigned char*)hash3, 64 );
SHA256_Final( (unsigned char*)hash3, &ctx_sha256 );
memcpy( (char*)state, (char*)hash0, 32 );
memcpy( ((char*)state) + 32, (char*)hash1, 32 );
memcpy( ((char*)state) + 64, (char*)hash2, 32 );
memcpy( ((char*)state) + 96, (char*)hash3, 32 );
}
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
uint64_t *edata = (uint64_t*)endiandata;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
// data is 80 bytes, 20 u32 or 4 u64.
swab32_array( endiandata, pdata, 20 );
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
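// The nonce is header word 19, ie the high half of 64 bit word 9.
// After 4x64 interleaving, word 9 of lane N begins at 32 bit offset
// (9*4 + N) * 2, putting the four nonces at offsets 73, 75, 77 and 79.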
do
{
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
skeinhash_4way( hash, vdata );
if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
// always put nonce0 in work data for compatibility with
// non-vectored algos.
pdata[19] = n;
}
if ( (hash+8)[7] < Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
if ( (hash+16)[7] < Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
if ( (hash+24)[7] < Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

24
algo/skein/skein-gate.c Normal file
View File

@@ -0,0 +1,24 @@
#include "skein-gate.h"
#include "algo-gate-api.h"
//#include <string.h>
//#include <stdint.h>
#include "sph_skein.h"
#include "skein-hash-4way.h"
int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
#if defined (FOUR_WAY) && defined (__AVX2__)
gate->optimizations = SSE2_OPT | AVX2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein_4way;
gate->hash = (void*)&skeinhash_4way;
#else
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
#endif
gate->get_max64 = (void*)&skein_get_max64;
return true;
};

16
algo/skein/skein-gate.h Normal file
View File

@@ -0,0 +1,16 @@
#ifndef __SKEIN_GATE_H__
#define __SKEIN_GATE_H__
#include <stdint.h>
#if defined(__AVX2__)
void skeinhash_4way( void *output, const void *input );
int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif
void skeinhash( void *output, const void *input );
int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -0,0 +1,648 @@
/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */
/*
* Skein implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#if defined (__AVX2__)
#include <stddef.h>
#include <string.h>
#include "skein-hash-4way.h"
#ifdef __cplusplus
extern "C"{
#endif
#ifdef _MSC_VER
#pragma warning (disable: 4146)
#endif
/*
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
*/
#define M9_0_0 0
#define M9_0_1 1
#define M9_0_2 2
#define M9_0_3 3
#define M9_0_4 4
#define M9_0_5 5
#define M9_0_6 6
#define M9_0_7 7
#define M9_1_0 1
#define M9_1_1 2
#define M9_1_2 3
#define M9_1_3 4
#define M9_1_4 5
#define M9_1_5 6
#define M9_1_6 7
#define M9_1_7 8
#define M9_2_0 2
#define M9_2_1 3
#define M9_2_2 4
#define M9_2_3 5
#define M9_2_4 6
#define M9_2_5 7
#define M9_2_6 8
#define M9_2_7 0
#define M9_3_0 3
#define M9_3_1 4
#define M9_3_2 5
#define M9_3_3 6
#define M9_3_4 7
#define M9_3_5 8
#define M9_3_6 0
#define M9_3_7 1
#define M9_4_0 4
#define M9_4_1 5
#define M9_4_2 6
#define M9_4_3 7
#define M9_4_4 8
#define M9_4_5 0
#define M9_4_6 1
#define M9_4_7 2
#define M9_5_0 5
#define M9_5_1 6
#define M9_5_2 7
#define M9_5_3 8
#define M9_5_4 0
#define M9_5_5 1
#define M9_5_6 2
#define M9_5_7 3
#define M9_6_0 6
#define M9_6_1 7
#define M9_6_2 8
#define M9_6_3 0
#define M9_6_4 1
#define M9_6_5 2
#define M9_6_6 3
#define M9_6_7 4
#define M9_7_0 7
#define M9_7_1 8
#define M9_7_2 0
#define M9_7_3 1
#define M9_7_4 2
#define M9_7_5 3
#define M9_7_6 4
#define M9_7_7 5
#define M9_8_0 8
#define M9_8_1 0
#define M9_8_2 1
#define M9_8_3 2
#define M9_8_4 3
#define M9_8_5 4
#define M9_8_6 5
#define M9_8_7 6
#define M9_9_0 0
#define M9_9_1 1
#define M9_9_2 2
#define M9_9_3 3
#define M9_9_4 4
#define M9_9_5 5
#define M9_9_6 6
#define M9_9_7 7
#define M9_10_0 1
#define M9_10_1 2
#define M9_10_2 3
#define M9_10_3 4
#define M9_10_4 5
#define M9_10_5 6
#define M9_10_6 7
#define M9_10_7 8
#define M9_11_0 2
#define M9_11_1 3
#define M9_11_2 4
#define M9_11_3 5
#define M9_11_4 6
#define M9_11_5 7
#define M9_11_6 8
#define M9_11_7 0
#define M9_12_0 3
#define M9_12_1 4
#define M9_12_2 5
#define M9_12_3 6
#define M9_12_4 7
#define M9_12_5 8
#define M9_12_6 0
#define M9_12_7 1
#define M9_13_0 4
#define M9_13_1 5
#define M9_13_2 6
#define M9_13_3 7
#define M9_13_4 8
#define M9_13_5 0
#define M9_13_6 1
#define M9_13_7 2
#define M9_14_0 5
#define M9_14_1 6
#define M9_14_2 7
#define M9_14_3 8
#define M9_14_4 0
#define M9_14_5 1
#define M9_14_6 2
#define M9_14_7 3
#define M9_15_0 6
#define M9_15_1 7
#define M9_15_2 8
#define M9_15_3 0
#define M9_15_4 1
#define M9_15_5 2
#define M9_15_6 3
#define M9_15_7 4
#define M9_16_0 7
#define M9_16_1 8
#define M9_16_2 0
#define M9_16_3 1
#define M9_16_4 2
#define M9_16_5 3
#define M9_16_6 4
#define M9_16_7 5
#define M9_17_0 8
#define M9_17_1 0
#define M9_17_2 1
#define M9_17_3 2
#define M9_17_4 3
#define M9_17_5 4
#define M9_17_6 5
#define M9_17_7 6
#define M9_18_0 0
#define M9_18_1 1
#define M9_18_2 2
#define M9_18_3 3
#define M9_18_4 4
#define M9_18_5 5
#define M9_18_6 6
#define M9_18_7 7
/*
* M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1).
*/
#define M3_0_0 0
#define M3_0_1 1
#define M3_1_0 1
#define M3_1_1 2
#define M3_2_0 2
#define M3_2_1 0
#define M3_3_0 0
#define M3_3_1 1
#define M3_4_0 1
#define M3_4_1 2
#define M3_5_0 2
#define M3_5_1 0
#define M3_6_0 0
#define M3_6_1 1
#define M3_7_0 1
#define M3_7_1 2
#define M3_8_0 2
#define M3_8_1 0
#define M3_9_0 0
#define M3_9_1 1
#define M3_10_0 1
#define M3_10_1 2
#define M3_11_0 2
#define M3_11_1 0
#define M3_12_0 0
#define M3_12_1 1
#define M3_13_0 1
#define M3_13_1 2
#define M3_14_0 2
#define M3_14_1 0
#define M3_15_0 0
#define M3_15_1 1
#define M3_16_0 1
#define M3_16_1 2
#define M3_17_0 2
#define M3_17_1 0
#define M3_18_0 0
#define M3_18_1 1
#define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y
#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
// AVX2 all scalar vars are now vectors representing 4 nonces in parallel
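// eg what was a single sph_u64 state word h0 is now one __m256i holding
// h0 for each of the 4 lanes, so every macro below advances 4 independent
// Skein states in lockstep.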
#define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
do { \
k8 = _mm256_xor_si256( _mm256_xor_si256( \
_mm256_xor_si256( _mm256_xor_si256( k0, k1 ), \
_mm256_xor_si256( k2, k3 ) ), \
_mm256_xor_si256( _mm256_xor_si256( k4, k5 ), \
_mm256_xor_si256( k6, k7 ) ) ), \
_mm256_set_epi64x( SPH_C64(0x1BD11BDAA9FC1A22), \
SPH_C64(0x1BD11BDAA9FC1A22), \
SPH_C64(0x1BD11BDAA9FC1A22), \
SPH_C64(0x1BD11BDAA9FC1A22) ) ); \
t2 = t0 ^ t1; \
} while (0)
#define TFBIG_ADDKEY_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
do { \
w0 = _mm256_add_epi64( w0, SKBI(k,s,0) ); \
w1 = _mm256_add_epi64( w1, SKBI(k,s,1) ); \
w2 = _mm256_add_epi64( w2, SKBI(k,s,2) ); \
w3 = _mm256_add_epi64( w3, SKBI(k,s,3) ); \
w4 = _mm256_add_epi64( w4, SKBI(k,s,4) ); \
w5 = _mm256_add_epi64( w5, _mm256_add_epi64( SKBI(k,s,5), \
_mm256_set_epi64x( SKBT(t,s,0), SKBT(t,s,0), \
SKBT(t,s,0), SKBT(t,s,0) ) ) ); \
w6 = _mm256_add_epi64( w6, _mm256_add_epi64( SKBI(k,s,6), \
_mm256_set_epi64x( SKBT(t,s,1), SKBT(t,s,1), \
SKBT(t,s,1), SKBT(t,s,1) ) ) ); \
w7 = _mm256_add_epi64( w7, _mm256_add_epi64( SKBI(k,s,7), \
_mm256_set_epi64x( s, s, s, s ) ) ); \
} while (0)
#define TFBIG_MIX_4WAY(x0, x1, rc) \
do { \
x0 = _mm256_add_epi64( x0, x1 ); \
x1 = _mm256_xor_si256( mm256_rotl_64( x1, rc ), x0 ); \
} while (0)
// typeless
#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \
TFBIG_MIX_4WAY(w0, w1, rc0); \
TFBIG_MIX_4WAY(w2, w3, rc1); \
TFBIG_MIX_4WAY(w4, w5, rc2); \
TFBIG_MIX_4WAY(w6, w7, rc3); \
} while (0)
#define TFBIG_4e(s) do { \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \
} while (0)
#define TFBIG_4o(s) do { \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \
} while (0)
// scale buf offset by 4
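// One UBI (Unique Block Iteration) compression of a 64 byte block for all
// 4 lanes at once: build the tweak from the byte count and block type,
// run the 72 Threefish-512 rounds (18 subkey injections plus a final one),
// then feed the message words forward into the new chaining value.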
#define UBI_BIG_4WAY(etype, extra) \
do { \
sph_u64 t0, t1, t2; \
__m256i h8; \
/* can LE be assumed? \
dec64le does nothing when SPH_LITTLE endian is set, as it is. \
__m256i m0 = _mm256_dec64le( buf ); \
__m256i m1 = _mm256_dec64le( buf + 8*4 ); \
__m256i m2 = _mm256_dec64le( buf + 16*4 ); \
__m256i m3 = _mm256_dec64le( buf + 24*4 ); \
__m256i m4 = _mm256_dec64le( buf + 32*4 ); \
__m256i m5 = _mm256_dec64le( buf + 40*4 ); \
__m256i m6 = _mm256_dec64le( buf + 48*4 ); \
__m256i m7 = _mm256_dec64le( buf + 56*4 ); \
*/ \
__m256i m0 = buf[0]; \
__m256i m1 = buf[1]; \
__m256i m2 = buf[2]; \
__m256i m3 = buf[3]; \
__m256i m4 = buf[4]; \
__m256i m5 = buf[5]; \
__m256i m6 = buf[6]; \
__m256i m7 = buf[7]; \
\
__m256i p0 = m0; \
__m256i p1 = m1; \
__m256i p2 = m2; \
__m256i p3 = m3; \
__m256i p4 = m4; \
__m256i p5 = m5; \
__m256i p6 = m6; \
__m256i p7 = m7; \
t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
TFBIG_4e(0); \
TFBIG_4o(1); \
TFBIG_4e(2); \
TFBIG_4o(3); \
TFBIG_4e(4); \
TFBIG_4o(5); \
TFBIG_4e(6); \
TFBIG_4o(7); \
TFBIG_4e(8); \
TFBIG_4o(9); \
TFBIG_4e(10); \
TFBIG_4o(11); \
TFBIG_4e(12); \
TFBIG_4o(13); \
TFBIG_4e(14); \
TFBIG_4o(15); \
TFBIG_4e(16); \
TFBIG_4o(17); \
TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
h0 = _mm256_xor_si256( m0, p0 );\
h1 = _mm256_xor_si256( m1, p1 );\
h2 = _mm256_xor_si256( m2, p2 );\
h3 = _mm256_xor_si256( m3, p3 );\
h4 = _mm256_xor_si256( m4, p4 );\
h5 = _mm256_xor_si256( m5, p5 );\
h6 = _mm256_xor_si256( m6, p6 );\
h7 = _mm256_xor_si256( m7, p7 );\
} while (0)
#define DECL_STATE_BIG_4WAY \
__m256i h0, h1, h2, h3, h4, h5, h6, h7; \
sph_u64 bcount;
#define READ_STATE_BIG(sc) do { \
h0 = (sc)->h0; \
h1 = (sc)->h1; \
h2 = (sc)->h2; \
h3 = (sc)->h3; \
h4 = (sc)->h4; \
h5 = (sc)->h5; \
h6 = (sc)->h6; \
h7 = (sc)->h7; \
bcount = sc->bcount; \
} while (0)
#define WRITE_STATE_BIG(sc) do { \
(sc)->h0 = h0; \
(sc)->h1 = h1; \
(sc)->h2 = h2; \
(sc)->h3 = h3; \
(sc)->h4 = h4; \
(sc)->h5 = h5; \
(sc)->h6 = h6; \
(sc)->h7 = h7; \
sc->bcount = bcount; \
} while (0)
static void
skein_big_init_4way( skein512_4way_context *sc, const sph_u64 *iv )
{
sc->h0 = _mm256_set_epi64x( iv[0], iv[0],iv[0],iv[0] );
sc->h1 = _mm256_set_epi64x( iv[1], iv[1],iv[1],iv[1] );
sc->h2 = _mm256_set_epi64x( iv[2], iv[2],iv[2],iv[2] );
sc->h3 = _mm256_set_epi64x( iv[3], iv[3],iv[3],iv[3] );
sc->h4 = _mm256_set_epi64x( iv[4], iv[4],iv[4],iv[4] );
sc->h5 = _mm256_set_epi64x( iv[5], iv[5],iv[5],iv[5] );
sc->h6 = _mm256_set_epi64x( iv[6], iv[6],iv[6],iv[6] );
sc->h7 = _mm256_set_epi64x( iv[7], iv[7],iv[7],iv[7] );
sc->bcount = 0;
sc->ptr = 0;
}
static void
skein_big_core_4way( skein512_4way_context *sc, const void *data,
size_t len )
{
__m256i *vdata = (__m256i*)data;
__m256i *buf;
size_t ptr;
unsigned first;
DECL_STATE_BIG_4WAY
// len is the data size per lane in bytes, eg 64 or 80
// data points to the start of the interleaved 4 lane input
// ptr is a byte offset within one lane's buffer
// buf is an array of __m256i elements, one per 64 bit word of a lane
// buf_size is the size of one lane's buffer in bytes
// One element is 8 bytes (64 bits) scalar but 32 bytes (256 bits) 4way
// To index buf using ptr it has to be scaled 8 to 1; the amount of
// data to copy is 32 bytes per element instead of 8, ie one __m256i
buf = sc->buf;
ptr = sc->ptr;
const int buf_size = 64; // bytes per lane, stored as 8 __m256i
// 64 byte len, no part block
if ( len <= buf_size - ptr )
{
memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );
sc->ptr = ptr + len;
return;
}
READ_STATE_BIG( sc );
first = ( bcount == 0 ) << 7;
// 64 byte len, only one block, no transform here.
// 80 byte len, transform first 64 bytes.
do {
size_t clen;
if ( ptr == buf_size )
{
bcount ++;
UBI_BIG_4WAY( 96 + first, 0 );
first = 0;
ptr = 0;
}
clen = buf_size - ptr;
if ( clen > len )
clen = len;
memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );
ptr += clen;
vdata += (clen>>3);
len -= clen;
} while ( len > 0 );
WRITE_STATE_BIG( sc );
sc->ptr = ptr;
}
static void
skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
void *dst, size_t out_len )
{
__m256i *buf;
size_t ptr;
unsigned et;
int i;
DECL_STATE_BIG_4WAY
/*
* Add bit padding if necessary.
*/
// if (n != 0) {
// unsigned z;
// unsigned char x;
//
// z = 0x80 >> n;
// x = ((ub & -z) | z) & 0xFF;
// skein_big_core(sc, &x, 1);
// }
buf = sc->buf;
ptr = sc->ptr;
const int buf_size = 64;
/*
* At that point, if ptr == 0, then the message was empty;
* otherwise, there is between 1 and 64 bytes (inclusive) which
* are yet to be processed. Either way, we complete the buffer
* to a full block with zeros (the Skein specification mandates
* that an empty message is padded so that there is at least
* one block to process).
*
* Once this block has been processed, we do it again, with
* a block full of zeros, for the output (that block contains
* the encoding of "0", over 8 bytes, then padded with zeros).
*/
// 64 byte len, process only block
// 80 byte len, process last part block (16 bytes) padded.
READ_STATE_BIG(sc);
memset_zero_m256i( buf + (ptr>>3), (buf_size - ptr) >> 3 );
et = 352 + ((bcount == 0) << 7);
UBI_BIG_4WAY( et, ptr );
memset_zero_m256i( buf, buf_size >> 3 );
bcount = 0;
UBI_BIG_4WAY( 510, 8 );
// for ( i = 0; i < 2; i ++ )
// {
// UBI_BIG_AVX2( et, ptr );
// if (i == 0)
// {
// memset_zero_m256i( buf, buf_size >> 3 );
// bcount = 0;
// et = 510;
// ptr = 8;
// }
// }
// Can LE be assumed? Should be ok since SPH_LITTLE_ENDIAN is defined
/* _mm256_enc64le( buf, h0 );
_mm256_enc64le( buf + 32, h1 );
_mm256_enc64le( buf + 64, h2 );
_mm256_enc64le( buf + 96, h3 );
_mm256_enc64le( buf + 128, h4 );
_mm256_enc64le( buf + 160, h5 );
_mm256_enc64le( buf + 192, h6 );
_mm256_enc64le( buf + 224, h7 );
*/
buf[0] = h0;
buf[1] = h1;
buf[2] = h2;
buf[3] = h3;
buf[4] = h4;
buf[5] = h5;
buf[6] = h6;
buf[7] = h7;
memcpy_m256i( dst, buf, out_len >> 3 );
// memcpy( dst, buf, out_len * 4 );
}
static const sph_u64 IV256[] = {
SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
};
static const sph_u64 IV512[] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
};
void
skein256_4way_init(void *cc)
{
skein_big_init_4way(cc, IV256);
}
void
skein256_4way(void *cc, const void *data, size_t len)
{
skein_big_core_4way(cc, data, len);
}
void
skein256_4way_close(void *cc, void *dst)
{
skein_big_close_4way(cc, 0, 0, dst, 32);
}
void
skein512_4way_init(void *cc)
{
skein_big_init_4way(cc, IV512);
}
void
skein512_4way(void *cc, const void *data, size_t len)
{
skein_big_core_4way(cc, data, len);
}
void
skein512_4way_close(void *cc, void *dst)
{
skein_big_close_4way(cc, 0, 0, dst, 64);
}
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,93 @@
/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */
/**
* Skein interface. The Skein specification defines three main
* functions, called Skein-256, Skein-512 and Skein-1024, which can be
* further parameterized with an output length. For the SHA-3
* competition, Skein-512 is used for output sizes of 224, 256, 384 and
* 512 bits; this is what this code implements. Thus, we hereafter call
* Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein
* specification defines as Skein-512-224, Skein-512-256, Skein-512-384
* and Skein-512-512, respectively.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_skein.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef __SKEIN_HASH_4WAY_H__
#define __SKEIN_HASH_4WAY_H__
#ifdef __cplusplus
extern "C"{
#endif
#include <stddef.h>
#include "algo/sha/sph_types.h"
#include "avxdefs.h"
// Output size in bits
#define SPH_SIZE_skein256 256
#define SPH_SIZE_skein512 512
#ifdef __AVX2__
typedef struct {
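// buf holds one 64 byte Skein block per lane, 4 lanes interleaved as
// 8 __m256i (one vector per 64 bit word position).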
__m256i buf[8] __attribute__ ((aligned (32)));
__m256i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein512_4way_context;
void skein512_4way_init(void *cc);
void skein512_4way(void *cc, const void *data, size_t len);
void skein512_4way_close(void *cc, void *dst);
//void sph_skein512_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __AVX__
typedef struct {
__m128i buf[8] __attribute__ ((aligned (32)));
__m128i h0, h1, h2, h3, h4, h5, h6, h7;
size_t ptr;
sph_u64 bcount;
} skein256_4way_context;
void skein256_4way_init(void *cc);
void skein256_4way(void *cc, const void *data, size_t len);
void skein256_4way_close(void *cc, void *dst);
//void sph_skein256_addbits_and_close(
// void *cc, unsigned ub, unsigned n, void *dst);
#endif
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -5,42 +5,28 @@
#include <openssl/sha.h> #include <openssl/sha.h>
#include "algo/sha/sph_sha2.h" #include "algo/sha/sph_sha2.h"
typedef struct {
sph_skein512_context skein;
#ifndef USE_SPH_SHA
SHA256_CTX sha256;
#else
sph_sha256_context sha256;
#endif
} skein_ctx_holder;
skein_ctx_holder skein_ctx;
void init_skein_ctx()
{
sph_skein512_init( &skein_ctx.skein );
#ifndef USE_SPH_SHA
SHA256_Init( &skein_ctx.sha256 );
#else
sph_sha256_init( &skein_ctx.sha256 );
#endif
}
void skeinhash(void *state, const void *input) void skeinhash(void *state, const void *input)
{ {
skein_ctx_holder ctx __attribute__ ((aligned (64)));
memcpy( &ctx, &skein_ctx, sizeof(skein_ctx) );
uint32_t hash[16] __attribute__ ((aligned (64))); uint32_t hash[16] __attribute__ ((aligned (64)));
sph_skein512_context ctx_skein;
sph_skein512( &ctx.skein, input, 80 ); #ifndef USE_SPH_SHA
sph_skein512_close( &ctx.skein, hash ); SHA256_CTX ctx_sha256;
#else
sph_sha256_context ctx_sha256;
#endif
sph_skein512_init( &ctx_skein );
sph_skein512( &ctx_skein, input, 80 );
sph_skein512_close( &ctx_skein, hash );
#ifndef USE_SPH_SHA #ifndef USE_SPH_SHA
SHA256_Update( &ctx.sha256, hash, 64 ); SHA256_Init( &ctx_sha256 );
SHA256_Final( (unsigned char*) hash, &ctx.sha256 ); SHA256_Update( &ctx_sha256, (unsigned char*)hash, 64 );
SHA256_Final( (unsigned char*) hash, &ctx_sha256 );
#else #else
sph_sha256( &ctx.sha256, hash, 64 ); sph_sha256_init( &ctx_sha256 );
sph_sha256_close( &ctx.sha256, hash ); sph_sha256( &ctx_sha256, hash, 64 );
sph_sha256_close( &ctx_sha256, hash );
#endif #endif
memcpy(state, hash, 32); memcpy(state, hash, 32);
@@ -77,15 +63,3 @@ int scanhash_skein(int thr_id, struct work *work,
return 0; return 0;
} }
int64_t skein_get_max64() { return 0x7ffffLL; }
bool register_skein_algo( algo_gate_t* gate )
{
init_skein_ctx();
gate->optimizations = SSE2_OPT | SHA_OPT;
gate->scanhash = (void*)&scanhash_skein;
gate->hash = (void*)&skeinhash;
gate->get_max64 = (void*)&skein_get_max64;
return true;
};

95
algo/skein/skein2-4way.c Normal file
View File

@@ -0,0 +1,95 @@
#include "skein-gate.h"
#include "algo-gate-api.h"
#include <string.h>
#include <stdint.h>
#include "skein-hash-4way.h"
#if defined(__AVX2__)
void skein2hash_4way( void *output, const void *input )
{
skein512_4way_context ctx;
uint64_t hash[8*4] __attribute__ ((aligned (64)));
uint64_t *out64 = (uint64_t*)output;
skein512_4way_init( &ctx );
skein512_4way( &ctx, input, 80 );
skein512_4way_close( &ctx, hash );
skein512_4way_init( &ctx );
skein512_4way( &ctx, hash, 64 );
skein512_4way_close( &ctx, hash );
m256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
}
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
uint32_t hash[4*8] __attribute__ ((aligned (64)));
uint32_t vdata[20*4] __attribute__ ((aligned (64)));
uint32_t endiandata[20] __attribute__ ((aligned (64)));
uint64_t *edata = (uint64_t*)endiandata;
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t Htarg = ptarget[7];
const uint32_t first_nonce = pdata[19];
uint32_t n = first_nonce;
// hash is returned deinterleaved
uint32_t *nonces = work->nonces;
bool *found = work->nfound;
int num_found;
swab32_array( endiandata, pdata, 20 );
m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
uint32_t *noncep0 = vdata + 73; // 9*8 + 1
uint32_t *noncep1 = vdata + 75;
uint32_t *noncep2 = vdata + 77;
uint32_t *noncep3 = vdata + 79;
do
{
found[0] = found[1] = found[2] = found[3] = false;
num_found = 0;
be32enc( noncep0, n );
be32enc( noncep1, n+1 );
be32enc( noncep2, n+2 );
be32enc( noncep3, n+3 );
skein2hash_4way( hash, vdata );
if ( hash[7] < Htarg && fulltest( hash, ptarget ) )
{
found[0] = true;
num_found++;
nonces[0] = n;
}
if ( (hash+8)[7] < Htarg && fulltest( hash+8, ptarget ) )
{
found[1] = true;
num_found++;
nonces[1] = n+1;
}
if ( (hash+16)[7] < Htarg && fulltest( hash+16, ptarget ) )
{
found[2] = true;
num_found++;
nonces[2] = n+2;
}
if ( (hash+24)[7] < Htarg && fulltest( hash+24, ptarget ) )
{
found[3] = true;
num_found++;
nonces[3] = n+3;
}
n += 4;
} while ( (num_found == 0) && (n < max_nonce)
&& !work_restart[thr_id].restart );
*hashes_done = n - first_nonce + 1;
return num_found;
}
#endif

27
algo/skein/skein2-gate.c Normal file
View File

@@ -0,0 +1,27 @@
#include "skein2-gate.h"
#include "algo-gate-api.h"
//#include <string.h>
#include <stdint.h>
#include "sph_skein.h"
//#include "skein-hash-avx2.h"
int64_t skein2_get_max64 ()
{
return 0x7ffffLL;
}
bool register_skein2_algo( algo_gate_t* gate )
{
#if defined (FOUR_WAY) && defined (__AVX2__)
gate->optimizations = SSE2_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_skein2_4way;
gate->hash = (void*)&skein2hash_4way;
#else
gate->optimizations = SSE2_OPT;
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
#endif
gate->get_max64 = (void*)&skein2_get_max64;
return true;
};

15
algo/skein/skein2-gate.h Normal file
View File

@@ -0,0 +1,15 @@
#ifndef __SKEIN2_GATE_H__
#define __SKEIN2_GATE_H__
#include <stdint.h>
#if defined(__AVX2__)
void skein2hash_4way( void *output, const void *input );
int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t* hashes_done );
#endif
void skein2hash( void *output, const void *input );
int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
#endif

View File

@@ -65,16 +65,4 @@ int scanhash_skein2(int thr_id, struct work *work,
return 0; return 0;
} }
int64_t skein2_get_max64 ()
{
return 0x7ffffLL;
}
bool register_skein2_algo( algo_gate_t* gate )
{
gate->scanhash = (void*)&scanhash_skein2;
gate->hash = (void*)&skein2hash;
gate->get_max64 = (void*)&skein2_get_max64;
return true;
};

View File

@@ -39,6 +39,24 @@
extern "C"{ extern "C"{
#endif #endif
void dump_sph_context( sph_u64 ptr, sph_u64 bcount, uint64_t* buf,
sph_u64 h0, sph_u64 h1, sph_u64 h2, sph_u64 h3, sph_u64 h4, sph_u64 h5,
sph_u64 h6, sph_u64 h7 )
{
//scalar
printf("sptr= %llu, bcount= %llu\n", ptr, bcount );
printf("sbuf: %016llx %016llx %016llx %016llx\n", *((uint64_t*)buf),
*((uint64_t*)buf+1), *((uint64_t*)buf+2), *((uint64_t*)buf+3) );
printf(" %016llx %016llx %016llx %016llx\n", *((uint64_t*)buf+4),
*((uint64_t*)buf+5), *((uint64_t*)buf+6), *((uint64_t*)buf+7) );
printf("sh:%016llx %016llx %016llx %016llx\n", h0, h1, h2, h3 );
printf(" %016llx %016llx %016llx %016llx\n", h4, h5, h6, h7 );
}
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN
#define SPH_SMALL_FOOTPRINT_SKEIN 1 #define SPH_SMALL_FOOTPRINT_SKEIN 1
@@ -883,6 +901,7 @@ skein_big_core(sph_skein_big_context *sc, const void *data, size_t len)
} }
READ_STATE_BIG(sc); READ_STATE_BIG(sc);
first = (bcount == 0) << 7; first = (bcount == 0) << 7;
do { do {
size_t clen; size_t clen;

187
avxdefs.h
View File

@@ -3,6 +3,7 @@
#include <inttypes.h> #include <inttypes.h>
#include <immintrin.h> #include <immintrin.h>
#include <memory.h>
// Use these overlays to access the same data in memory as different types // Use these overlays to access the same data in memory as different types
// //
@@ -45,7 +46,6 @@ inline void memset_zero_m256i( __m256i *dst, int n )
{ {
__m256i zero = _mm256_setzero_si256(); __m256i zero = _mm256_setzero_si256();
for ( int i = 0; i < n; i++ ) dst[i] = zero; for ( int i = 0; i < n; i++ ) dst[i] = zero;
// for ( int i = 0; i < n; i++ ) dst[i] = _mm256_xor_si256( dst[i], dst[i] );
} }
inline void memset_m256i( __m256i *dst, const __m256i a, int n ) inline void memset_m256i( __m256i *dst, const __m256i a, int n )
@@ -54,7 +54,7 @@ inline void memset_m256i( __m256i *dst, const __m256i a, int n )
} }
// Optimized copying using vectors. For misaligned data or more ganuularity // Optimized copying using vectors. For misaligned data or more ganuularity
// use __m228i versions or plain memcpy as appropriate. // use __m128i versions or plain memcpy as appropriate.
// Copying fixed size // Copying fixed size
@@ -289,6 +289,35 @@ inline __m256i mm256_byteswap_epi32( __m256i x )
_mm256_or_si256( x2, x3 ) ); _mm256_or_si256( x2, x3 ) );
} }
inline __m256i mm256_byteswap_epi64( __m256i x )
{
// x = (x >> 32) | (x << 32)
x = _mm256_or_si256( _mm256_srli_epi64( x, 32 ), _mm256_slli_epi64( x, 32 ) );
// x = ( (x & 0xFFFF0000FFFF0000) >> 16 ) | ( (x & 0x0000FFFF0000FFFF) << 16 )
x = _mm256_or_si256(
_mm256_srli_epi64(
_mm256_and_si256( x,
_mm256_set_epi64x( 0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000,
0xFFFF0000FFFF0000, 0xFFFF0000FFFF0000 ) ), 16 ),
_mm256_slli_epi64(
_mm256_and_si256( x,
_mm256_set_epi64x( 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF,
0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ) ), 16 ));
// x = ( (x & 0xFF00FF00FF00FF00) >> 8 ) | ( (x & 0x00FF00FF00FF00FF) << 16 )
x = _mm256_or_si256(
_mm256_srli_epi64(
_mm256_and_si256( x,
_mm256_set_epi64x( 0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00,
0xFF00FF00FF00FF00, 0xFF00FF00FF00FF00 ) ), 8 ),
_mm256_slli_epi64(
_mm256_and_si256( x,
_mm256_set_epi64x( 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF,
0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ) ), 8 ));
return x;
}
#endif // AVX2 #endif // AVX2
// AVX replacements for vectorized data // AVX replacements for vectorized data
@@ -492,8 +521,11 @@ inline void mcpy( void* dst, const void* src, int n )
// rotate bits in 2 uint64 // rotate bits in 2 uint64
// _m128i mm_rotr_64( __m128i, int ) // _m128i mm_rotr_64( __m128i, int )
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \ #define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \
_mm_slli_epi64(w, 64 - c)) _mm_slli_epi64( w, 64-c ) )
#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \
_mm_slli_epi32( w, 32-c ) )
// swap 128 bit source vectors // swap 128 bit source vectors
// void mm128_swap128( __m128i, __m128i ) // void mm128_swap128( __m128i, __m128i )
@@ -538,6 +570,7 @@ inline void mcpy( void* dst, const void* src, int n )
s0 = t; \ s0 = t; \
} while(0) } while(0)
// vectored version of BYTES_SWAP32 // vectored version of BYTES_SWAP32
inline __m128i mm_byteswap_epi32( __m128i x ) inline __m128i mm_byteswap_epi32( __m128i x )
{ {
@@ -552,3 +585,149 @@ inline __m128i mm_byteswap_epi32( __m128i x )
return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) ); return _mm_or_si128( _mm_or_si128( x0, x1 ), _mm_or_si128( x2, x3 ) );
} }
// Functions for interleaving buffers for vector processing
// change size to bits for consistency
#if defined (__AVX2__)
// interleave 4 arrays of 64 bit elements for AVX2 processing
// bit_len must be multiple of 64
inline void m256_interleave_4x64( uint64_t *dst, uint64_t *src0,
uint64_t *src1, uint64_t *src2, uint64_t *src3, int bit_len )
{
uint64_t *d = dst;
for ( int i = 0; i < bit_len>>6; i++, d += 4 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
}
}
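// Resulting layout as 64 bit words:
// dst = { s0[0], s1[0], s2[0], s3[0], s0[1], s1[1], s2[1], s3[1], ... }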
// Deinterleave 4 arrays into individual 64 bit arrays for scalar processing
// bit_len must be multiple of 64
inline void m256_deinterleave_4x64( uint64_t *dst0, uint64_t *dst1,
uint64_t *dst2,uint64_t *dst3, uint64_t *src, int bit_len )
{
uint64_t *s = src;
for ( int i = 0; i < bit_len>>6; i++, s += 4 )
{
*(dst0+i) = *s;
*(dst1+i) = *(s+1);
*(dst2+i) = *(s+2);
*(dst3+i) = *(s+3);
}
}
// interleave 8 arrays of 32 bit elements for AVX2 processing
// bit_len must be multiple of 32
inline void m256_interleave_8x32( uint32_t *dst, uint32_t *src0,
uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4,
uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len )
{
uint32_t *d = dst;
for ( int i = 0; i < bit_len>>5; i++, d += 8 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
*(d+4) = *(src4+i);
*(d+5) = *(src5+i);
*(d+6) = *(src6+i);
*(d+7) = *(src7+i);
}
}
// Deinterleave 8 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
inline void m256_deinterleave_8x32( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5,
uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len )
{
uint32_t *s = src;
for ( int i = 0; i < bit_len>>5; i++, s += 8 )
{
*(dst0+i) = *( s );
*(dst1+i) = *( s + 1 );
*(dst2+i) = *( s + 2 );
*(dst3+i) = *( s + 3 );
*(dst4+i) = *( s + 4 );
*(dst5+i) = *( s + 5 );
*(dst6+i) = *( s + 6 );
*(dst7+i) = *( s + 7 );
}
}
// convert 4x32 (128 bit) interleaved vectors to 4x64 (256 bit) vectors for AVX2
// bit_len must be multiple of 64
inline void m256_reinterleave_4x64( uint64_t *dst, uint32_t *src,
int bit_len )
{
uint32_t *d = (uint32_t*)dst;
for ( int i = 0; i < bit_len >> 5; i += 8 )
{
*( d + i ) = *( src + i ); // 0 <- 0 8 <- 8
*( d + i + 1 ) = *( src + i + 4 ); // 1 <- 4 9 <- 12
*( d + i + 2 ) = *( src + i + 1 ); // 2 <- 1 10 <- 9
*( d + i + 3 ) = *( src + i + 5 ); // 3 <- 5 11 <- 13
*( d + i + 4 ) = *( src + i + 2 ); // 4 <- 2 12 <- 10
*( d + i + 5 ) = *( src + i + 6 ); // 5 <- 6 13 <- 14
*( d + i + 6 ) = *( src + i + 3 ); // 6 <- 3 14 <- 11
*( d + i + 7 ) = *( src + i + 7 ); // 7 <- 7 15 <- 15
}
}
// convert 4x64 (256 bit) interleaved vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
inline void m128_reinterleave_4x32( uint32_t *dst, uint64_t *src,
int bit_len )
{
uint32_t *s = (uint32_t*)src;
for ( int i = 0; i < bit_len >> 5; i +=8 )
{
*( dst + i ) = *( s + i );
*( dst + i + 1 ) = *( s + i + 2 );
*( dst + i + 2 ) = *( s + i + 4 );
*( dst + i + 3 ) = *( s + i + 6 );
*( dst + i + 4 ) = *( s + i + 1 );
*( dst + i + 5 ) = *( s + i + 3 );
*( dst + i + 6 ) = *( s + i + 5 );
*( dst + i + 7 ) = *( s + i + 7 );
}
}
#endif
// interleave 4 arrays of 32 bit elements for AVX processing
// bit_len must be multiple of 32
inline void m128_interleave_4x32( uint32_t *dst, uint32_t *src0,
uint32_t *src1, uint32_t *src2, uint32_t *src3, int bit_len )
{
uint32_t *d = dst;
for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
}
}
// deinterleave 4 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
inline void m128_deinterleave_4x32( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *src, int bit_len )
{
uint32_t *s = src;
for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
{
*(dst0+i) = *s;
*(dst1+i) = *(s+1);
*(dst2+i) = *(s+2);
*(dst3+i) = *(s+3);
}
}

25
build-4way.sh Executable file
View File

@@ -0,0 +1,25 @@
#!/bin/bash
#if [ "$OS" = "Windows_NT" ]; then
# ./mingw64.sh
# exit 0
#fi
# Linux build
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
# Ubuntu 10.04 (gcc 4.4)
# extracflags="-O3 -march=native -Wall -D_REENTRANT -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants -fbranch-target-load-optimize2 -fsched2-use-superblocks -falign-loops=16 -falign-functions=16 -falign-jumps=16 -falign-labels=16"
# Debian 7.7 / Ubuntu 14.04 (gcc 4.7+)
#extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores"
CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
make -j 4
strip -s cpuminer

20
configure vendored
View File

@@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.2. # Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.3.
# #
# #
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='cpuminer-opt' PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.7.2' PACKAGE_VERSION='3.7.3'
PACKAGE_STRING='cpuminer-opt 3.7.2' PACKAGE_STRING='cpuminer-opt 3.7.3'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures cpuminer-opt 3.7.2 to adapt to many kinds of systems. \`configure' configures cpuminer-opt 3.7.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1392,7 +1392,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.7.2:";; short | recursive ) echo "Configuration of cpuminer-opt 3.7.3:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@@ -1497,7 +1497,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
cpuminer-opt configure 3.7.2 cpuminer-opt configure 3.7.3
generated by GNU Autoconf 2.69 generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc. Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.7.2, which was It was created by cpuminer-opt $as_me 3.7.3, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@ $ $0 $@
@@ -2981,7 +2981,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='cpuminer-opt' PACKAGE='cpuminer-opt'
VERSION='3.7.2' VERSION='3.7.3'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by cpuminer-opt $as_me 3.7.2, which was This file was extended by cpuminer-opt $as_me 3.7.3, which was
generated by GNU Autoconf 2.69. Invocation command line was generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6743,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
cpuminer-opt config.status 3.7.2 cpuminer-opt config.status 3.7.3
configured by $0, generated by GNU Autoconf 2.69, configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.7.2]) AC_INIT([cpuminer-opt], [3.7.3])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@@ -1698,7 +1698,7 @@ static void *miner_thread( void *userdata )
uint64_t hashes_done; uint64_t hashes_done;
struct timeval tv_start, tv_end, diff; struct timeval tv_start, tv_end, diff;
int64_t max64; int64_t max64;
bool nonce_found = false; int nonce_found = 0;
if ( algo_gate.do_this_thread( thr_id ) ) if ( algo_gate.do_this_thread( thr_id ) )
{ {
@@ -1792,7 +1792,7 @@ static void *miner_thread( void *userdata )
// Scan for nonce // Scan for nonce
nonce_found = (bool) algo_gate.scanhash( thr_id, &work, max_nonce, nonce_found = (bool) algo_gate.scanhash( thr_id, &work, max_nonce,
&hashes_done ); &hashes_done );
// record scanhash elapsed time // record scanhash elapsed time
gettimeofday(&tv_end, NULL); gettimeofday(&tv_end, NULL);
@@ -1805,11 +1805,26 @@ static void *miner_thread( void *userdata )
hashes_done / (diff.tv_sec + diff.tv_usec * 1e-6); hashes_done / (diff.tv_sec + diff.tv_usec * 1e-6);
pthread_mutex_unlock(&stats_lock); pthread_mutex_unlock(&stats_lock);
} }
// if nonce found, submit work // if nonce(s) found, submit work
if ( nonce_found && !opt_benchmark ) if ( nonce_found && !opt_benchmark )
{ {
if ( !submit_work(mythr, &work) ) int num_submitted = 0;
// look for 4way nonces
for ( int n = 0; n < 4; n++ )
if ( work.nfound[n] )
{
*algo_gate.get_nonceptr( work.data ) = work.nonces[n];
if ( !submit_work(mythr, &work) )
break;
num_submitted++;
}
// must be a one way algo, nonce is already in work data
if ( !num_submitted )
{
if ( !submit_work(mythr, &work) )
break; break;
}
// prevent stale work in solo // prevent stale work in solo
// we can't submit twice a block! // we can't submit twice a block!
if (!have_stratum && !have_longpoll) if (!have_stratum && !have_longpoll)
@@ -1821,6 +1836,8 @@ static void *miner_thread( void *userdata )
} }
} }
// display hashrate // display hashrate
if (!opt_quiet) if (!opt_quiet)
{ {
char hc[16]; char hc[16];
@@ -1829,6 +1846,7 @@ static void *miner_thread( void *userdata )
char hr_units[2] = {0,0}; char hr_units[2] = {0,0};
double hashcount = thr_hashcount[thr_id]; double hashcount = thr_hashcount[thr_id];
double hashrate = thr_hashrates[thr_id]; double hashrate = thr_hashrates[thr_id];
//printf("display count= %.3f, tcount= %.3f, rate= %03f trate= %03f\n", hashcount, thr_hashcount[thr_id], hashrate,thr_hashrates[thr_id] );
if ( hashcount ) if ( hashcount )
{ {
scale_hash_for_display( &hashcount, hc_units ); scale_hash_for_display( &hashcount, hc_units );
@@ -2290,7 +2308,7 @@ static void *stratum_thread(void *userdata )
if ( !s ) if ( !s )
{ {
stratum_disconnect(&stratum); stratum_disconnect(&stratum);
applog(LOG_ERR, "Stratum connection interrupted"); // applog(LOG_WARNING, "Stratum connection interrupted");
continue; continue;
} }
if (!stratum_handle_method(&stratum, s)) if (!stratum_handle_method(&stratum, s))
@@ -2364,7 +2382,8 @@ void show_version_and_exit(void)
void show_usage_and_exit(int status) void show_usage_and_exit(int status)
{ {
if (status) if (status)
fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n"); fprintf(stderr, "Try `--help' for more information.\n");
// fprintf(stderr, "Try `" PACKAGE_NAME " --help' for more information.\n");
else else
printf(usage); printf(usage);
exit(status); exit(status);

View File

@@ -354,6 +354,8 @@ struct work {
char *job_id; char *job_id;
size_t xnonce2_len; size_t xnonce2_len;
unsigned char *xnonce2; unsigned char *xnonce2;
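// Up to 4 candidate nonces per scan for 4 way vectored algos;
// nfound flags which lanes found a share.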
uint32_t nonces[4];
bool nfound[4];
}; };
struct stratum_job { struct stratum_job {
@@ -510,6 +512,7 @@ enum algos {
ALGO_PENTABLAKE, ALGO_PENTABLAKE,
ALGO_PHI1612, ALGO_PHI1612,
ALGO_PLUCK, ALGO_PLUCK,
ALGO_POLYTIMOS,
ALGO_QUARK, ALGO_QUARK,
ALGO_QUBIT, ALGO_QUBIT,
ALGO_SCRYPT, ALGO_SCRYPT,
@@ -578,6 +581,7 @@ static const char* const algo_names[] = {
"pentablake", "pentablake",
"phi1612", "phi1612",
"pluck", "pluck",
"polytimos",
"quark", "quark",
"qubit", "qubit",
"scrypt", "scrypt",
@@ -676,7 +680,7 @@ Options:\n\
c11 Chaincoin\n\ c11 Chaincoin\n\
cryptolight Cryptonight-light\n\ cryptolight Cryptonight-light\n\
cryptonight cryptonote, Monero (XMR)\n\ cryptonight cryptonote, Monero (XMR)\n\
decred\n\ decred Blake256r8dcr\n\
deep Deepcoin (DCN)\n\ deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\ dmd-gr Diamond\n\
drop Dropcoin\n\ drop Dropcoin\n\
@@ -697,9 +701,10 @@ Options:\n\
myr-gr Myriad-Groestl\n\ myr-gr Myriad-Groestl\n\
neoscrypt NeoScrypt(128, 2, 1)\n\ neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\ nist5 Nist5\n\
pentablake Pentablake\n\ pentablake 5 x blake512\n\
phi1612 phi, LUX coin\n\ phi1612 phi, LUX coin\n\
pluck Pluck:128 (Supcoin)\n\ pluck Pluck:128 (Supcoin)\n\
polytimos\n\
quark Quark\n\ quark Quark\n\
qubit Qubit\n\ qubit Qubit\n\
scrypt scrypt(1024, 1, 1) (default)\n\ scrypt scrypt(1024, 1, 1) (default)\n\

4
util.c
View File

@@ -1069,7 +1069,7 @@ char *stratum_recv_line(struct stratum_ctx *sctx)
time(&rstart); time(&rstart);
if (!socket_full(sctx->sock, 60)) { if (!socket_full(sctx->sock, 60)) {
applog(LOG_ERR, "stratum_recv_line timed out"); applog(LOG_WARNING, "stratum_recv_line timed out");
goto out; goto out;
} }
do { do {
@@ -1092,7 +1092,7 @@ char *stratum_recv_line(struct stratum_ctx *sctx)
} while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n"));
if (!ret) { if (!ret) {
applog(LOG_ERR, "stratum_recv_line failed"); applog(LOG_WARNING, "stratum_recv_line failed");
goto out; goto out;
} }
} }

View File

@@ -3,7 +3,7 @@
make distclean || echo clean make distclean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=core-avx2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe mv cpuminer.exe cpuminer-aes-avx2.exe
@@ -11,7 +11,7 @@ mv cpuminer.exe cpuminer-aes-avx2.exe
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=corei7-avx -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=corei7-avx -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx.exe mv cpuminer.exe cpuminer-aes-avx.exe
@@ -19,7 +19,7 @@ mv cpuminer.exe cpuminer-aes-avx.exe
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -maes -msse4.2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -maes -msse4.2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-sse42.exe mv cpuminer.exe cpuminer-aes-sse42.exe
@@ -27,7 +27,7 @@ mv cpuminer.exe cpuminer-aes-sse42.exe
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=corei7 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=corei7 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse42.exe mv cpuminer.exe cpuminer-sse42.exe
@@ -35,7 +35,7 @@ mv cpuminer.exe cpuminer-sse42.exe
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O3 -march=core2 -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl CFLAGS="-O3 -march=core2 -Wall -DUSE_SPH_SHA" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
make -j 4 make -j 4
strip -s cpuminer.exe strip -s cpuminer.exe
mv cpuminer.exe cpuminer-sse2.exe mv cpuminer.exe cpuminer-sse2.exe