v3.7.4

v3.7.3
v3.7.2
2025-09-17 23:44:27 +00:00 · 2017-11-28 16:32:04 -05:00 · 2017-11-20 21:19:15 -05:00 · 2017-11-01 11:03:23 -04:00 · 2017-10-31 00:25:24 -04:00 · 2017-10-17 11:38:59 -04:00
151 changed files with 23901 additions and 1104 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,6 @@ autom4te.cache
 Makefile
 Makefile.in
 INSTALL
-configure
 configure.lineno
 depcomp
 missing
--- a/34
+++ b/34
@@ -5,19 +5,31 @@
 # ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3
 #

-FROM ubuntu:16.04
-RUN BUILD_DEPS="build-essential \
-    libssl-dev \
-	  libgmp-dev \
-	  libcurl4-openssl-dev \
-	  libjansson-dev \
-	  automake" && \
+# Build
+FROM ubuntu:16.04 as builder

-	  apt-get update && \
-	  apt-get install -y ${BUILD_DEPS}
+RUN apt-get update \
+  && apt-get install -y \
+    build-essential \
+    libssl-dev \
+    libgmp-dev \
+    libcurl4-openssl-dev \
+    libjansson-dev \
+    automake \
+  && rm -rf /var/lib/apt/lists/*

 COPY . /app/
-RUN	cd /app/ && ./build.sh
+RUN cd /app/ && ./build.sh

-ENTRYPOINT ["/app/cpuminer"]
+# App
+FROM ubuntu:16.04
+
+RUN apt-get update \
+  && apt-get install -y \
+    libcurl3 \
+    libjansson4 \
+  && rm -rf /var/lib/apt/lists/*
+
+COPY --from=builder /app/cpuminer .
+ENTRYPOINT ["./cpuminer"]
 CMD ["-h"]
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,29 +22,6 @@ cpuminer_SOURCES = \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
-  algo/groestl/sph_groestl.c \
-  algo/skein/sph_skein.c \
-  algo/bmw/sph_bmw.c \
-  algo/shavite/sph_shavite.c \
-  algo/shavite/shavite.c \
-  algo/echo/sph_echo.c \
-  algo/blake/sph_blake.c \
-  algo/blake/sph_blake2b.c \
-  algo/heavy/sph_hefty1.c \
-  algo/blake/mod_blakecoin.c \
-  algo/luffa/sph_luffa.c \
-  algo/cubehash/sph_cubehash.c \
-  algo/simd/sph_simd.c \
-  algo/hamsi/sph_hamsi.c \
-  algo/fugue/sph_fugue.c \
-  algo/gost/sph_gost.c \
-  algo/jh/sph_jh.c \
-  algo/keccak/sph_keccak.c \
-  algo/keccak/keccak.c\
-  algo/sha/sph_sha2.c \
-  algo/sha/sph_sha2big.c \
-  algo/shabal/sph_shabal.c \
-  algo/whirlpool/sph_whirlpool.c\
  crypto/blake2s.c \
  crypto/oaes_lib.c \
  crypto/c_keccak.c \
@@ -62,26 +39,44 @@ cpuminer_SOURCES = \
  algo/argon2/ar2/ar2-scrypt-jane.c \
  algo/argon2/ar2/blake2b.c \
  algo/axiom.c \
+  algo/blake/sph_blake.c \
+  algo/blake/blake-hash-4way.c \
+  algo/blake/blake-gate.c \
  algo/blake/blake.c \
+  algo/blake/blake-4way.c \
+  algo/blake/sph_blake2b.c \
  algo/blake/blake2b.c \
  algo/blake/blake2s.c \
+  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
+  algo/blake/decred-gate.c \
  algo/blake/decred.c \
+  algo/blake/decred-4way.c \
+  algo/blake/pentablake-gate.c \
+  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
+  algo/bmw/sph_bmw.c \
  algo/bmw/bmw256.c \
-  algo/cubehash/sse2/cubehash_sse2.c\
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
  algo/cryptonight/cryptonight-aesni.c\
  algo/cryptonight/cryptonight.c\
+  algo/cubehash/sph_cubehash.c \
+  algo/cubehash/sse2/cubehash_sse2.c\
  algo/drop.c \
+  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
  algo/fresh.c \
+  algo/gost/sph_gost.c \
+  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
  algo/groestl/myr-groestl.c \
  algo/groestl/aes_ni/hash-groestl.c \
  algo/groestl/aes_ni/hash-groestl256.c \
+  algo/fugue/sph_fugue.c \
+  algo/hamsi/sph_hamsi.c \
  algo/haval/haval.c\
+  algo/heavy/sph_hefty1.c \
  algo/heavy/heavy.c \
  algo/heavy/bastion.c \
  algo/hmq1725.c \
@@ -90,7 +85,19 @@ cpuminer_SOURCES = \
  algo/hodl/hodl-wolf.c \
  algo/hodl/sha512_avx.c \
  algo/hodl/sha512_avx2.c \
+  algo/jh/sph_jh.c \
+  algo/jh/jh-hash-4way.c \
+  algo/jh/jha-gate.c \
+  algo/jh/jha-4way.c \
+  algo/jh/jha.c \
+  algo/keccak/sph_keccak.c \
+  algo/keccak/keccak.c\
+  algo/keccak/keccak-hash-4way.c \
+  algo/keccak/keccak-4way.c\
+  algo/keccak/keccak-gate.c \
+  algo/keccak/sse2/keccak.c \
  algo/lbry.c \
+  algo/luffa/sph_luffa.c \
  algo/luffa/luffa.c \
  algo/luffa/sse2/luffa_for_sse2.c \
  algo/lyra2/lyra2.c \
@@ -99,41 +106,65 @@ cpuminer_SOURCES = \
  algo/lyra2/lyra2re.c \
  algo/lyra2/zcoin.c \
  algo/lyra2/lyra2z330.c \
-  algo/keccak/sse2/keccak.c \
  algo/m7m.c \
  algo/neoscrypt.c \
-  algo/nist5.c \
+  algo/nist5/nist5-gate.c \
+  algo/nist5/nist5-4way.c \
+  algo/nist5/nist5.c \
  algo/pluck.c \
+  algo/polytimos/polytimos-gate.c \
+  algo/polytimos/polytimos.c \
  algo/quark/quark.c \
  algo/qubit/qubit.c \
  algo/qubit/deep.c \
  algo/ripemd/sph_ripemd.c \
  algo/scrypt.c \
  algo/scryptjane/scrypt-jane.c \
+  algo/sha/sph_sha2.c \
+  algo/sha/sph_sha2big.c \
  algo/sha/sha2.c \
  algo/sha/sha256t.c \
+  algo/shabal/sph_shabal.c \
+  algo/shavite/sph_shavite.c \
+  algo/shavite/shavite.c \
+  algo/simd/sph_simd.c \
  algo/simd/sse2/nist.c \
  algo/simd/sse2/vector.c \
+  algo/skein/sph_skein.c \
+  algo/skein/skein-hash-4way.c \
  algo/skein/skein.c \
+  algo/skein/skein-4way.c \
+  algo/skein/skein-gate.c \
  algo/skein/skein2.c \
-  algo/s3.c \
+  algo/skein/skein2-4way.c \
+  algo/skein/skein2-gate.c \
+  algo/skunk.c \
+  algo/sm3/sm3.c \
  algo/tiger/sph_tiger.c \
  algo/timetravel.c \
  algo/timetravel10.c \
+  algo/tribus/tribus-gate.c \
+  algo/tribus/tribus.c \
+  algo/tribus/tribus-4way.c \
  algo/veltor.c \
+  algo/whirlpool/sph_whirlpool.c \
+  algo/whirlpool/whirlpool-hash-4way.c \
+  algo/whirlpool/whirlpool-gate.c \
+  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
+  algo/x11/phi1612.c \
  algo/x11/x11.c \
  algo/x11/x11evo.c \
  algo/x11/x11gost.c \
  algo/x11/c11.c \
  algo/x13/x13.c \
+  algo/x13/x13sm3.c \
  algo/x14/x14.c \
  algo/x15/x15.c \
  algo/x17/x17.c \
  algo/xevan.c \
  algo/yescrypt/yescrypt.c \
-  algo/yescrypt/yescrypt-common.c \
  algo/yescrypt/sha256_Y.c\
  algo/yescrypt/yescrypt-simd.c\
  algo/zr5.c
--- a/README.md
+++ b/README.md
@@ -35,19 +35,22 @@ Supported Algorithms
                          heavy        Heavy
                          hmq1725      Espers
                          hodl         Hodlcoin
+                          jha          jackpotcoin
                          keccak       Keccak
                          lbry         LBC, LBRY Credits
                          luffa        Luffa
                          lyra2re      lyra2
-                          lyra2rev2    lyrav2, Vertcoin
+                          lyra2rev2    lyra2v2, Vertcoin
                          lyra2z       Zcoin (XZC)
                          lyra2z330    Lyra2 330 rows, Zoin (ZOI)
                          m7m          Magi (XMG)
                          myr-gr       Myriad-Groestl
                          neoscrypt    NeoScrypt(128, 2, 1)
                          nist5        Nist5
-                          pluck        Pluck:128 (Supcoin)
                          pentablake   Pentablake
+                          phi1612      phi, LUX coin
+                          pluck        Pluck:128 (Supcoin)
+                          polytimos
                          quark        Quark
                          qubit        Qubit
                          scrypt       scrypt(1024, 1, 1) (default)
@@ -58,7 +61,10 @@ Supported Algorithms
                          shavite3     Shavite3
                          skein        Skein+Sha (Skeincoin)
                          skein2       Double Skein (Woodcoin)
+                          skunk        Signatum (SIGT)
                          timetravel   Machinecoin (MAC)
+                          timetravel10 Bitcore
+                          tribus       Denarius (DNR)
                          vanilla      blake256r8vnl (VCash)
                          veltor
                          whirlpool
@@ -67,11 +73,13 @@ Supported Algorithms
                          x11evo       Revolvercoin
                          x11gost      sib (SibCoin)
                          x13          X13
+                          x13sm3       hsr (Hshare)
                          x14          X14
                          x15          X15
                          x17
                          xevan        Bitsend
-                          yescrypt
+                          yescrypt     Globalboost-Y (BSTY)
+                          yescryptr16  Yenten (YTN)
                          zr5          Ziftr

 Requirements
@@ -115,6 +123,10 @@ forum at:

 https://bitcointalk.org/index.php?topic=1326803.0

+All problem reports must be accompanied by a proper definition.
+This should include how the problem occurred, the command line and
+output from the miner showing the startup and any errors.
+
 Donations
 ---------

--- a/README.txt
+++ b/README.txt
@@ -1,6 +1,9 @@
 This file is included in the Windows binary package. Compile instructions
 for Linux and Windows can be found in RELEASE_NOTES.

+cpuminer is a console program that is executed from a DOS command prompt.
+There is no GUI and no mouse support.
+
 Choose the exe that best matches your CPU's features or use trial and
 error to find the fastest one that doesn't crash. Pay attention to
 the features listed at cpuminer startup to ensure you are mining at
@@ -8,15 +11,23 @@ optimum speed using all the available features.

 Architecture names and compile options used are only provided for Intel
 Core series. Pentium and Celeron often have fewer features.
-AMD is YMMV, see previous paragraph.

-Exe name                  Compile opts       Arch name
+AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
+supported by cpuminer-opt due to an incompatible implementation of SSE2 on
+these CPUs. Some algos may crash the miner with an invalid instruction.
+Users are recommended to use an unoptimized miner such as cpuminer-multi.

-cpuminer-sse2.exe         -march=core2,      Core2   
-cpuminer-sse42.exe        -march=corei7,     Nehalem
-cpuminer-aes-sse42.exe    -maes -msse4.2     Westmere
-cpuminer-aes-avx.exe      -march=corei7-avx, Sandybridge, Ivybridge
-cpuminer-aes-avx2.exe     -march=core-avx2,  Haswell, Broadwell, Skylake, Kabylake
+Exe name                  Compile opts         Arch name

+cpuminer-sse2.exe         -march=core2         Core2   
+cpuminer-sse42.exe        -march=corei7        Nehalem
+cpuminer-aes-sse42.exe    -maes -msse4.2       Westmere
+cpuminer-aes-avx.exe      -march=corei7-avx    Sandybridge, Ivybridge
+cpuminer-aes-avx2.exe     "-march=core-avx2"   Haswell, Broadwell, Skylake, Kabylake
+cpuminer-4way.exe         "-march=core-avx2 -DFOUR_WAY"

+4way requires a CPU with AES and AVX2. It is still under development and
+only a few algos are supported. See change log in RELEASE_NOTES in source
+package for supported algos.

+There is no binary support available for SHA on AMD Ryzen CPUs.
--- a/124
+++ b/124
@@ -6,6 +6,22 @@ compile flag.
 HW SHA support is only available when compiled from source, Windows binaries
 are not yet available.

+cpuminer-opt is a console program, if you're using a mouse you're doing it
+wrong.
+
+Security warning
+----------------
+
+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive, they are flagged simply because they are miners. The source
+code is open for anyone to inspect. If you don't trust the software, don't use
+it.
+
+The cryptographic code has been taken from trusted sources but has been
+modified for speed at the expense of accepted security practices. This
+code should not be imported into applications where secure cryptography is
+required.
+
 Compile Instructions
 --------------------

@@ -46,9 +62,16 @@ pthreads
 zlib

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and openssl 1.1
-or higher. Additional compile options may also be required such as
+or higher. Reports of improved performance on Ryzen when using openssl 1.0.2
+have been due to AVX and AVX2 optimizations added to that version.
+Additional improvements are expected on Ryzen with openssl 1.1.
+Additional compile options may also be required such as
 "-march=znver1" or "-msha".

+Additional instructions for static compilation can be found here:
+https://lxadm.com/Static_compilation_of_cpuminer
+Static builds should only be considered in a homogeneous HW and SW environment.
+Local builds will always have the best performance and compatibility.
+
 Extract cpuminer source.

 tar xvzf cpuminer-opt-x.y.z.tar.gz
@@ -60,10 +83,29 @@ Run ./build.sh to build on Linux or execute the following commands.
 CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
 make

+Additional optional compile flags, add the following to CFLAGS to activate:
+
+-DUSE_SPH_SHA
+
+SPH may give slightly better performance on algos that use sha256 when using
+openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
+better than SPH.
+
+-DFOUR_WAY
+
+4 way will give much better performance on supported algos with CPUs
+that have AVX2 and should only be used on CPUs with AVX2. 4 way algo
+support will be added incrementally, see change log below for supported algos.
+ 
 Start mining.

 ./cpuminer -a algo -o url -u username -p password

+Windows
+
+The following is how the Windows binary releases are built. It's old and
+not very good but it works, for me anyway.
+
 Building on Windows prerequisites:

 msys
@@ -100,6 +142,10 @@ Run winbuild.sh to build on Windows or execute the following commands.
 CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
 make

+Start mining
+
+cpuminer.exe -a algo -o url -u user -p password
+
 The following tips may be useful for older AMD CPUs.

 AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
@@ -118,6 +164,82 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.7.4
+
+Removed unnecessary build options.
+
+Added 4way support for tribus and nist5.
+
+v3.7.3
+
+Added polytimos algo.
+
+Introducing 4-way AVX2 optimization giving up to 4x performance improvement
+on many compute bound algos. First supported algos: skein, skein2, blake &
+keccak. This feature is only available when compiled from source. See above
+for instructions on how to enable 4-way during compilation.
+
+Updated Dockerfile.
+
+v3.7.2
+
+Fixed yescryptr16
+Changed default sha256 and sha512 to openssl. This should be used when
+compiling with openssl 1.0.2 or higher (Ubuntu 16.04).
+This should increase the hashrate for yescrypt, yescryptr16, m7m, xevan, skein,
+myr-gr & others  when openssl 1.0.2 is installed.
+Users with openssl 1.0.1 (Ubuntu 14.04) may get better performance by adding
+"-DUSE_SPH_SHA" to CFLAGS.
+Windows binaries are compiled with -DUSE_SPH_SHA and won't get the speedup.
+
+v3.7.1
+
+Added yescryptr16 algo for Yenten coin
+Added SHA support to yescrypt and yescryptr16
+Small code cleanup
+
+v3.7.0
+
+Fixed x14 misalignment bug.
+Fixed decred stake version bug.
+Getwork fixes for algos that use big endian data encoding: m7m, zr5, neoscrypt,
+decred.
+
+v3.6.10
+
+Fixed misalignment bug in hsr.
+
+v3.6.9
+
+Added phi1612 algo for LUX coin
+Added x13sm3 algo, alias hsr, for Hshare coin
+
+v3.6.8
+
+Fixed timetravel10 on Windows.
+
+v3.6.7
+
+Skunk algo added.
+Tribus a little faster.
+Minor restructuring.
+
+v3.6.6
+
+added tribus algo for Denarius (DNR)
+
+configure removed from .gitignore. This should allow git clone to compile
+on Windows/mingw.
+
+Fixed CPU temperature monitoring on some CPUs (Linux only).
+
+Fixed a compile error on FreeBSD (unsupported YMMV).
+
+v3.6.5
+
+Cryptonight a little faster.
+Added jha algo (Jackpotcoin) with AES optimizations.
+
 v3.6.4

 Added support for Bitcore (BTX) using the timetravel10 algo, optimized for
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -114,8 +114,8 @@ void init_algo_gate( algo_gate_t* gate )
   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
   gate->set_target              = (void*)&std_set_target;
-   gate->work_decode             = (void*)&std_work_decode;
-   gate->submit_getwork_result   = (void*)&std_submit_getwork_result;
+   gate->work_decode             = (void*)&std_le_work_decode;
+   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
   gate->calc_network_diff       = (void*)&std_calc_network_diff;
@@ -169,6 +169,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_HEAVY:        register_heavy_algo       ( gate ); break;
     case ALGO_HMQ1725:      register_hmq1725_algo     ( gate ); break;
     case ALGO_HODL:         register_hodl_algo        ( gate ); break;
+     case ALGO_JHA:          register_jha_algo         ( gate ); break;
     case ALGO_KECCAK:       register_keccak_algo      ( gate ); break;
     case ALGO_LBRY:         register_lbry_algo        ( gate ); break;
     case ALGO_LUFFA:        register_luffa_algo       ( gate ); break;
@@ -181,7 +182,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_NEOSCRYPT:    register_neoscrypt_algo   ( gate ); break;
     case ALGO_NIST5:        register_nist5_algo       ( gate ); break;
     case ALGO_PENTABLAKE:   register_pentablake_algo  ( gate ); break;
+     case ALGO_PHI1612:      register_phi1612_algo     ( gate ); break;
     case ALGO_PLUCK:        register_pluck_algo       ( gate ); break;
+     case ALGO_POLYTIMOS:    register_polytimos_algo   ( gate ); break;
     case ALGO_QUARK:        register_quark_algo       ( gate ); break;
     case ALGO_QUBIT:        register_qubit_algo       ( gate ); break;
     case ALGO_SCRYPT:       register_scrypt_algo      ( gate ); break;
@@ -191,9 +194,10 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_SHAVITE3:     register_shavite_algo     ( gate ); break;
     case ALGO_SKEIN:        register_skein_algo       ( gate ); break;
     case ALGO_SKEIN2:       register_skein2_algo      ( gate ); break;
-     case ALGO_S3:           register_s3_algo          ( gate ); break;
+     case ALGO_SKUNK:        register_skunk_algo       ( gate ); break;
     case ALGO_TIMETRAVEL:   register_timetravel_algo  ( gate ); break;
     case ALGO_TIMETRAVEL10: register_timetravel10_algo( gate ); break;
+     case ALGO_TRIBUS:       register_tribus_algo      ( gate ); break;
     case ALGO_VANILLA:      register_vanilla_algo     ( gate ); break;
     case ALGO_VELTOR:       register_veltor_algo      ( gate ); break;
     case ALGO_WHIRLPOOL:    register_whirlpool_algo   ( gate ); break;
@@ -202,11 +206,13 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_X11EVO:       register_x11evo_algo      ( gate ); break;
     case ALGO_X11GOST:      register_sib_algo         ( gate ); break;
     case ALGO_X13:          register_x13_algo         ( gate ); break;
+     case ALGO_X13SM3:       register_x13sm3_algo      ( gate ); break;
     case ALGO_X14:          register_x14_algo         ( gate ); break;
     case ALGO_X15:          register_x15_algo         ( gate ); break;
     case ALGO_X17:          register_x17_algo         ( gate ); break;
     case ALGO_XEVAN:        register_xevan_algo       ( gate ); break;
     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
+     case ALGO_YESCRYPTR16:  register_yescryptr16_algo ( gate ); break;
     case ALGO_ZR5:          register_zr5_algo         ( gate ); break;

 // restore warnings
@@ -276,17 +282,22 @@ const char* const algo_alias_map[][2] =
  { "droplp",            "drop"         },
  { "espers",            "hmq1725"      },
  { "flax",              "c11"          },
+  { "hsr",               "x13sm3"       },
+  { "jackpot",           "jha"          },
  { "jane",              "scryptjane"   }, 
  { "lyra2",             "lyra2re"      },
  { "lyra2v2",           "lyra2rev2"    },
  { "lyra2zoin",         "lyra2z330"    },
  { "myriad",            "myr-gr"       },
  { "neo",               "neoscrypt"    },
+  { "phi",               "phi1612"      },
 //  { "sia",               "blake2b"      },
  { "sib",               "x11gost"      },
  { "timetravel8",       "timetravel"   },
  { "yes",               "yescrypt"     },
  { "ziftr",             "zr5"          },
+  { "yenten",            "yescryptr16"  },
+  { "yescryptr8",        "yescrypt"     },
  { "zcoin",             "lyra2z"       },
  { "zoin",              "lyra2z330"    },
  { NULL,                NULL           }   
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -215,18 +215,20 @@ int64_t get_max64_0xffffLL();
 void std_set_target   ( struct work *work, double job_diff );
 void scrypt_set_target( struct work *work, double job_diff );

-bool std_work_decode( const json_t *val, struct work *work );
+bool std_le_work_decode( const json_t *val, struct work *work );
+bool std_be_work_decode( const json_t *val, struct work *work );
 bool jr2_work_decode( const json_t *val, struct work *work );

-bool std_submit_getwork_result( CURL *curl, struct work *work );
+bool std_le_submit_getwork_result( CURL *curl, struct work *work );
+bool std_be_submit_getwork_result( CURL *curl, struct work *work );
 bool jr2_submit_getwork_result( CURL *curl, struct work *work );

 void std_le_build_stratum_request( char *req, struct work *work );
 void std_be_build_stratum_request( char *req, struct work *work );
 void jr2_build_stratum_request   ( char *req, struct work *work );

-// set_work_data_endian target, default is do_nothing;
-void swab_work_data( struct work *work );
+// Default is do_nothing (assumed LE)
+void set_work_data_big_endian( struct work *work );

 double std_calc_network_diff( struct work *work );

--- a/algo/argon2/argon2a.c
+++ b/algo/argon2/argon2a.c
@@ -1,5 +1,3 @@
-#include "miner.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
--- a/algo/axiom.c
+++ b/algo/axiom.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -0,0 +1,114 @@
+#include "blake-gate.h"
+#include "sph_blake.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+#if defined (BLAKE_4WAY)
+
+/*
+ * Hash four interleaved inputs with the 4-way BLAKE-256 implementation.
+ *
+ * state: output buffer of at least 128 bytes; receives four 32-byte digests
+ *        back to back (lane 0 at offset 0, lane 1 at 32, lane 2 at 64,
+ *        lane 3 at 96).
+ * input: 4x32 interleaved message words, as built by scanhash_blake_4way().
+ */
+void blakehash_4way(void *state, const void *input)
+{
+     uint32_t hash0[16] __attribute__ ((aligned (64)));
+     uint32_t hash1[16] __attribute__ ((aligned (64)));
+     uint32_t hash2[16] __attribute__ ((aligned (64)));
+     uint32_t hash3[16] __attribute__ ((aligned (64)));
+     uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+     // Byte pointer for the output offsets; avoids arithmetic on void*.
+     uint8_t *out = (uint8_t*)state;
+     blake256_4way_context ctx;
+
+     blake256_4way_init( &ctx );
+     // NOTE(review): a length of 16 is passed here although the caller
+     // interleaves 80 bytes per lane -- confirm this is the intended length.
+     blake256_4way( &ctx, input, 16 );
+     blake256_4way_close( &ctx, vhash );
+
+     // Split the interleaved result back into one digest per lane.
+     m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     // Each lane gets its own digest (lanes 2 and 3 previously received
+     // a copy of lane 1).
+     memcpy( out,      hash0, 32 );
+     memcpy( out + 32, hash1, 32 );
+     memcpy( out + 64, hash2, 32 );
+     memcpy( out + 96, hash3, 32 );
+}
+
+/*
+ * Scan nonces four at a time for BLAKE-256 hashes that meet work->target.
+ *
+ * thr_id:      index into work_restart[], polled to abort the scan early.
+ * work:        work->data supplies the 20-word header and work->target the
+ *              target; work->nonces[] and work->nfound[] receive the
+ *              per-lane results.
+ * max_nonce:   the scan stops once the nonce counter reaches this value.
+ * hashes_done: receives n - first_nonce + 1 for the nonces scanned.
+ *
+ * Returns the number of lanes whose hash passed fulltest() in the last
+ * iteration (the loop exits as soon as any lane succeeds).
+ */
+int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t _ALIGN(32) endiandata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   int lane;
+
+   // we need big endian data...
+   swab32_array( endiandata, pdata, 20 );
+
+   // All four lanes start from the same header; only the nonce differs.
+   m128_interleave_4x32( vdata, endiandata, endiandata, endiandata,
+                         endiandata, 640 );
+
+   // With 4x32 interleaving, word 19 (the nonce) of lanes 0..3 sits in
+   // vdata[76..79]. Writing at noncep+4 or beyond would run off the end
+   // of the 80-word vdata array.
+   uint32_t *noncep = vdata + 76;   // 19*4
+   do {
+      for ( lane = 0; lane < 4; lane++ )
+      {
+         found[lane] = false;
+         be32enc( noncep + lane, n + lane );
+      }
+
+      blakehash_4way( hash, vdata );
+
+      for ( lane = 0; lane < 4; lane++ )
+      {
+         // Each lane's 8-word digest is checked against the target on
+         // its own; the cheap top-word test filters most candidates.
+         uint32_t *lane_hash = hash + 8*lane;
+
+         if ( lane_hash[7] == 0 && fulltest( lane_hash, ptarget ) )
+         {
+            found[lane] = true;
+            num_found++;
+            nonces[lane] = n + lane;
+            // As before, only a lane 0 hit is written back to the header.
+            if ( lane == 0 )
+               pdata[19] = n;
+         }
+      }
+
+      n += 4;
+      *hashes_done = n - first_nonce + 1;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -0,0 +1,26 @@
+#include "blake-gate.h"
+
+// Value installed as the gate's get_max64 callback by register_blake_algo().
+int64_t blake_get_max64 ()
+{
+  const int64_t blake_max64 = 0x7ffffLL;
+
+  return blake_max64;
+}
+
+// Install the blake entry points into the gate: the 4-way versions when
+// BLAKE_4WAY is defined, the scalar versions otherwise. Always returns true.
+// An 8-way AVX2 variant (scanhash_blake_8way / blakehash_8way) is not enabled.
+bool register_blake_algo( algo_gate_t* gate )
+{
+#if !defined(BLAKE_4WAY)
+  gate->hash          = (void*)&blakehash;
+  gate->scanhash      = (void*)&scanhash_blake;
+  gate->optimizations = SSE2_OPT;
+#else
+  gate->hash          = (void*)&blakehash_4way;
+  gate->scanhash      = (void*)&scanhash_blake_4way;
+  gate->optimizations = SSE2_OPT | AVX_OPT;
+#endif
+  gate->get_max64     = (void*)&blake_get_max64;
+  return true;
+}
+
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -0,0 +1,21 @@
+#ifndef __BLAKE_GATE_H__
+#define __BLAKE_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define BLAKE_4WAY
+#endif
+
+#if defined (BLAKE_4WAY)
+void blakehash_4way(void *state, const void *input);
+int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
+void blakehash( void *state, const void *input );
+int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -0,0 +1,105 @@
+/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
+/**
+ * BLAKE interface. BLAKE is a family of functions which differ by their
+ * output size; this implementation defines BLAKE for output sizes 224,
+ * 256, 384 and 512 bits. This implementation conforms to the "third
+ * round" specification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_blake.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+// Include guard: the macro defined here must be the one tested by #ifndef,
+// otherwise a second inclusion redefines the context structs below.
+#ifndef __BLAKE_HASH_4WAY__
+#define __BLAKE_HASH_4WAY__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+/**
+ * Output size (in bits) for BLAKE-256.
+ */
+#define SPH_SIZE_blake256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BLAKE-512.
+ */
+#define SPH_SIZE_blake512   512
+
+#endif
+
+#ifdef __AVX__
+/**
+ * Context for the 4-way BLAKE-256 functions; buffer and state are held
+ * in 128-bit vectors. Only built when __AVX__ is defined.
+ */
+typedef struct {
+        __m128i buf[16] __attribute__ ((aligned (64)));
+        __m128i H[8];
+        __m128i S[4];
+        size_t ptr;
+        sph_u32 T0, T1;
+} blake_4way_small_context;
+
+typedef blake_4way_small_context blake256_4way_context;
+
+void blake256_4way_init(void *cc);
+void blake256_4way(void *cc, const void *data, size_t len);
+void blake256_4way_close(void *cc, void *dst);
+void blake256_4way_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __AVX2__
+
+/**
+ * Context for the 4-way BLAKE-512 functions; buffer and state are held
+ * in 256-bit vectors. Only built when __AVX2__ is defined.
+ */
+typedef struct {
+        __m256i buf[16] __attribute__ ((aligned (64)));
+        __m256i H[8];
+        __m256i S[4];
+        size_t ptr;
+        sph_u64 T0, T1;
+} blake_4way_big_context;
+
+typedef blake_4way_big_context blake512_4way_context;
+
+void blake512_4way_init(void *cc);
+void blake512_4way(void *cc, const void *data, size_t len);
+void blake512_4way_close(void *cc, void *dst);
+void blake512_4way_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/blake/blake.c
+++ b/algo/blake/blake.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"
 #include "sph_blake.h"

@@ -90,18 +89,3 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
-int64_t blake_get_max64 ()
-{
-  return 0x7ffffLL;
-}
-
-bool register_blake_algo( algo_gate_t* gate )
-{
-  gate->scanhash  = (void*)&scanhash_blake;
-  gate->hash      = (void*)&blakehash;
-  gate->get_max64 = (void*)&blake_get_max64;
-  return true;
-}
-
-
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -3,16 +3,13 @@
 * tpruvot@github 2015-2016
 */

-#include "miner.h"
 #include "algo-gate-api.h"
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/sph_blake2b.h"

-
-static __thread sph_blake2b_ctx s_midstate;
-static __thread sph_blake2b_ctx s_ctx;
+//static __thread sph_blake2b_ctx s_midstate;
+//static __thread sph_blake2b_ctx s_ctx;
 #define MIDLEN 76
 #define A 64

@@ -28,6 +25,7 @@ void blake2b_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

+/*
 static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
 {
 	s_ctx.outlen = MIDLEN;
@@ -35,6 +33,7 @@ static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
 	sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
 	sph_blake2b_final(&s_ctx, (uint8_t*) output);
 }
+*/

 int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done )
@@ -220,6 +219,8 @@ bool register_blake2b_algo( algo_gate_t* gate )
  gate->hash                  = (void*)&blake2b_hash;
  gate->calc_network_diff     = (void*)&blake2b_calc_network_diff;
  gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->build_extraheader     = (void*)&blake2b_build_extraheader;
  gate->get_new_work          = (void*)&blake2b_get_new_work;
  gate->get_max64             = (void*)&blake2b_get_max64;
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"
 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -0,0 +1,153 @@
+#include "decred-gate.h"
+#include "sph_blake.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+#include <unistd.h>
+
+#if defined (DECRED_4WAY)
+
+static __thread blake256_4way_context blake_mid;
+static __thread bool ctx_midstate_done = false;
+
+// Hash a 4x32-interleaved, 180-byte-per-lane Decred header (work in progress).
+// Only lane 0, hashed with scalar sph_blake256, is written to state (32 bytes).
+void decred_hash_4way( void *state, const void *input )
+{
+     uint32_t hash0[16] __attribute__ ((aligned (64)));
+     uint32_t hash1[16] __attribute__ ((aligned (64)));
+     uint32_t hash2[16] __attribute__ ((aligned (64)));
+     uint32_t hash3[16] __attribute__ ((aligned (64)));
+     uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+     blake256_4way_context ctx __attribute__ ((aligned (64)));
+     sph_blake256_context ctx2 __attribute__ ((aligned (64)));
+     uint32_t hash[16] __attribute__ ((aligned (64)));
+     uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
+
+     m128_deinterleave_4x32( sin0, sin1, sin2, sin3, (uint32_t*)input, 180*8 );
+
+/*   Midstate path, disabled; tail and tail_len are only used in here.
+     const uint8_t *tail = (const uint8_t*)input + DECRED_MIDSTATE_LEN;
+     int tail_len = 180 - DECRED_MIDSTATE_LEN;
+//     #define MIDSTATE_LEN 128
+        uint8_t *ending = (uint8_t*) input;
+        ending += MIDSTATE_LEN;
+
+     if ( !ctx_midstate_done )
+     {
+          blake256_4way_init( &blake_mid );
+          blake256_4way( &blake_mid, input, DECRED_MIDSTATE_LEN );
+          ctx_midstate_done = true;
+     }
+     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
+
+     blake256_4way( &ctx, tail, tail_len );
+     blake256_4way_close( &ctx, vhash );
+*/
+
+     // Scalar reference hash of lane 0; this is the value returned in state.
+     sph_blake256_init( &ctx2 );
+     sph_blake256( &ctx2, sin0, 180 );
+     sph_blake256_close( &ctx2, hash );
+
+     blake256_4way_init( &ctx );
+     blake256_4way( &ctx, input, 180 );
+     blake256_4way_close( &ctx, vhash );
+
+     m128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+/*
+        for ( int i = 0; i < 8; i++ )
+          if ( hash[i] != hash0[i] )
+            printf(" hash mismatch, i = %u\n",i);
+
+printf("hash:  %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
+                             *(hash+2), *(hash+3) );
+printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
+                             *(hash0+2), *(hash0+3) );
+printf("\n");
+*/
+     // Per-lane output of the 4-way result, currently disabled:
+//     memcpy( state,           hash0, 32 );
+//     memcpy( (char*)state+32, hash1, 32 );
+//     memcpy( (char*)state+64, hash2, 32 );
+//     memcpy( (char*)state+96, hash3, 32 );
+     memcpy( state, hash, 32 );
+}
+
+// Scan nonces four per iteration starting at work->data[DECRED_NONCE_INDEX].
+// Only lane 0 is tested for now: decred_hash_4way() returns a single hash and
+// the checks for lanes 1-3 below are still disabled.
+// Returns the number of nonces found; *hashes_done = n - first_nonce + 1.
+int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done)
+{
+   uint32_t vdata[45*4] __attribute__ ((aligned (64)));
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
+   uint32_t n = first_nonce;
+   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+
+   // Only consumed by the disabled midstate path of decred_hash_4way().
+   ctx_midstate_done = false;
+
+   // The same header goes into all four lanes; only the nonce differs.
+   m128_interleave_4x32( vdata, pdata, pdata, pdata, pdata, 180*8 );
+
+   // In the 4x32 layout the four lanes of header word i occupy
+   // vdata[4*i .. 4*i+3], so lane k's nonce is noncep[k].
+   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      * noncep    = n;
+      *(noncep+1) = n+1;
+      *(noncep+2) = n+2;
+      *(noncep+3) = n+3;
+
+      decred_hash_4way( hash, vdata );
+
+      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      {
+          work_set_target_ratio( work, hash );
+          found[0] = true;
+          num_found++;
+          nonces[0] = n;
+          pdata[DECRED_NONCE_INDEX] = n;
+      }
+      // Lanes 1-3: disabled while decred_hash_4way() outputs one hash only.
+/*      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
+      {
+          work_set_target_ratio( work, hash+8 );
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+      }
+      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
+      {
+          work_set_target_ratio( work, hash+16 );
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+      }
+      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
+      {
+          work_set_target_ratio( work, hash+24 );
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+      }
+*/
+      n += 4;
+  } while ( (num_found == 0) && (n < max_nonce)
+            && !work_restart[thr_id].restart );
+
+  *hashes_done = n - first_nonce + 1;
+  return num_found;
+}
+
+#endif
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -0,0 +1,176 @@
+#include "decred-gate.h"
+#include <unistd.h>
+#include <memory.h>
+#include <string.h>
+
+uint32_t *decred_get_nonceptr( uint32_t *work_data )
+{
+   return &work_data[ DECRED_NONCE_INDEX ]; // address of the nonce word inside work_data
+}
+
+// Compute the network difficulty from the nbits word of the work header.
+// sample for diff 43.281 : 1c05ea29
+double decred_calc_network_diff( struct work* work )
+{
+   // todo: endian reversed on longpoll could be zr5 specific...
+   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
+   uint32_t bits = ( nbits & 0xffffff );
+   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
+   int m;
+   double d = (double)0x0000ffff / (double)bits;
+   for ( m = shift; m < 29; m++ )
+       d *= 256.0;
+   for ( m = 29; m < shift; m++ )
+       d /= 256.0;
+   if ( shift == 28 )
+       d *= 256.0; // testnet
+   if ( opt_debug_diff )
+       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
+                           shift, bits );
+   return d;   // the value computed above, not the global net_diff
+}
+
+// Tag the work with random extradata and report a newly seen block height.
+void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
+{
+   work->data[ DECRED_XNONCE_INDEX ] = (uint32_t)rand() * 4; // unsigned: no int overflow
+   work->height = work->data[32];
+   if (!have_longpoll && work->height > *net_blocks + 1)
+   {
+      char netinfo[64] = { 0 };
+      if (opt_showdiff && net_diff > 0.)
+      {
+         if (net_diff != work->targetdiff)
+            snprintf( netinfo, sizeof(netinfo), ", diff %.3f, target %.1f",
+                      net_diff, work->targetdiff );
+         else
+            snprintf( netinfo, sizeof(netinfo), ", diff %.3f", net_diff );
+       }
+       applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
+                       netinfo);
+       *net_blocks = work->height - 1;
+   }
+}
+
+void decred_be_build_stratum_request( char *req, struct work *work,
+                                      struct stratum_ctx *sctx )
+{
+   unsigned char *xnonce2str;
+   uint32_t ntime, nonce;
+   char ntimestr[9], noncestr[9];
+   // Build the mining.submit JSON in req from ntime, nonce and extranonce.
+   be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
+   be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
+   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
+   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
+   xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
+                                     sctx->xnonce1_size );
+   snprintf( req, JSON_BUF_LEN,
+        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
+         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
+   free(xnonce2str);   // release the hex string returned by abin2hex
+}
+#define min(a,b) ((a)>(b) ? (b) :(a))
+
+// Assemble the Decred block header in g_work->data from the stratum job:
+// version, prevhash, merkle root and extra header taken from the coinbase,
+// then xnonce1, random extradata and the 4-byte header suffix.
+void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
+{
+   uchar merkle_root[64] = { 0 };
+   uint32_t extraheader[32] = { 0 };
+   int headersize = 0;
+   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
+   size_t t;
+   int i;
+
+   // getwork over stratum, getwork merkle + header passed in coinb1
+   // NOTE(review): coinbase_size is assumed to be at least 36 bytes here.
+   memcpy(merkle_root, sctx->job.coinbase, 32);
+   // Compare as int: against a size_t a short coinbase wraps to a huge value.
+   headersize = min( (int)sctx->job.coinbase_size - 32,
+                     (int)sizeof(extraheader) );
+   if ( headersize < 0 ) headersize = 0;
+   memcpy( extraheader, &sctx->job.coinbase[32], headersize );
+
+   // Increment extranonce2 
+   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
+
+   // Assemble block header 
+   memset( g_work->data, 0, sizeof(g_work->data) );
+   g_work->data[0] = le32dec( sctx->job.version );
+   for ( i = 0; i < 8; i++ )
+      g_work->data[1 + i] = swab32(
+                              le32dec( (uint32_t *) sctx->job.prevhash + i ) );
+   for ( i = 0; i < 8; i++ )
+      g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
+
+   for ( i = 0; i < headersize/4; i++ ) // header
+      g_work->data[17 + i] = extraheader[i];
+
+   // extradata
+   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
+      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
+   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
+      g_work->data[i] = 0;
+   // unsigned arithmetic: (rand()*4) << 8 computed as int could overflow
+   g_work->data[37] = ( (uint32_t)rand() * 4 ) << 8;
+   // block header suffix from coinb2 (stake version)
+   memcpy( &g_work->data[44],
+           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
+   sctx->bloc_height = g_work->data[32];
+}
+
+#undef min
+// Return true when work may be hashed; on success the extradata words are updated.
+bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
+                           int thr_id )
+{
+   if ( have_stratum && strcmp(stratum->job.job_id, work->job_id)  )
+      // stratum job id differs from the work's: need to regen g_work..
+      return false;
+   if ( have_stratum && !work->data[0] && !opt_benchmark )
+   {
+      sleep(1);   // work->data[0] is still zero: wait a second before retrying
+      return false;
+   }
+   // extradata: prevent duplicates (bump one word, OR the thread id into the next)
+   work->data[ DECRED_XNONCE_INDEX     ] += 1;
+   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;
+   return true;
+}
+
+// Fill the algo gate for Decred; picks the 4-way functions under DECRED_4WAY.
+bool register_decred_algo( algo_gate_t* gate )
+{
+#if defined(DECRED_4WAY)
+  gate->optimizations = SSE2_OPT | AVX_OPT;
+  gate->scanhash  = (void*)&scanhash_decred_4way;
+  gate->hash      = (void*)&decred_hash_4way;
+#else
+  gate->optimizations = SSE2_OPT;
+  gate->scanhash  = (void*)&scanhash_decred;
+  gate->hash      = (void*)&decred_hash;
+#endif
+
+  // Decred-specific overrides: nonce location, stratum request and
+  // extraheader builders, the std_be_* getwork handlers and the header
+  // word indices / sizes from decred-gate.h.
+  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
+  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
+  gate->display_extra_data    = (void*)&decred_decode_extradata;
+  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
+  gate->build_extraheader     = (void*)&decred_build_extraheader;
+  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
+  gate->nbits_index           = DECRED_NBITS_INDEX;
+  gate->ntime_index           = DECRED_NTIME_INDEX;
+  gate->nonce_index           = DECRED_NONCE_INDEX;
+  gate->work_data_size        = DECRED_DATA_SIZE;
+  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE;
+  allow_mininginfo            = false;   // global switch, turned off for this algo
+  have_gbt                    = false;   // global switch, turned off for this algo
+  return true;
+}
+
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -0,0 +1,36 @@
+#ifndef __DECRED_GATE_H__
+#define __DECRED_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#define DECRED_NBITS_INDEX 29
+#define DECRED_NTIME_INDEX 34
+#define DECRED_NONCE_INDEX 35
+#define DECRED_XNONCE_INDEX 36
+#define DECRED_DATA_SIZE 192
+#define DECRED_WORK_COMPARE_SIZE 140
+#define DECRED_MIDSTATE_LEN 128
+
+#if defined (__AVX2__) 
+//void blakehash_84way(void *state, const void *input);
+//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+//                         uint64_t *hashes_done );
+#endif
+
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define DECRED_4WAY
+#endif
+
+#if defined (DECRED_4WAY)
+void decred_hash_4way(void *state, const void *input);
+int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+#endif
+
+void decred_hash( void *state, const void *input );
+int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+#endif
+
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "decred-gate.h"
 #include "sph_blake.h"

 #include <string.h>
@@ -15,33 +14,33 @@
 #define max(a,b) (a<b ? b : a)
 #endif
 */
-
+/*
 #define DECRED_NBITS_INDEX 29
 #define DECRED_NTIME_INDEX 34
 #define DECRED_NONCE_INDEX 35
 #define DECRED_XNONCE_INDEX 36
 #define DECRED_DATA_SIZE 192
 #define DECRED_WORK_COMPARE_SIZE 140
-
+*/
 static __thread sph_blake256_context blake_mid;
 static __thread bool ctx_midstate_done = false;

 void decred_hash(void *state, const void *input)
 {
-        #define MIDSTATE_LEN 128
+//        #define MIDSTATE_LEN 128
        sph_blake256_context ctx __attribute__ ((aligned (64)));

        uint8_t *ending = (uint8_t*) input;
-        ending += MIDSTATE_LEN;
+        ending += DECRED_MIDSTATE_LEN;

        if (!ctx_midstate_done) {
                sph_blake256_init(&blake_mid);
-                sph_blake256(&blake_mid, input, MIDSTATE_LEN);
+                sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
                ctx_midstate_done = true;
        }
        memcpy(&ctx, &blake_mid, sizeof(blake_mid));

-        sph_blake256(&ctx, ending, (180 - MIDSTATE_LEN));
+        sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
        sph_blake256_close(&ctx, state);
 }

@@ -60,9 +59,9 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

-        #define DCR_NONCE_OFT32 35
+//        #define DCR_NONCE_OFT32 35

-        const uint32_t first_nonce = pdata[DCR_NONCE_OFT32];
+        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];

        uint32_t n = first_nonce;
@@ -82,7 +81,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t

        do {
                //be32enc(&endiandata[DCR_NONCE_OFT32], n);
-                endiandata[DCR_NONCE_OFT32] = n;
+                endiandata[DECRED_NONCE_INDEX] = n;
                decred_hash(hash32, endiandata);

                if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
@@ -93,7 +92,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
                        applog_hash(ptarget);
                        applog_compare_hash(hash32, ptarget);
 #endif
-                        pdata[DCR_NONCE_OFT32] = n;
+                        pdata[DECRED_NONCE_INDEX] = n;
                        return 1;
                }

@@ -102,24 +101,17 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
        } while (n < max_nonce && !work_restart[thr_id].restart);

        *hashes_done = n - first_nonce + 1;
-        pdata[DCR_NONCE_OFT32] = n;
+        pdata[DECRED_NONCE_INDEX] = n;
        return 0;
 }

+/*
 uint32_t *decred_get_nonceptr( uint32_t *work_data )
 {
   return &work_data[ DECRED_NONCE_INDEX ];
 }

-// does decred need a custom stratum_get_g_work to fix nicehash
-//  bad extranonce2 size?
-// 
-// does decred need a custom init_nonce?
-// does it need to increment nonce, seems not because gen_work_now always
-// returns true
-
 double decred_calc_network_diff( struct work* work )
-//void decred_calc_network_diff( struct work* work )
 {
   // sample for diff 43.281 : 1c05ea29
   // todo: endian reversed on longpoll could be zr5 specific...
@@ -181,7 +173,7 @@ void decred_be_build_stratum_request( char *req, struct work *work,
         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
   free(xnonce2str);
 }
-
+*/
 /*
 // data shared between gen_merkle_root and build_extraheader.
 __thread uint32_t decred_extraheader[32] = { 0 };
@@ -197,7 +189,7 @@ void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
 }
 */

-
+/*
 #define min(a,b) (a>b ? (b) :(a))

 void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -235,11 +227,15 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
   for ( i = 0; i < headersize/4; i++ ) // header
      g_work->data[17 + i] = extraheader[i];
   // extradata
+
   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
      g_work->data[i] = 0;
   g_work->data[37] = (rand()*4) << 8;
+   // block header suffix from coinb2 (stake version)
+   memcpy( &g_work->data[44],
+           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
   sctx->bloc_height = g_work->data[32];
   //applog_hex(work->data, 180);
   //applog_hex(&work->data[36], 36);
@@ -274,6 +270,8 @@ bool register_decred_algo( algo_gate_t* gate )
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
  gate->display_extra_data    = (void*)&decred_decode_extradata;
  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->build_extraheader     = (void*)&decred_build_extraheader;
  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
  gate->nbits_index           = DECRED_NBITS_INDEX;
@@ -285,4 +283,4 @@ bool register_decred_algo( algo_gate_t* gate )
  have_gbt                    = false;
  return true;
 }
-
+*/
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -0,0 +1,206 @@
+#include "pentablake-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake-hash-4way.h"
+#include "sph_blake.h"
+
+//#define DEBUG_ALGO
+
+#ifdef PENTABLAKE_4WAY
+
+// 4-way pentablake: five chained Blake-512 passes over 4x64-interleaved input
+// (80 bytes per lane). The first 32 bytes of each lane's final hash are
+// stored at output offsets 0, 32, 64 and 96.
+extern void pentablakehash_4way( void *output, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake512_4way_context ctx;
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, input, 80 );
+     blake512_4way_close( &ctx, vhash );
+
+#ifdef DEBUG_ALGO
+     // Cross-check lane 0 of the first pass against the scalar sph reference.
+     unsigned char _ALIGN(32) hash[128];
+     uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
+     m256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
+     sph_blake512_context ctx2_blake;
+     sph_blake512_init(&ctx2_blake);
+     sph_blake512(&ctx2_blake, sin0, 80);
+     sph_blake512_close(&ctx2_blake, (void*) hash);
+     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     uint64_t* hash64 = (uint64_t*)hash;
+     for( unsigned i = 0; i < 8; i++ )
+     {
+        if ( hash0[i] != hash64[i] )
+           printf("hash mismatch %u\n",i);
+     }
+#endif
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_close( &ctx, vhash );
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_close( &ctx, vhash );
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_close( &ctx, vhash );
+
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, vhash, 64 );
+     blake512_4way_close( &ctx, vhash );
+
+     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     memcpy( output,               hash0, 32 );
+     memcpy( (char*)output + 32,   hash1, 32 );
+     memcpy( (char*)output + 64,   hash2, 32 );
+     memcpy( (char*)output + 96,   hash3, 32 );
+/*
+     uint64_t sin0[10] __attribute__ ((aligned (64)));
+     uint64_t sin1[10] __attribute__ ((aligned (64)));
+     uint64_t sin2[10] __attribute__ ((aligned (64)));
+     uint64_t sin3[10] __attribute__ ((aligned (64)));
+
+	sph_blake512_context     ctx_blake;
+
+	sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, input, 80);
+	sph_blake512_close(&ctx_blake, hash);
+
+        sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, hash, 64);
+	sph_blake512_close(&ctx_blake, hash);
+
+        sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, hash, 64);
+	sph_blake512_close(&ctx_blake, hash);
+
+        sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, hash, 64);
+	sph_blake512_close(&ctx_blake, hash);
+
+        sph_blake512_init(&ctx_blake);
+	sph_blake512(&ctx_blake, hash, 64);
+	sph_blake512_close(&ctx_blake, hash);
+
+	memcpy(output, hash, 32);
+*/
+}
+
+// Scan nonces four per iteration with the 4-way pentablake hash.
+// On a hit the matching lane's nonce is stored in work->nonces[lane], its
+// work->nfound[lane] flag is set and 1 is returned; otherwise 0 is returned.
+int scanhash_pentablake_4way( int thr_id, struct work *work,
+                              uint32_t max_nonce, uint64_t *hashes_done )
+{
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[32] __attribute__ ((aligned (64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;   // lane 0 starts at the first assigned nonce
+    const uint32_t Htarg = ptarget[7];
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    int num_found = 0;
+    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+    uint32_t *noncep1 = vdata + 75;
+    uint32_t *noncep2 = vdata + 77;
+    uint32_t *noncep3 = vdata + 79;
+
+    uint64_t htmax[] = {
+	0,
+	0xF,
+	0xFF,
+	0xFFF,
+	0xFFFF,
+	0x10000000
+    };
+    uint32_t masks[] = {
+ 	0xFFFFFFFF,
+	0xFFFFFFF0,
+	0xFFFFFF00,
+	0xFFFFF000,
+	0xFFFF0000,
+	0
+    };
+
+	// we need bigendian data...
+    swab32_array( endiandata, pdata, 20 );
+
+    uint64_t *edata = (uint64_t*)endiandata;
+    m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+    for ( int m=0; m < 6; m++ )
+    {
+        if ( Htarg <= htmax[m] )
+        {
+           uint32_t mask = masks[m];
+           do {
+              found[0] = found[1] = found[2] = found[3] = false;
+              be32enc( noncep0, n   );
+              be32enc( noncep1, n+1 );
+              be32enc( noncep2, n+2 );
+              be32enc( noncep3, n+3 );
+
+              pentablakehash_4way( hash, vdata );
+
+              // return immediately on nonce found, only one submit
+              if ( ( !(hash[7] & mask) ) && fulltest( hash, ptarget ) )
+              {
+                  found[0] = true;
+                  num_found++;
+                  nonces[0] = n;
+                  pdata[19] = n;
+                  *hashes_done = n - first_nonce + 1;
+                  return 1;
+              }
+              if ( (! ((hash+8)[7] & mask) ) && fulltest( hash+8, ptarget ) )
+              {
+                  found[1] = true;
+                  num_found++;
+                  nonces[1] = n+1;   // lane 1 hashed nonce n+1
+                  *hashes_done = n - first_nonce + 1;
+                  return 1;
+              }
+              if ( ( !((hash+16)[7] & mask) ) && fulltest( hash+16, ptarget ) )
+              {
+                  found[2] = true;
+                  num_found++;
+                  nonces[2] = n+2;   // lane 2 hashed nonce n+2
+                  *hashes_done = n - first_nonce + 1;
+                  return 1;
+              }
+              if ( ( !((hash+24)[7] & mask) ) && fulltest( hash+24, ptarget ) )
+              {
+                  found[3] = true;
+                  num_found++;
+                  nonces[3] = n+3;   // lane 3 hashed nonce n+3
+                  *hashes_done = n - first_nonce + 1;
+                  return 1;
+              }
+              n += 4;
+
+           } while (n < max_nonce && !work_restart[thr_id].restart);
+           break;
+        }
+    }
+
+    *hashes_done = n - first_nonce + 1;
+    pdata[19] = n;
+    return 0;
+} 
+
+#endif
--- a/algo/blake/pentablake-gate.c
+++ b/algo/blake/pentablake-gate.c
@@ -0,0 +1,16 @@
+#include "pentablake-gate.h"
+// Install the pentablake scanhash/hash pair (4-way under PENTABLAKE_4WAY).
+bool register_pentablake_algo( algo_gate_t* gate )
+{
+#if defined (PENTABLAKE_4WAY)
+    gate->optimizations = SSE2_OPT | AVX2_OPT;
+    gate->scanhash  = (void*)&scanhash_pentablake_4way;
+    gate->hash      = (void*)&pentablakehash_4way;
+#else
+    gate->scanhash  = (void*)&scanhash_pentablake;
+    gate->hash      = (void*)&pentablakehash;
+#endif
+    gate->get_max64 = (void*)&get_max64_0x3ffff;
+    return true;
+}
+
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -0,0 +1,21 @@
+#ifndef __PENTABLAKE_GATE_H__
+#define __PENTABLAKE_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(FOUR_WAY) && defined(__AVX2__)  // 4x64 Blake-512 path uses 256-bit integer ops
+  #define PENTABLAKE_4WAY
+#endif
+
+#if defined(PENTABLAKE_4WAY)
+void pentablakehash_4way( void *state, const void *input );
+int scanhash_pentablake_4way( int thr_id, struct work *work,
+                              uint32_t max_nonce, uint64_t *hashes_done );
+#endif
+
+void pentablakehash( void *state, const void *input );
+int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "pentablake-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -111,11 +110,3 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 } 

-bool register_pentablake_algo( algo_gate_t* gate )
-{
-    gate->scanhash  = (void*)&scanhash_pentablake;
-    gate->hash      = (void*)&pentablakehash;
-    gate->get_max64 = (void*)&get_max64_0x3ffff;
-    return true;
-};
-
--- a/algo/blake/sph_blake.c
+++ b/algo/blake/sph_blake.c
@@ -813,6 +813,7 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)

 	buf = sc->buf;
 	ptr = sc->ptr;
+
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
@@ -890,9 +891,9 @@ blake32_close(sph_blake_small_context *sc,
 		sph_enc32be_aligned(u.buf + 60, tl);
 		blake32(sc, u.buf, 64);
 	}
-	out = dst;
-	for (k = 0; k < out_size_w32; k ++)
-		sph_enc32be(out + (k << 2), sc->H[k]);
+        out = dst;
+        for (k = 0; k < out_size_w32; k ++)
+                sph_enc32be(out + (k << 2), sc->H[k]);
 }

 #if SPH_64
@@ -982,9 +983,11 @@ blake64_close(sph_blake_big_context *sc,
 			u.buf[111] |= 1;
 		sph_enc64be_aligned(u.buf + 112, th);
 		sph_enc64be_aligned(u.buf + 120, tl);
+
 		blake64(sc, u.buf + ptr, 128 - ptr);
 	} else {
 		memset(u.buf + ptr + 1, 0, 127 - ptr);
+
 		blake64(sc, u.buf + ptr, 128 - ptr);
 		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
 		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
@@ -993,6 +996,7 @@ blake64_close(sph_blake_big_context *sc,
 			u.buf[111] = 1;
 		sph_enc64be_aligned(u.buf + 112, th);
 		sph_enc64be_aligned(u.buf + 120, tl);
+
 		blake64(sc, u.buf, 128);
 	}
 	out = dst;
--- a/algo/bmw/bmw256.c
+++ b/algo/bmw/bmw256.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -2,7 +2,6 @@
 // Distributed under the MIT/X11 software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.

-#include "miner.h"
 #include "algo-gate-api.h"

 #if defined(__arm__) || defined(_MSC_VER)
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -109,43 +109,43 @@ static __thread cryptonight_ctx ctx;
 void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 {
 #ifndef NO_AES_NI
-    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+
    uint8_t ExpandedKey[256] __attribute__((aligned(64)));
+    __m128i *longoutput, *expkey, *xmminput;
    size_t i, j;
    
-    memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
+    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+    memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
+    ExpandAESKey256( ExpandedKey );
+    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    
-    __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *)ctx.long_state;
-    expkey     = (__m128i *)ExpandedKey;
-    xmminput   = (__m128i *)ctx.text;
+    longoutput = (__m128i*)ctx.long_state;
+    xmminput   = (__m128i*)ctx.text;
+    expkey     = (__m128i*)ExpandedKey;
    
-    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
-    //    aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);
-    
-    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
+    // prefetch expkey, xmminput and enough longoutput for 4 iterations
    _mm_prefetch( xmminput,     _MM_HINT_T0 );
    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
-    for ( i = 0; i < 64; i += 16 )
-    {
-       _mm_prefetch( longoutput + i,      _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i +  4, _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i +  8, _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
-    }
    _mm_prefetch( expkey,     _MM_HINT_T0 );
    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
    _mm_prefetch( expkey + 8, _MM_HINT_T0 );
-
-    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    for ( i = 0; i < 64; i += 16 )
    {
-        // prefetch 4 loops ahead,
+        __builtin_prefetch( longoutput + i,      1, 0 );
+        __builtin_prefetch( longoutput + i +  4, 1, 0 );
+        __builtin_prefetch( longoutput + i +  8, 1, 0 );
+        __builtin_prefetch( longoutput + i + 12, 1, 0 );
+    }
+
+    // n-4 iterations
+    for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
+                         i += INIT_SIZE_M128I )
+    {
+        // prefetch 4 iterations ahead.
        __builtin_prefetch( longoutput + i + 64, 1, 0 );
        __builtin_prefetch( longoutput + i + 68, 1, 0 );

-	for (j = 0; j < 10; j++ )
+	for ( j = 0; j < 10; j++ )
 	{
 		xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
 		xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
@@ -165,84 +165,99 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
 	_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
    }
+    // last 4 iterations
+    for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    {
+        for ( j = 0; j < 10; j++ )
+        {
+                xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
+                xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
+                xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
+                xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
+                xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
+                xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
+                xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
+                xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
+        }
+        _mm_store_si128( &( longoutput[i  ] ), xmminput[0] );
+        _mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
+        _mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
+        _mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
+        _mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
+        _mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
+        _mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
+        _mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
+    }

-//     cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
-//                                          casti_m128i( ctx.state.k, 2 ) );
-//     cast_m128i( ctx.b ) = _mm_xor_si128( casti_m128i( ctx.state.k, 1 ),
-//                                          casti_m128i( ctx.state.k, 3 ) );
+    ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
+    ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
+    ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
+    ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];

-     ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
-     ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
-     ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
-     ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
-
-//    for (i = 0; i < 2; i++) 
-//    {
-//     ctx.a[i] = ((uint64_t *)ctx.state.k)[i] ^  ((uint64_t *)ctx.state.k)[i+4];
-//     ctx.b[i] = ((uint64_t *)ctx.state.k)[i+2] ^ ((uint64_t *)ctx.state.k)[i+6];
-//    }
-
-    __m128i b_x = _mm_load_si128((__m128i *)ctx.b);
-    uint64_t a[2] __attribute((aligned(16))), b[2] __attribute((aligned(16)));
+    uint64_t a[2] __attribute((aligned(16))),
+             b[2] __attribute((aligned(16))),
+             c[2] __attribute((aligned(16)));
    a[0] = ctx.a[0];
    a[1] = ctx.a[1];
-	
-    for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
+    __m128i b_x = _mm_load_si128( (__m128i*)ctx.b );
+    __m128i a_x = _mm_load_si128( (__m128i*)a );
+    __m128i* lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
+    __m128i c_x = _mm_load_si128( lsa );
+    uint64_t *nextblock;
+    uint64_t hi, lo;
+
+    // n-1 iterations
+    for( i = 0; __builtin_expect( i < 0x7ffff, 1 ); i++ )
    {	  
-        uint64_t c[2];
-        __builtin_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], 0, 1 );
-
-	__m128i c_x = _mm_load_si128( 
-                              (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
-	__m128i a_x = _mm_load_si128((__m128i *)a);
-	c_x = _mm_aesenc_si128(c_x, a_x);
-	_mm_store_si128((__m128i *)c, c_x);
-	
-	b_x = _mm_xor_si128(b_x, c_x);
-	_mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
-
-	uint64_t *nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
-//	uint64_t b[2];
+	c_x = _mm_aesenc_si128( c_x, a_x );
+	_mm_store_si128( (__m128i*)c, c_x );
+        b_x = _mm_xor_si128( b_x, c_x );
+        nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
+	_mm_store_si128( lsa, b_x );
 	b[0] = nextblock[0];
 	b[1] = nextblock[1];

-	{
-	  uint64_t hi, lo;
-	 // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
+        // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
+	__asm__( "mulq %3\n\t"
+	         : "=d" ( hi ),
+	           "=a" ( lo )
+	         : "%a" ( c[0] ),
+	           "rm" ( b[0] )
+		 : "cc" );

-	  __asm__("mulq %3\n\t"
-		  : "=d" (hi),
-		"=a" (lo)
-		  : "%a" (c[0]),
-		"rm" (b[0])
-		  : "cc" );
-	  
-	  a[0] += hi;
-	  a[1] += lo;
-	}
-	uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
-//        __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0];
-
-//        *dst = cast_m128i( a ); 
-	dst[0] = a[0];
-	dst[1] = a[1];
-
-//        cast_m128i( a ) = _mm_xor_si128( cast_m128i( a ), cast_m128i( b ) );
-	a[0] ^= b[0];
-	a[1] ^= b[1];
-	b_x = c_x;
-	__builtin_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], 0, 3 );
+        b_x = c_x;
+        nextblock[0] = a[0] + hi;
+        nextblock[1] = a[1] + lo;
+        a[0] = b[0] ^ nextblock[0];
+        a[1] = b[1] ^ nextblock[1];
+        lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
+        a_x = _mm_load_si128( (__m128i*)a );
+        c_x = _mm_load_si128( lsa );
    }
+    // abbreviated nth iteration
+    c_x = _mm_aesenc_si128( c_x, a_x );
+    _mm_store_si128( (__m128i*)c, c_x );
+    b_x = _mm_xor_si128( b_x, c_x );
+    nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
+    _mm_store_si128( lsa, b_x );
+    b[0] = nextblock[0];
+    b[1] = nextblock[1];
+
+    __asm__( "mulq %3\n\t"
+             : "=d" ( hi ),
+               "=a" ( lo )
+             : "%a" ( c[0] ),
+               "rm" ( b[0] )
+             : "cc" );
+
+    nextblock[0] = a[0] + hi;
+    nextblock[1] = a[1] + lo;

-    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
    ExpandAESKey256( ExpandedKey );
-    
-    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
-    //    aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
+    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    
    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
-
    _mm_prefetch( xmminput,     _MM_HINT_T0 );
    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
    for ( i = 0; i < 64; i += 16 )
@@ -256,9 +271,11 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
    _mm_prefetch( expkey + 8, _MM_HINT_T0 );

-    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    // n-4 iterations
+    for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
+                         i += INIT_SIZE_M128I )
    {
-        // stay 4 loops ahead,
+        // stay 4 iterations ahead.
        _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
        _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );

@@ -283,10 +300,34 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	    xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
        }
    }
-        
+    // last 4 iterations 
+    for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    {
+        xmminput[0] = _mm_xor_si128( longoutput[i  ], xmminput[0] );
+        xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
+        xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
+        xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
+        xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
+        xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
+        xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
+        xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
+
+        for( j = 0; j < 10; j++ )
+        {
+            xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
+            xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
+            xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
+            xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
+            xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
+            xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
+            xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
+            xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
+        }
+    }
+
    memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
    keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
-
    extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
+
 #endif
 }
--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -5,7 +5,6 @@
 // Modified for CPUminer by Lucas Jones

 #include "cpuminer-config.h"
-//#include "miner.h"
 #include "algo-gate-api.h"

 #ifndef NO_AES_NI
--- a/algo/drop.c
+++ b/algo/drop.c
@@ -32,7 +32,6 @@
 #define POK_BOOL_MASK 0x00008000
 #define POK_DATA_MASK 0xFFFF0000
 
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
@@ -248,7 +247,9 @@ bool register_drop_algo( algo_gate_t* gate )
    gate->get_new_work          = (void*)&drop_get_new_work;
    gate->set_target            = (void*)&scrypt_set_target;
    gate->build_stratum_request = (void*)&std_be_build_stratum_request;
-    gate->set_work_data_endian  = (void*)&swab_work_data;
+    gate->work_decode           = (void*)&std_be_work_decode;
+    gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
+    gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
    gate->display_extra_data    = (void*)&drop_display_pok;
    gate->work_data_size        = 80;
    gate->work_cmp_size         = 72;
--- a/algo/echo/aes_ni/vperm.h
+++ b/algo/echo/aes_ni/vperm.h
@@ -53,11 +53,12 @@ extern const unsigned int _k_aesmix4[];
 	x  = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
 	x  = _mm_xor_si128(x, t1)

+#if 0
 // compiled erroneously with 32-bit msc compiler
-	//t2 = _mm_shuffle_epi8(table[0], x);\
-	//x  = _mm_shuffle_epi8(table[1], t1);\
-	//x  = _mm_xor_si128(x, t2)
-
+	t2 = _mm_shuffle_epi8(table[0], x);\
+	x  = _mm_shuffle_epi8(table[1], t1);\
+	x  = _mm_xor_si128(x, t2)
+#endif

 // input: x
 // output: t2, t3
--- a/algo/fresh.c
+++ b/algo/fresh.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -21,7 +21,7 @@

 #include "brg_endian.h"
 #define NEED_UINT_64T
-#include "brg_types.h"
+#include "algo/sha/brg_types.h"

 /* some sizes (number of bytes) */
 #define ROWS (8)
--- a/algo/groestl/aes_ni/hash-groestl256.h
+++ b/algo/groestl/aes_ni/hash-groestl256.h
@@ -35,7 +35,7 @@ typedef crypto_uint64 u64;

 #include "brg_endian.h"
 #define NEED_UINT_64T
-#include "brg_types.h"
+#include "algo/sha/brg_types.h"

 #ifdef IACA_TRACE
  #include IACA_MARKS
--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
@@ -99,22 +98,21 @@ void groestl_set_target( struct work* work, double job_diff )
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }

-bool register_groestl_algo( algo_gate_t* gate )
+bool register_dmd_gr_algo( algo_gate_t* gate )
 {
    init_groestl_ctx();
    gate->optimizations   = SSE2_OPT | AES_OPT;
    gate->scanhash        = (void*)&scanhash_groestl;
    gate->hash            = (void*)&groestlhash;
    gate->set_target      = (void*)&groestl_set_target;
-    gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
    gate->get_max64       = (void*)&get_max64_0x3ffff;
    return true;
 };

-bool register_dmd_gr_algo( algo_gate_t* gate )
+bool register_groestl_algo( algo_gate_t* gate )
 {
-    register_groestl_algo( gate );
-    gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
+    register_dmd_gr_algo( gate );
+    gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
    return true;
 };

--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
@@ -12,11 +11,8 @@
  #include "aes_ni/hash-groestl.h"
 #endif

-#if defined __SHA__
-  #include <openssl/sha.h>
-#else
-  #include "algo/sha/sph_sha2.h"
-#endif
+#include <openssl/sha.h>
+#include "algo/sha/sph_sha2.h"

 typedef struct {
 #ifdef NO_AES_NI
@@ -24,7 +20,7 @@ typedef struct {
 #else
    hashState_groestl       groestl;
 #endif
-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_CTX         sha;
 #else
   sph_sha256_context sha;
@@ -40,7 +36,7 @@ void init_myrgr_ctx()
 #else
     init_groestl (&myrgr_ctx.groestl, 64 );
 #endif
-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_Init( &myrgr_ctx.sha );
 #else
   sph_sha256_init( &myrgr_ctx.sha );
@@ -61,7 +57,7 @@ void myriadhash( void *output, const void *input )
                               (const char*)input, 640 );
 #endif

-#if defined __SHA__
+#ifndef USE_SPH_SHA
     SHA256_Update( &ctx.sha, hash, 64 );
     SHA256_Final( (unsigned char*) hash, &ctx.sha );
 #else
@@ -108,7 +104,7 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,

 bool register_myriad_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | SHA_OPT;
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
    init_myrgr_ctx();
    gate->scanhash = (void*)&scanhash_myriad;
    gate->hash     = (void*)&myriadhash;
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
--- a/algo/heavy/heavy.c
+++ b/algo/heavy/heavy.c
@@ -2,7 +2,6 @@
 #include <openssl/sha.h>
 #include <stdint.h>

-#include "miner.h"
 #include "algo-gate-api.h"
 #include "sph_hefty1.h"
 #include "algo/keccak/sph_keccak.h"
--- a/algo/hmq1725.c
+++ b/algo/hmq1725.c
@@ -1,16 +1,12 @@
-#include "miner.h"
 #include "algo-gate-api.h"
-
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/sph_blake.h"
 #include "algo/bmw/sph_bmw.h"
 #include "algo/groestl/sph_groestl.h"
 #include "algo/jh/sph_jh.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-
 #include "algo/luffa/sph_luffa.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/shavite/sph_shavite.h"
@@ -22,12 +18,11 @@
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/sha/sph_sha2.h"
 #include "algo/haval/sph-haval.h"
-
+#include <openssl/sha.h>
 #ifndef NO_AES_NI
  #include "algo/groestl/aes_ni/hash-groestl.h"
  #include "algo/echo/aes_ni/hash_api.h"
 #endif
-
 #include "algo/luffa/sse2/luffa_for_sse2.h"
 #include "algo/cubehash/sse2/cubehash_sse2.h"
 #include "algo/simd/sse2/nist.h"
@@ -47,7 +42,11 @@ typedef struct {
  sph_fugue512_context    fugue1, fugue2;
  sph_shabal512_context   shabal1;
  sph_whirlpool_context   whirlpool1, whirlpool2, whirlpool3, whirlpool4;
+#ifndef USE_SPH_SHA
+  SHA512_CTX              sha1, sha2;
+#else
  sph_sha512_context      sha1, sha2;
+#endif
  sph_haval256_5_context  haval1, haval2;
 #ifdef NO_AES_NI
  sph_groestl512_context  groestl1, groestl2;
@@ -102,9 +101,13 @@ void init_hmq1725_ctx()
    sph_whirlpool_init(&hmq1725_ctx.whirlpool3);
    sph_whirlpool_init(&hmq1725_ctx.whirlpool4);

+#ifndef USE_SPH_SHA
+    SHA512_Init( &hmq1725_ctx.sha1 );
+    SHA512_Init( &hmq1725_ctx.sha2 );
+#else
    sph_sha512_init(&hmq1725_ctx.sha1);
    sph_sha512_init(&hmq1725_ctx.sha2);
-
+#endif
    sph_haval256_5_init(&hmq1725_ctx.haval1);
    sph_haval256_5_init(&hmq1725_ctx.haval2);

@@ -271,8 +274,13 @@ extern void hmq1725hash(void *state, const void *input)
    }
    else
    {
+#ifndef USE_SPH_SHA
+        SHA512_Update( &h_ctx.sha1, hashB, 64 );
+        SHA512_Final( (unsigned char*) hashA, &h_ctx.sha1 );
+#else
        sph_sha512 (&h_ctx.sha1, hashB, 64); //7
        sph_sha512_close(&h_ctx.sha1, hashA); //8
+#endif
    }

 #ifdef NO_AES_NI
@@ -283,8 +291,13 @@ extern void hmq1725hash(void *state, const void *input)
                               (const char*)hashA, 512 );
 #endif

+#ifndef USE_SPH_SHA
+    SHA512_Update( &h_ctx.sha2, hashB, 64 );
+    SHA512_Final( (unsigned char*) hashA, &h_ctx.sha2 );
+#else
    sph_sha512 (&h_ctx.sha2, hashB, 64); //2 
    sph_sha512_close(&h_ctx.sha2, hashA); //3 
+#endif

    if ( hashA[0] & mask ) //4
    {
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -1,10 +1,7 @@
 #include <memory.h>
 #include <stdlib.h>

-#include "miner.h"
-//#include "algo-gate-api.h"
 #include "hodl-gate.h"
-//#include "hodl.h"
 #include "hodl-wolf.h"

 #define HODL_NSTARTLOC_INDEX 20
@@ -97,13 +94,7 @@ bool hodl_do_this_thread( int thr_id )
 int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done )
 {
-#ifdef NO_AES_NI
-  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
-  return false;
-//  GetPsuedoRandomData( hodl_scratchbuf, work->data, thr_id );
-//  pthread_barrier_wait( &hodl_barrier );
-//  return scanhash_hodl( thr_id, work, max_nonce, hashes_done );
-#else
+#ifndef NO_AES_NI
  GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
  pthread_barrier_wait( &hodl_barrier );
  return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
@@ -112,6 +103,10 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,

 bool register_hodl_algo( algo_gate_t* gate )
 {
+#ifdef NO_AES_NI
+  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
+  return false;
+#endif
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
  gate->optimizations         = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
--- a/algo/hodl/sha512_avx.c
+++ b/algo/hodl/sha512_avx.c
@@ -4,6 +4,11 @@
 //Dependencies
 #include <string.h>
 #include <stdlib.h>
+
+#ifdef __FreeBSD__
+#include <sys/endian.h>
+#endif 
+
 #include "tmmintrin.h"
 #include "smmintrin.h"

--- a/algo/hodl/sha512_avx2.c
+++ b/algo/hodl/sha512_avx2.c
@@ -3,6 +3,11 @@
 //Dependencies
 #include <string.h>
 #include <stdlib.h>
+
+#ifdef __FreeBSD__
+#include <sys/endian.h>
+#endif 
+
 #include "tmmintrin.h"
 #include "smmintrin.h"
 #include "immintrin.h"
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -0,0 +1,639 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef __AVX2__
+
+#include <stddef.h>
+#include <string.h>
+
+#include "jh-hash-4way.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH   1
+#endif
+
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64   1
+#endif
+
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness; we use some
+ * byte-swapping macros for that.
+ */
+
+#if SPH_LITTLE_ENDIAN
+
+#if SPH_64
+#define C64e(x)     ((SPH_C64(x) >> 56) \
+                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
+                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
+                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
+                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
+                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
+                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
+                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
+#define dec64e_aligned   sph_dec64le_aligned
+#define enc64e           sph_enc64le
+#endif
+
+#else
+
+#if SPH_64
+#define C64e(x)     SPH_C64(x)
+#define dec64e_aligned   sph_dec64be_aligned
+#define enc64e           sph_enc64be
+#endif
+
+#endif
+
+#define Sb(x0, x1, x2, x3, c) \
+do { /* vector form of the scalar Sb kept in the comment that follows */ \
+    const __m256i cc = _mm256_set1_epi64x( (c) ); /* c in every 64-bit lane */ \
+    x3 = mm256_bitnot( x3 ); \
+    x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
+    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); /* tmp: caller scope */ \
+    x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
+    x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
+    x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
+    x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
+    x2 = _mm256_xor_si256( x2, tmp ); \
+} while (0)
+
+/*
+#define Sb(x0, x1, x2, x3, c)   do { \
+		x3 = ~x3; \
+		x0 ^= (c) & ~x2; \
+		tmp = (c) ^ (x0 & x1); \
+		x0 ^= x2 & x3; \
+		x3 ^= ~x1 & x2; \
+		x1 ^= x0 & x2; \
+		x2 ^= x0 & ~x3; \
+		x0 ^= x1 | x3; \
+		x3 ^= x1 & x2; \
+		x1 ^= tmp & x0; \
+		x2 ^= tmp; \
+	} while (0)
+*/
+
+#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
+do { /* vector form of the scalar Lb kept in the comment that follows */ \
+    x4 = _mm256_xor_si256( x4, x1 ); /* x4..x7 are updated from x0..x3 */ \
+    x5 = _mm256_xor_si256( x5, x2 ); \
+    x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
+    x7 = _mm256_xor_si256( x7, x0 ); \
+    x0 = _mm256_xor_si256( x0, x5 ); /* x0..x3 then use the new x4..x7 */ \
+    x1 = _mm256_xor_si256( x1, x6 ); \
+    x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
+    x3 = _mm256_xor_si256( x3, x4 ); \
+} while (0)
+
+
+/*
+#define Lb(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		x4 ^= x1; \
+		x5 ^= x2; \
+		x6 ^= x3 ^ x0; \
+		x7 ^= x0; \
+		x0 ^= x5; \
+		x1 ^= x6; \
+		x2 ^= x7 ^ x4; \
+		x3 ^= x4; \
+	} while (0)
+*/
+
+#if SPH_JH_64
+
+static const sph_u64 C[] = {
+	C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
+	C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
+	C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a),
+	C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231),
+	C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410),
+	C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc),
+	C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0),
+	C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3),
+	C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce),
+	C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23),
+	C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8),
+	C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197),
+	C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95),
+	C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214),
+	C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80),
+	C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4),
+	C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989),
+	C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36),
+	C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7),
+	C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f),
+	C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727),
+	C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b),
+	C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e),
+	C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062),
+	C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984),
+	C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5),
+	C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2),
+	C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f),
+	C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465),
+	C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a),
+	C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1),
+	C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf),
+	C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48),
+	C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0),
+	C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134),
+	C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a),
+	C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff),
+	C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6),
+	C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae),
+	C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567),
+	C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a),
+	C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518),
+	C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446),
+	C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e),
+	C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee),
+	C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001),
+	C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779),
+	C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83),
+	C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a),
+	C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef),
+	C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d),
+	C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65),
+	C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a),
+	C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c),
+	C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d),
+	C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71),
+	C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc),
+	C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0),
+	C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c),
+	C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f),
+	C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751),
+	C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad),
+	C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56),
+	C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6),
+	C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a),
+	C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163),
+	C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826),
+	C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f),
+	C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30),
+	C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a),
+	C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3),
+	C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505),
+	C64e(0xb17681d913326cce), C64e(0x3c175284f805a262),
+	C64e(0xf42bcbb378471547), C64e(0xff46548223936a48),
+	C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e),
+	C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e),
+	C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd),
+	C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7),
+	C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be),
+	C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de),
+	C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9),
+	C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a),
+	C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
+	C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
+};
+
+#define Ceven_hi(r)   (C[((r) << 2) + 0])  /* C holds 4 words per round r: */
+#define Ceven_lo(r)   (C[((r) << 2) + 1])  /* even hi, even lo,            */
+#define Codd_hi(r)    (C[((r) << 2) + 2])  /* odd hi, odd lo.              */
+#define Codd_lo(r)    (C[((r) << 2) + 3])
+
+#define S(x0, x1, x2, x3, cb, r)   do { /* cb is Ceven_ or Codd_ */ \
+		Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
+		Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
+	} while (0)
+
+#define L(x0, x1, x2, x3, x4, x5, x6, x7)   do { /* Lb on h, then on l */ \
+		Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
+			x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
+		Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
+			x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
+	} while (0)
+
+
+#define Wz(x, c, n)   do { /* x = ((x >> n) & c) | ((x & c) << n), h and l */ \
+   __m256i t = _mm256_slli_epi64( _mm256_and_si256(x ## h, (c)), (n) ); \
+   x ## h = _mm256_or_si256( _mm256_and_si256( \
+                                _mm256_srli_epi64(x ## h, (n)), (c)), t ); \
+   t = _mm256_slli_epi64( _mm256_and_si256(x ## l, (c)), (n) ); \
+   x ## l = _mm256_or_si256( _mm256_and_si256( \
+                                _mm256_srli_epi64(x ## l, (n)), (c)), t ); \
+} while (0)
+
+
+/*
+#define Wz(x, c, n)   do { \
+		sph_u64 t = (x ## h & (c)) << (n); \
+		x ## h = ((x ## h >> (n)) & (c)) | t; \
+		t = (x ## l & (c)) << (n); \
+		x ## l = ((x ## l >> (n)) & (c)) | t; \
+	} while (0)
+*/
+
+#define W0(x) /* Wz with mask 0x55.. and shift 1 */ \
+   Wz( x, _mm256_set1_epi64x( 0x5555555555555555 ),  1 )
+#define W1(x) /* Wz with mask 0x33.. and shift 2 */ \
+   Wz( x, _mm256_set1_epi64x( 0x3333333333333333 ),  2 )
+#define W2(x) /* Wz with mask 0x0F.. and shift 4 */ \
+   Wz( x, _mm256_set1_epi64x( 0x0F0F0F0F0F0F0F0F ),  4 )
+#define W3(x) /* Wz with mask 0x00FF.. and shift 8 */ \
+   Wz( x, _mm256_set1_epi64x( 0x00FF00FF00FF00FF ),  8 )
+#define W4(x) /* Wz with mask 0x0000FFFF.. and shift 16 */ \
+   Wz( x, _mm256_set1_epi64x( 0x0000FFFF0000FFFF ), 16 )
+#define W5(x) /* Wz with mask 0x00000000FFFFFFFF and shift 32 */ \
+   Wz( x, _mm256_set1_epi64x( 0x00000000FFFFFFFF ), 32 )
+#define W6(x) \
+do { /* exchange the h and l words of x */ \
+   __m256i t = x ## h; \
+   x ## h = x ## l; \
+   x ## l = t; \
+} while (0)
+
+/*
+#define W0(x)   Wz(x, SPH_C64(0x5555555555555555),  1)
+#define W1(x)   Wz(x, SPH_C64(0x3333333333333333),  2)
+#define W2(x)   Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F),  4)
+#define W3(x)   Wz(x, SPH_C64(0x00FF00FF00FF00FF),  8)
+#define W4(x)   Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
+#define W5(x)   Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
+#define W6(x)   do { \
+		sph_u64 t = x ## h; \
+		x ## h = x ## l; \
+		x ## l = t; \
+	} while (0)
+*/
+
+#define DECL_STATE /* locals filled by READ_STATE, stored by WRITE_STATE */ \
+	__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
+	__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
+	__m256i tmp; /* scratch assigned inside Sb */
+
+#define READ_STATE(state)   do { /* H[2i] -> h<i>h, H[2i+1] -> h<i>l */ \
+		h0h = (state)->H[ 0]; \
+		h0l = (state)->H[ 1]; \
+		h1h = (state)->H[ 2]; \
+		h1l = (state)->H[ 3]; \
+		h2h = (state)->H[ 4]; \
+		h2l = (state)->H[ 5]; \
+		h3h = (state)->H[ 6]; \
+		h3l = (state)->H[ 7]; \
+		h4h = (state)->H[ 8]; \
+		h4l = (state)->H[ 9]; \
+		h5h = (state)->H[10]; \
+		h5l = (state)->H[11]; \
+		h6h = (state)->H[12]; \
+		h6l = (state)->H[13]; \
+		h7h = (state)->H[14]; \
+		h7l = (state)->H[15]; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { /* h<i>h -> H[2i], h<i>l -> H[2i+1] */ \
+		(state)->H[ 0] = h0h; \
+		(state)->H[ 1] = h0l; \
+		(state)->H[ 2] = h1h; \
+		(state)->H[ 3] = h1l; \
+		(state)->H[ 4] = h2h; \
+		(state)->H[ 5] = h2l; \
+		(state)->H[ 6] = h3h; \
+		(state)->H[ 7] = h3l; \
+		(state)->H[ 8] = h4h; \
+		(state)->H[ 9] = h4l; \
+		(state)->H[10] = h5h; \
+		(state)->H[11] = h5l; \
+		(state)->H[12] = h6h; \
+		(state)->H[13] = h6l; \
+		(state)->H[14] = h7h; \
+		(state)->H[15] = h7l; \
+	} while (0)
+
+#define INPUT_BUF1 /* declares m0h..m3l from buf[0..7], XORs them into h0..h3 */ \
+	__m256i m0h = buf[0]; \
+	__m256i m0l = buf[1]; \
+	__m256i m1h = buf[2]; \
+	__m256i m1l = buf[3]; \
+	__m256i m2h = buf[4]; \
+	__m256i m2l = buf[5]; \
+	__m256i m3h = buf[6]; \
+	__m256i m3l = buf[7]; \
+        h0h = _mm256_xor_si256( h0h, m0h ); \
+        h0l = _mm256_xor_si256( h0l, m0l ); \
+        h1h = _mm256_xor_si256( h1h, m1h ); \
+        h1l = _mm256_xor_si256( h1l, m1l ); \
+        h2h = _mm256_xor_si256( h2h, m2h ); \
+        h2l = _mm256_xor_si256( h2l, m2l ); \
+        h3h = _mm256_xor_si256( h3h, m3h ); \
+        h3l = _mm256_xor_si256( h3l, m3l );
+
+#define INPUT_BUF2 /* XOR the m0h..m3l words declared by INPUT_BUF1 into h4..h7 */ \
+   h4h = _mm256_xor_si256( h4h, m0h ); \
+   h4l = _mm256_xor_si256( h4l, m0l ); \
+   h5h = _mm256_xor_si256( h5h, m1h ); \
+   h5l = _mm256_xor_si256( h5l, m1l ); \
+   h6h = _mm256_xor_si256( h6h, m2h ); \
+   h6l = _mm256_xor_si256( h6l, m2l ); \
+   h7h = _mm256_xor_si256( h7h, m3h ); \
+   h7l = _mm256_xor_si256( h7l, m3l ); \
+
+static const sph_u64 IV256[] = { /* 16 initial state words; jh256_4way_init passes them to jh_4way_init */
+	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
+	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
+	C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477),
+	C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8),
+	C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262),
+	C64e(0x277695f776248f94), C64e(0x87d5b6574780296c),
+	C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f),
+	C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769)
+};
+
+
+static const sph_u64 IV512[] = { /* 16 initial state words; jh512_4way_init passes them to jh_4way_init */
+	C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543),
+	C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361),
+	C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80),
+	C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7),
+	C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a),
+	C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199),
+	C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
+	C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
+};
+
+#else
+
+
+#endif
+
+#define SL(ro)   SLu(r + ro, ro) /* round r+ro; a variable named `r` must exist at the expansion site */
+
+#define SLu(r, ro)   do { /* one round: S on the even and odd words, L across both, then W<ro> on the odd words */ \
+		S(h0, h2, h4, h6, Ceven_, r); \
+		S(h1, h3, h5, h7, Codd_, r); \
+		L(h0, h2, h4, h6, h1, h3, h5, h7); \
+		W ## ro(h1); /* token pasting selects W0..W6 */ \
+		W ## ro(h3); \
+		W ## ro(h5); \
+		W ## ro(h7); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_JH
+
+#if SPH_JH_64
+
+/*
+ * The "small footprint" 64-bit version just uses a partially unrolled
+ * loop.
+ */
+
+#define E8   do { /* 42 rounds as 6 passes of 7; SL(ro) expands to round r+ro */ \
+		unsigned r; \
+		for (r = 0; r < 42; r += 7) { \
+			SL(0); \
+			SL(1); \
+			SL(2); \
+			SL(3); \
+			SL(4); \
+			SL(5); \
+			SL(6); \
+		} \
+	} while (0)
+
+#else
+
+
+#endif
+
+#else
+
+#if SPH_JH_64
+
+/*
+ * On a "true 64-bit" architecture, we can unroll at will.
+ */
+
+#define E8   do { /* rounds 0..41 fully unrolled; the second argument cycles 0..6 */ \
+		SLu( 0, 0); \
+		SLu( 1, 1); \
+		SLu( 2, 2); \
+		SLu( 3, 3); \
+		SLu( 4, 4); \
+		SLu( 5, 5); \
+		SLu( 6, 6); \
+		SLu( 7, 0); \
+		SLu( 8, 1); \
+		SLu( 9, 2); \
+		SLu(10, 3); \
+		SLu(11, 4); \
+		SLu(12, 5); \
+		SLu(13, 6); \
+		SLu(14, 0); \
+		SLu(15, 1); \
+		SLu(16, 2); \
+		SLu(17, 3); \
+		SLu(18, 4); \
+		SLu(19, 5); \
+		SLu(20, 6); \
+		SLu(21, 0); \
+		SLu(22, 1); \
+		SLu(23, 2); \
+		SLu(24, 3); \
+		SLu(25, 4); \
+		SLu(26, 5); \
+		SLu(27, 6); \
+		SLu(28, 0); \
+		SLu(29, 1); \
+		SLu(30, 2); \
+		SLu(31, 3); \
+		SLu(32, 4); \
+		SLu(33, 5); \
+		SLu(34, 6); \
+		SLu(35, 0); \
+		SLu(36, 1); \
+		SLu(37, 2); \
+		SLu(38, 3); \
+		SLu(39, 4); \
+		SLu(40, 5); \
+		SLu(41, 6); \
+	} while (0)
+
+#else
+
+
+#endif
+
+#endif
+
+/* Reset sc: broadcast each of the 16 words at iv into all four lanes of H[i]. */
+static void
+jh_4way_init( jh_4way_context *sc, const void *iv )
+{
+    const uint64_t *v = iv;   /* keep the const qualifier of iv */
+    for ( int i = 0; i < 16; i++ )
+        sc->H[i] = _mm256_set1_epi64x( (long long)v[i] );
+    sc->ptr = 0;              /* nothing buffered */
+    sc->block_count = 0;      /* no block compressed yet */
+}
+
+static void /* absorb len bytes per lane of 4-way interleaved input into sc */
+jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
+{
+    __m256i *buf;
+    __m256i *vdata = (__m256i*)data;
+   const int buf_size = 64;   // bytes per lane; 64>>3 = 8 vectors, the size of sc->buf
+   size_t ptr;
+   DECL_STATE
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+
+   if ( len < (buf_size - ptr) )   // block stays incomplete: only buffer the input
+   {
+       memcpy_m256i( buf + (ptr>>3), vdata, len>>3 );   // >>3 turns bytes into vector indices; a remainder below 8 is dropped
+       ptr += len;
+       sc->ptr = ptr;
+       return;
+   }
+
+   READ_STATE(sc);
+   while ( len > 0 )
+   {
+       size_t clen;
+       clen = buf_size - ptr;   // room left in the block buffer
+       if ( clen > len )
+          clen = len;
+
+       memcpy_m256i( buf + (ptr>>3), vdata, clen>>3 );   // assumes memcpy_m256i counts in __m256i units -- confirm in avxdefs.h
+       ptr += clen;
+       vdata += (clen>>3);
+       len -= clen;
+       if ( ptr == buf_size )   // block complete: mix it into the state
+       {
+          INPUT_BUF1;
+          E8;
+          INPUT_BUF2;
+          sc->block_count ++;
+          ptr = 0;
+       }
+   }
+   WRITE_STATE(sc);
+   sc->ptr = ptr;
+}
+
+static void /* append padding and length, absorb them, then store H[8..15] to dst */
+jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
+               size_t out_size_w32, const void *iv )   // NOTE(review): ub, n, out_size_w32 and iv are never read
+{
+   __m256i buf[16*4];
+   __m256i *dst256 = (__m256i*)dst;
+   size_t numz, u;
+   sph_u64 l0, l1, l0e, l1e;
+
+   buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 );   // first padding word: 0x80 in every lane
+
+   if ( sc->ptr == 0 )
+       numz = 48;
+   else
+       numz = 112 - sc->ptr;   // numz counts the 0x80 word plus the zero fill, in bytes per lane
+
+   memset_zero_m256i( buf+1, (numz>>3) - 1 );   
+
+   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);   // looks like the message length in bits -- confirm
+   l1 = SPH_T64(sc->block_count >> 55);
+   sph_enc64be( &l0e, l0 );
+   sph_enc64be( &l1e, l1 );
+   *(buf + (numz>>3)    ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e );
+   *(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e ); 
+
+   jh_4way_core( sc, buf, numz + 16 );   // numz bytes of padding plus the two 8-byte length words
+
+   for ( u=0; u < 8; u++ )
+       buf[u] = sc->H[u+8];
+
+    memcpy_m256i( dst256, buf, 8 );   // NOTE(review): 8 vectors are stored for jh256 too; sc is not re-initialized afterwards
+}
+
+void
+jh256_4way_init(void *cc)	/* cc: jh_4way_context to reset with IV256 */
+{
+	jh_4way_init(cc, IV256);
+}
+
+void
+jh256_4way(void *cc, const void *data, size_t len)	/* forwards unchanged to jh_4way_core */
+{
+	jh_4way_core(cc, data, len);
+}
+
+void
+jh256_4way_close(void *cc, void *dst)	/* finalize cc and store the result vectors at dst */
+{
+	jh_4way_close(cc, 0, 0, dst, 8, IV256);	/* NOTE(review): jh_4way_close never reads the 8 or IV256 and stores 8 vectors */
+}
+
+void
+jh512_4way_init(void *cc)	/* cc: jh_4way_context to reset with IV512 */
+{
+	jh_4way_init(cc, IV512);
+}
+
+void
+jh512_4way(void *cc, const void *data, size_t len)	/* forwards unchanged to jh_4way_core */
+{
+	jh_4way_core(cc, data, len);
+}
+
+void
+jh512_4way_close(void *cc, void *dst)	/* finalize cc and store 8 result vectors at dst */
+{
+	jh_4way_close(cc, 0, 0, dst, 16, IV512);	/* NOTE(review): jh_4way_close never reads the 16 or IV512 */
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -0,0 +1,100 @@
+/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * JH interface. JH is a family of functions which differ by
+ * their output size; this 4-way AVX2 header declares JH for
+ * output sizes 256 and 512 bits only.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     jh-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef JH_HASH_4WAY_H__
+#define JH_HASH_4WAY_H__
+
+#ifdef __AVX2__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_jh256   256
+
+#define SPH_SIZE_jh512   512
+
+/**
+ * This structure is a context for 4-way JH computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a JH computation has been closed, the context must be re-initialized
+ * with the matching init function before it is used again.
+ *
+ * The contents of this structure are private. A running JH computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+    __m256i buf[8] __attribute__ ((aligned (64)));   /* pending input block: 8 vectors */
+    __m256i H[16];          /* 16 state vectors; jh_4way_init fills all four 64-bit lanes of each */
+    size_t ptr;             /* fill level of buf in bytes per lane (jh_4way_core indexes buf by ptr>>3) */
+    uint64_t block_count;   /* incremented by jh_4way_core for every block it compresses */
+/*
+	unsigned char buf[64]; 
+	size_t ptr;
+	union {
+		sph_u64 wide[16];
+	} H;
+	sph_u64 block_count;
+*/
+} jh_4way_context;
+
+typedef jh_4way_context jh256_4way_context;
+
+typedef jh_4way_context jh512_4way_context;
+
+void jh256_4way_init(void *cc);   /* cc points to a jh256_4way_context */
+
+void jh256_4way(void *cc, const void *data, size_t len);   /* absorb len bytes per lane of interleaved data */
+
+void jh256_4way_close(void *cc, void *dst);   /* NOTE(review): stores 8 __m256i at dst, same as the 512-bit variant */
+
+void jh512_4way_init(void *cc);   /* cc points to a jh512_4way_context */
+
+void jh512_4way(void *cc, const void *data, size_t len);   /* absorb len bytes per lane of interleaved data */
+
+void jh512_4way_close(void *cc, void *dst);   /* stores 8 __m256i at dst */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -0,0 +1,228 @@
+#if defined(JHA_4WAY)
+
+#include "jha-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include "avxdefs.h"
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+//static __thread keccak512_4way_context jha_kec_mid
+//                                   __attribute__ ((aligned (64)));
+
+/*
+ * JHA hash of four 80-byte block headers at once.
+ *
+ * input  : the four headers interleaved in 64-bit words.
+ * output : 4 x 32 bytes; the hash of lane k starts at byte offset 32*k.
+ *
+ * After an initial Keccak-512, each of three rounds picks Groestl or Skein
+ * and then Blake or JH per lane from the low bit of word 0 of that lane's
+ * current hash (bit set: Groestl / Blake, bit clear: Skein / JH). The bit
+ * is read again after the first pick, exactly as the scalar jha_hash() does.
+ *
+ * Groestl has no 4-way form here, so the lanes are de-interleaved, hashed
+ * one at a time and interleaved again.
+ */
+void jha_hash_4way( void *output, const void *input )
+{
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhasha[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashb[8*4] __attribute__ ((aligned (64)));
+    __m256i mask;
+    __m256i* vh256 = (__m256i*)vhash;
+    __m256i* vha256 = (__m256i*)vhasha;
+    __m256i* vhb256 = (__m256i*)vhashb;
+
+    uint64_t *lane[4] = { hash0, hash1, hash2, hash3 };
+    unsigned char *out = output;   // byte pointer: no arithmetic on void*
+
+    blake512_4way_context  ctx_blake;
+    hashState_groestl      ctx_groestl;
+    jh512_4way_context     ctx_jh;
+    skein512_4way_context  ctx_skein;
+    keccak512_4way_context ctx_keccak;
+
+    keccak512_4way_init( &ctx_keccak );
+    keccak512_4way( &ctx_keccak, input, 80 );
+    keccak512_4way_close( &ctx_keccak, vhash );
+
+    // Heavy & Light Pair Loop
+    for ( int round = 0; round < 3; round++ )
+    {
+       // vhasha and vhashb hold 8 vectors each (8 words x 4 lanes)
+       memset_zero_m256i( vha256, 8 );
+       memset_zero_m256i( vhb256, 8 );
+
+       // all-ones in lanes whose selector bit is clear, zero elsewhere
+       mask = _mm256_sub_epi64( _mm256_and_si256( vh256[0],
+                        mm256_vec_epi64( 0x1 ) ), mm256_vec_epi64( 0x1 ) );
+
+       // groestl (serial) v skein
+
+       m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+       for ( int k = 0; k < 4; k++ )
+       {
+          init_groestl( &ctx_groestl, 64 );
+          update_and_final_groestl( &ctx_groestl, (char*)lane[k],
+                                    (char*)lane[k], 512 );
+       }
+
+       m256_interleave_4x64( vhasha, hash0, hash1, hash2, hash3, 512 );
+
+       // skein
+
+       skein512_4way_init( &ctx_skein );
+       skein512_4way( &ctx_skein, vhash, 64 );
+       skein512_4way_close( &ctx_skein, vhashb );
+
+       // merge vectored hash: groestl where mask is zero, skein elsewhere
+       for ( int i = 0; i < 8; i++ )
+          vh256[i] = _mm256_blendv_epi8( vha256[i], vhb256[i], mask );
+
+       // the second pick must use the bit of the hash just produced
+       mask = _mm256_sub_epi64( _mm256_and_si256( vh256[0],
+                        mm256_vec_epi64( 0x1 ) ), mm256_vec_epi64( 0x1 ) );
+
+       // blake v jh
+
+       blake512_4way_init( &ctx_blake );
+       blake512_4way( &ctx_blake, vhash, 64 );
+       blake512_4way_close( &ctx_blake, vhasha );
+
+       jh512_4way_init( &ctx_jh );
+       jh512_4way( &ctx_jh, vhash, 64 );
+       jh512_4way_close( &ctx_jh, vhashb );
+
+       // merge vectored hash: blake where mask is zero, jh elsewhere
+       for ( int i = 0; i < 8; i++ )
+          vh256[i] = _mm256_blendv_epi8( vha256[i], vhb256[i], mask );
+    }
+
+    m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+    // only the first 256 bits of each lane's hash are returned
+    for ( int k = 0; k < 4; k++ )
+    {
+       memcpy( out + 32*k, lane[k], 32 );
+    }
+
+}
+
+/*
+ * Scan nonces for JHA four at a time, starting at work->data[19].
+ *
+ * The header is byte-swapped, copied into all four lanes and interleaved
+ * once; only the four nonce words are rewritten for each batch.
+ *
+ * Each batch hashes nonces n..n+3. Scanning stops after the first batch
+ * with a hash that passes fulltest() against work->target, once n reaches
+ * max_nonce, or when work_restart[thr_id].restart is set. For lane k of
+ * the last batch, work->nfound[k] says whether work->nonces[k] is a winner.
+ *
+ * Returns the number of winning nonces; *hashes_done receives the number
+ * of nonces hashed.
+ */
+int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done )
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+     // zero-filled: word 19 is interleaved below before any nonce is stored
+     uint32_t endiandata[20] __attribute__((aligned(64))) = { 0 };
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t Htarg = ptarget[7];
+     uint32_t n = pdata[19];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     // nonce word of each lane inside the interleaved header
+     uint32_t *noncep[4] = {
+        vdata + 73,   // 9*8 + 1
+        vdata + 75,
+        vdata + 77,
+        vdata + 79
+     };
+
+     uint64_t htmax[] = {
+        0,
+        0xF,
+        0xFF,
+        0xFFF,
+        0xFFFF,
+        0x10000000
+     };
+     uint32_t masks[] = {
+        0xFFFFFFFF,
+        0xFFFFFFF0,
+        0xFFFFFF00,
+        0xFFFFF000,
+        0xFFFF0000,
+        0
+     };
+
+   // we need bigendian data...
+   for ( int i=0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   // No Keccak midstate is precomputed here: jha_hash_4way() hashes the
+   // full 80 bytes for every batch.
+
+   for ( int m = 0; m < 6; m++ )
+   {
+      if ( Htarg <= htmax[m] )
+      {
+         uint32_t mask = masks[m];
+
+         do {
+              for ( int k = 0; k < 4; k++ )
+              {
+                 found[k] = false;
+                 be32enc( noncep[k], n + k );
+              }
+
+              jha_hash_4way( hash, vdata );
+
+              pdata[19] = n;
+
+              for ( int k = 0; k < 4; k++ )
+              {
+                 uint32_t *lane_hash = hash + 8*k;   // 8 words per lane
+                 if ( ( !(lane_hash[7] & mask) )
+                      && fulltest( lane_hash, ptarget ) )
+                 {
+                    found[k] = true;
+                    num_found++;
+                    nonces[k] = n + k;
+                    work_set_target_ratio( work, lane_hash );
+                 }
+              }
+
+              n += 4;
+         } while ( ( num_found == 0 ) && ( n < max_nonce )
+                     && !work_restart[thr_id].restart );
+
+         break;
+      }
+   }
+
+   // n started at first_nonce and advanced by 4 per batch: exact count
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+#endif
--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -0,0 +1,18 @@
+#include "jha-gate.h"
+
+
+/*
+ * Register the JHA callbacks on gate. Always returns true.
+ */
+bool register_jha_algo( algo_gate_t* gate )
+{
+  // Only the scalar implementation is registered. The 4-way variants
+  // (scanhash_jha_4way / jha_hash_4way) are not wired in: their
+  // prototypes in jha-gate.h are commented out as well.
+  gate->optimizations = SSE2_OPT | AES_OPT;
+  gate->scanhash         = (void*)&scanhash_jha;
+  gate->hash             = (void*)&jha_hash;
+  gate->set_target       = (void*)&scrypt_set_target;
+  return true;
+}
+
--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -0,0 +1,27 @@
+#ifndef JHA_GATE_H__
+#define JHA_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+
+#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+  #define JHA_4WAY
+#endif
+
+//#if defined JHA_4WAY
+//void jha_hash_4way( void *state, const void *input );
+
+//int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+//                       uint64_t *hashes_done );
+//#else
+
+void jha_hash( void *state, const void *input );   // jha.c: copies the first 32 hash bytes to state
+
+int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );       // jha.c: returns 1 when a nonce is found, else 0
+
+//#endif
+
+#endif
+
--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -0,0 +1,155 @@
+#include "jha-gate.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "algo/blake/sph_blake.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+
+#ifdef NO_AES_NI
+  #include "algo/groestl/sph_groestl.h"
+#else
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+#endif
+
+static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64)));
+
+void jha_kec_midstate( const void* input )   /* cache the Keccak-512 state after the first 64 bytes of input */
+{
+    sph_keccak512_init( &jha_kec_mid );       /* jha_kec_mid is declared __thread: one copy per thread */
+    sph_keccak512( &jha_kec_mid, input, 64 );
+}
+
+/*
+ * Scalar JHA hash of one 80-byte block header.
+ *
+ * jha_kec_midstate() must have cached the Keccak-512 state of the first 64
+ * bytes on this thread; only bytes 64..79 of input are read here. The first
+ * 32 bytes of the final 64-byte hash are copied to output.
+ */
+void jha_hash(void *output, const void *input)
+{
+	uint8_t _ALIGN(128) hash[64];
+	const uint8_t *in = input;	/* byte pointer: no arithmetic on void* */
+
+#ifdef NO_AES_NI
+	sph_groestl512_context ctx_groestl;
+#else
+	hashState_groestl ctx_groestl;
+#endif
+	sph_blake512_context ctx_blake;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+
+	memcpy(&ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid);
+	sph_keccak512(&ctx_keccak, in + 64, 16);
+	sph_keccak512_close(&ctx_keccak, hash);
+
+	// Heavy & Light Pair Loop
+	for (int round = 0; round < 3; round++) {
+		if (hash[0] & 0x01) {
+#ifdef NO_AES_NI
+			sph_groestl512_init(&ctx_groestl);
+			sph_groestl512(&ctx_groestl, hash, 64);
+			sph_groestl512_close(&ctx_groestl, hash);
+#else
+			init_groestl(&ctx_groestl, 64);
+			update_and_final_groestl(&ctx_groestl, (char*)hash, (char*)hash, 512);
+#endif
+		} else {
+			sph_skein512_init(&ctx_skein);
+			sph_skein512(&ctx_skein, hash, 64);
+			sph_skein512_close(&ctx_skein, hash);
+		}
+		/* hash[0] was just rewritten, so this is a fresh decision */
+		if (hash[0] & 0x01) {
+			sph_blake512_init(&ctx_blake);
+			sph_blake512(&ctx_blake, hash, 64);
+			sph_blake512_close(&ctx_blake, hash);
+		} else {
+			sph_jh512_init(&ctx_jh);
+			sph_jh512(&ctx_jh, hash, 64);
+			sph_jh512_close(&ctx_jh, hash);
+		}
+	}
+
+	memcpy(output, hash, 32);
+}
+
+/*
+ * Scalar JHA nonce scan starting at work->data[19].
+ *
+ * Returns 1 as soon as a hash passes fulltest() against work->target,
+ * otherwise 0 once n reaches max_nonce or a restart is flagged for thr_id.
+ * work->data[19] is left at the last nonce tried and *hashes_done is set
+ * to n - first_nonce + 1. When work->target[7] is above every htmax entry
+ * no nonce is hashed and work->data[19] ends up at first_nonce - 1.
+ */
+int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+{
+	uint32_t _ALIGN(128) hash32[8];
+	uint32_t _ALIGN(128) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+	uint32_t n = pdata[19] - 1;
+	int level = 0;
+
+	const uint64_t htmax[] = {
+		0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000
+	};
+	const uint32_t masks[] = {
+		0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0
+	};
+
+	// we need bigendian data...
+	for (int i=0; i < 19; i++) {
+		be32enc(&endiandata[i], pdata[i]);
+	}
+
+	// Keccak state of the first 64 header bytes, reused for every nonce
+	jha_kec_midstate( endiandata );
+
+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
+
+	// see blake.c if else to understand the loop on htmax => mask
+	while (level < 6 && Htarg > htmax[level])
+		level++;
+
+	if (level < 6) {
+		const uint32_t mask = masks[level];
+
+		do {
+			pdata[19] = ++n;
+			be32enc(&endiandata[19], n);
+			jha_hash(hash32, endiandata);
+#ifdef DEBUG_ALGO
+			if (!(n % 0x1000) && !thr_id) printf(".");
+#endif
+			// cheap filter on the top word before the full comparison
+			if (hash32[7] & mask)
+				continue;
+#ifdef DEBUG_ALGO
+			printf("[%d]",thr_id);
+#endif
+			if (fulltest(hash32, ptarget)) {
+				work_set_target_ratio(work, hash32);
+				*hashes_done = n - first_nonce + 1;
+				return 1;
+			}
+		} while (n < max_nonce && !work_restart[thr_id].restart);
+	}
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
+
--- a/algo/jh/sph_jh.c
+++ b/algo/jh/sph_jh.c
@@ -914,6 +914,7 @@ jh_core(sph_jh_context *sc, const void *data, size_t len)

 	buf = sc->buf;
 	ptr = sc->ptr;
+
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
--- a/algo/jh/sse2/jh_sse2_opt64.h
+++ b/algo/jh/sse2/jh_sse2_opt64.h
@@ -22,15 +22,12 @@
 */


-
 #include <emmintrin.h>
 #include <stdint.h>
 #include <string.h>
+#include "algo/sha/sha3-defs.h"

 typedef __m128i  word128;   /*word128 defines a 128-bit SSE2 word*/
-
-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
 typedef enum {jhSUCCESS = 0, jhFAIL = 1, jhBAD_HASHLEN = 2} jhReturn;

 /*define data alignment for different C compilers*/
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -0,0 +1,104 @@
+#include "keccak-gate.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "sph_keccak.h"
+#include "keccak-hash-4way.h"
+
+#ifdef KECCAK_4WAY
+
+/* Keccak-256 of four interleaved 80-byte inputs; lane k's 32 bytes go to state + 32*k. */
+void keccakhash_4way(void *state, const void *input)
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     unsigned char *out = state;   // byte pointer: no arithmetic on void*
+     keccak256_4way_context ctx;
+
+     keccak256_4way_init( &ctx );
+     keccak256_4way( &ctx, input, 80 );
+     keccak256_4way_close( &ctx, vhash );
+     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
+     memcpy( out,      hash0, 32 );
+     memcpy( out + 32, hash1, 32 );
+     memcpy( out + 64, hash2, 32 );
+     memcpy( out + 96, hash3, 32 );
+}
+
+/*
+ * Scan nonces for Keccak four at a time, starting at work->data[19].
+ *
+ * Each batch hashes nonces n..n+3. Scanning stops after the first batch
+ * with a hash that passes fulltest() against work->target, once n reaches
+ * max_nonce-4, or when work_restart[thr_id].restart is set. For lane k of
+ * the last batch, work->nfound[k] says whether work->nonces[k] is a winner.
+ *
+ * Returns the number of winning nonces; *hashes_done receives the number
+ * of nonces hashed.
+ */
+int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done)
+{
+   uint32_t hash[4*8] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   // aligned because it is read through a uint64_t pointer; zero-filled
+   // because word 19 is interleaved before any nonce is stored
+   uint32_t endiandata[20] __attribute__ ((aligned (64))) = { 0 };
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   // nonce word of each lane inside the interleaved header
+   uint32_t *noncep[4] = {
+      vdata + 73,   // 9*8 + 1
+      vdata + 75,
+      vdata + 77,
+      vdata + 79
+   };
+
+   for ( int i=0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   m256_interleave_4x64x( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do {
+      for ( int k = 0; k < 4; k++ )
+      {
+         found[k] = false;
+         be32enc( noncep[k], n + k );
+      }
+
+      keccakhash_4way( hash, vdata );
+
+      // record the first nonce of the batch just tried, winner or not
+      pdata[19] = n;
+
+      for ( int k = 0; k < 4; k++ )
+      {
+         uint32_t *lane_hash = hash + 8*k;   // 8 words per lane
+         if ( ( ( lane_hash[7] & 0xFFFFFF00 ) == 0 )
+            && fulltest( lane_hash, ptarget ) )
+         {
+            found[k] = true;
+            num_found++;
+            nonces[k] = n + k;
+         }
+      }
+      n += 4;
+
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   // n started at first_nonce and advanced by 4 per batch: exact count
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+
+#endif
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -0,0 +1,27 @@
+#include "keccak-gate.h"
+
+void keccak_set_target( struct work* work, double job_diff )
+{
+  work_set_target( work, job_diff / (128.0 * opt_diff_factor) );   // job difficulty divided by 128 * opt_diff_factor
+}
+
+int64_t keccak_get_max64(void) { return 0x7ffffLL; }   /* constant served through the gate's get_max64 hook */
+
+/* Register the Keccak callbacks on gate (4-way ones under KECCAK_4WAY). Always returns true. */
+bool register_keccak_algo( algo_gate_t* gate )
+{
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  gate->set_target      = (void*)&keccak_set_target;
+  gate->get_max64       = (void*)&keccak_get_max64;
+#if defined (KECCAK_4WAY)
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
+  gate->scanhash  = (void*)&scanhash_keccak_4way;
+  gate->hash      = (void*)&keccakhash_4way;
+#else
+  gate->optimizations = SSE2_OPT;
+  gate->scanhash        = (void*)&scanhash_keccak;
+  gate->hash            = (void*)&keccakhash;
+#endif
+  return true;
+}
+
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -0,0 +1,23 @@
+#ifndef KECCAK_GATE_H__
+#define KECCAK_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// The 4-way path needs the FOUR_WAY build switch and an AVX2 target.
+#if defined(FOUR_WAY) && defined(__AVX2__)
+  #define KECCAK_4WAY
+#endif
+
+#if defined(KECCAK_4WAY)
+// Four-lane variants; keccakhash_4way stores 4 x 32 bytes at state.
+void keccakhash_4way( void *state, const void *input );
+int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
+// Scalar variants, always declared.
+void keccakhash( void *state, const void *input );
+int scanhash_keccak( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+#endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -0,0 +1,507 @@
+#include <stddef.h>
+#include "keccak-hash-4way.h"
+
+#if defined(__AVX2__)
+/* Keccak-f[1600] round constants: 24 entries, round i uses RC[i]. */
+static const sph_u64 RC[] = {
+        SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
+        SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
+        SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
+        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
+        SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
+        SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
+        SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
+        SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
+        SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
+        SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
+        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
+        SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
+};
+/* Names for the 25 state lanes: aXY is kc->w[X + 5*Y]. */
+#define a00   (kc->w[ 0])
+#define a10   (kc->w[ 1])
+#define a20   (kc->w[ 2])
+#define a30   (kc->w[ 3])
+#define a40   (kc->w[ 4])
+#define a01   (kc->w[ 5])
+#define a11   (kc->w[ 6])
+#define a21   (kc->w[ 7])
+#define a31   (kc->w[ 8])
+#define a41   (kc->w[ 9])
+#define a02   (kc->w[10])
+#define a12   (kc->w[11])
+#define a22   (kc->w[12])
+#define a32   (kc->w[13])
+#define a42   (kc->w[14])
+#define a03   (kc->w[15])
+#define a13   (kc->w[16])
+#define a23   (kc->w[17])
+#define a33   (kc->w[18])
+#define a43   (kc->w[19])
+#define a04   (kc->w[20])
+#define a14   (kc->w[21])
+#define a24   (kc->w[22])
+#define a34   (kc->w[23])
+#define a44   (kc->w[24])
+/* These expand to nothing: the round macros work on kc->w in place. */
+#define DECL_STATE
+#define READ_STATE(sc)
+#define WRITE_STATE(sc)
+/* XOR the first size/8 vectors of buf into the state (uses kc and buf). */
+#define INPUT_BUF(size)   do { \
+    size_t j; \
+    for (j = 0; j < (size>>3); j++ ) \
+        kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
+} while (0)
+/* Vector with every bit set; XOR with it implements NOT64. */
+#define mm256_neg1 \
+        (_mm256_set_epi64x( 0xffffffffffffffff, 0xffffffffffffffff, \
+                            0xffffffffffffffff, 0xffffffffffffffff ) )
+/* 64-bit primitives mapped onto __m256i, i.e. four 64-bit words at once. */
+#define DECL64(x)        __m256i x
+#define MOV64(d, s)      (d = s)
+#define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
+#define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
+#define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
+#define NOT64(d, s)      (d = _mm256_xor_si256(s,mm256_neg1))
+#define ROL64(d, v, n)   (d = mm256_rotl_64(v, n))
+#define XOR64_IOTA       XOR64
+/* t = (c0^c1^c2^c3^c4) ^ ROL64(d0^d1^d2^d3^d4, 1): one theta term. */
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
+                DECL64(tt0); \
+                DECL64(tt1); \
+                DECL64(tt2); \
+                DECL64(tt3); \
+                XOR64(tt0, d0, d1); \
+                XOR64(tt1, d2, d3); \
+                XOR64(tt0, tt0, d4); \
+                XOR64(tt0, tt0, tt1); \
+                ROL64(tt0, tt0, 1); \
+                XOR64(tt2, c0, c1); \
+                XOR64(tt3, c2, c3); \
+                XOR64(tt0, tt0, c4); \
+                XOR64(tt2, tt2, tt3); \
+                XOR64(t, tt0, tt2); \
+        } while (0)
+/* Compute the five terms t0..t4, then XOR tX into the lanes bX0..bX4. */
+#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(t0); \
+                DECL64(t1); \
+                DECL64(t2); \
+                DECL64(t3); \
+                DECL64(t4); \
+                TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
+                TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
+                TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
+                TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
+                TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
+                XOR64(b00, b00, t0); \
+                XOR64(b01, b01, t0); \
+                XOR64(b02, b02, t0); \
+                XOR64(b03, b03, t0); \
+                XOR64(b04, b04, t0); \
+                XOR64(b10, b10, t1); \
+                XOR64(b11, b11, t1); \
+                XOR64(b12, b12, t1); \
+                XOR64(b13, b13, t1); \
+                XOR64(b14, b14, t1); \
+                XOR64(b20, b20, t2); \
+                XOR64(b21, b21, t2); \
+                XOR64(b22, b22, t2); \
+                XOR64(b23, b23, t2); \
+                XOR64(b24, b24, t2); \
+                XOR64(b30, b30, t3); \
+                XOR64(b31, b31, t3); \
+                XOR64(b32, b32, t3); \
+                XOR64(b33, b33, t3); \
+                XOR64(b34, b34, t3); \
+                XOR64(b40, b40, t4); \
+                XOR64(b41, b41, t4); \
+                XOR64(b42, b42, t4); \
+                XOR64(b43, b43, t4); \
+                XOR64(b44, b44, t4); \
+        } while (0)
+/* Rotate each lane left by its fixed offset; b00 is left unrotated. */
+#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                /* ROL64(b00, b00,  0); */ \
+                ROL64(b01, b01, 36); \
+                ROL64(b02, b02,  3); \
+                ROL64(b03, b03, 41); \
+                ROL64(b04, b04, 18); \
+                ROL64(b10, b10,  1); \
+                ROL64(b11, b11, 44); \
+                ROL64(b12, b12, 10); \
+                ROL64(b13, b13, 45); \
+                ROL64(b14, b14,  2); \
+                ROL64(b20, b20, 62); \
+                ROL64(b21, b21,  6); \
+                ROL64(b22, b22, 43); \
+                ROL64(b23, b23, 15); \
+                ROL64(b24, b24, 61); \
+                ROL64(b30, b30, 28); \
+                ROL64(b31, b31, 55); \
+                ROL64(b32, b32, 25); \
+                ROL64(b33, b33, 21); \
+                ROL64(b34, b34, 56); \
+                ROL64(b40, b40, 27); \
+                ROL64(b41, b41, 20); \
+                ROL64(b42, b42, 39); \
+                ROL64(b43, b43,  8); \
+                ROL64(b44, b44, 14); \
+        } while (0)
+
+/*
+ * The KHI macro integrates the "lane complement" optimization. On input,
+ * some words are complemented:
+ *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
+ * On output, the following words are complemented:
+ *    a04 a10 a20 a22 a23 a31
+ *
+ * The (implicit) permutation and the theta expansion will bring back
+ * the input mask for the next round.
+ */
+/* d = a ^ (b | c) */
+#define KHI_XO(d, a, b, c)   do { \
+                DECL64(kt); \
+                OR64(kt, b, c); \
+                XOR64(d, a, kt); \
+        } while (0)
+/* d = a ^ (b & c) */
+#define KHI_XA(d, a, b, c)   do { \
+                DECL64(kt); \
+                AND64(kt, b, c); \
+                XOR64(d, a, kt); \
+        } while (0)
+/* Five groups of five lanes; each is computed into c0..c4, then stored. */
+#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(c0); \
+                DECL64(c1); \
+                DECL64(c2); \
+                DECL64(c3); \
+                DECL64(c4); \
+                DECL64(bnn); \
+                NOT64(bnn, b20); \
+                KHI_XO(c0, b00, b10, b20); \
+                KHI_XO(c1, b10, bnn, b30); \
+                KHI_XA(c2, b20, b30, b40); \
+                KHI_XO(c3, b30, b40, b00); \
+                KHI_XA(c4, b40, b00, b10); \
+                MOV64(b00, c0); \
+                MOV64(b10, c1); \
+                MOV64(b20, c2); \
+                MOV64(b30, c3); \
+                MOV64(b40, c4); \
+                NOT64(bnn, b41); \
+                KHI_XO(c0, b01, b11, b21); \
+                KHI_XA(c1, b11, b21, b31); \
+                KHI_XO(c2, b21, b31, bnn); \
+                KHI_XO(c3, b31, b41, b01); \
+                KHI_XA(c4, b41, b01, b11); \
+                MOV64(b01, c0); \
+                MOV64(b11, c1); \
+                MOV64(b21, c2); \
+                MOV64(b31, c3); \
+                MOV64(b41, c4); \
+                NOT64(bnn, b32); \
+                KHI_XO(c0, b02, b12, b22); \
+                KHI_XA(c1, b12, b22, b32); \
+                KHI_XA(c2, b22, bnn, b42); \
+                KHI_XO(c3, bnn, b42, b02); \
+                KHI_XA(c4, b42, b02, b12); \
+                MOV64(b02, c0); \
+                MOV64(b12, c1); \
+                MOV64(b22, c2); \
+                MOV64(b32, c3); \
+                MOV64(b42, c4); \
+                NOT64(bnn, b33); \
+                KHI_XA(c0, b03, b13, b23); \
+                KHI_XO(c1, b13, b23, b33); \
+                KHI_XO(c2, b23, bnn, b43); \
+                KHI_XA(c3, bnn, b43, b03); \
+                KHI_XO(c4, b43, b03, b13); \
+                MOV64(b03, c0); \
+                MOV64(b13, c1); \
+                MOV64(b23, c2); \
+                MOV64(b33, c3); \
+                MOV64(b43, c4); \
+                NOT64(bnn, b14); \
+                KHI_XA(c0, b04, bnn, b24); \
+                KHI_XO(c1, bnn, b24, b34); \
+                KHI_XA(c2, b24, b34, b44); \
+                KHI_XO(c3, b34, b44, b04); \
+                KHI_XA(c4, b44, b04, b14); \
+                MOV64(b04, c0); \
+                MOV64(b14, c1); \
+                MOV64(b24, c2); \
+                MOV64(b34, c3); \
+                MOV64(b44, c4); \
+        } while (0)
+/* XOR the round constant r into lane a00. */
+#define IOTA(r)   XOR64_IOTA(a00, a00, r)
+/* Lane orderings for KF_ELT; KECCAK_F_1600_ only references P0..P8. */
+#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
+              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
+#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
+              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
+#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
+              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
+#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
+              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
+#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
+              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
+#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
+              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
+#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
+              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
+#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
+              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
+#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
+              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
+#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
+              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
+#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
+              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
+#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
+              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
+#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
+              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
+#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
+              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
+#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
+              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
+#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
+              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
+#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
+              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
+#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
+              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
+#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
+              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
+#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
+              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
+#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
+              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
+#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
+              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
+#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
+              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
+#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
+              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
+/* Eight 3-lane rotations through t: lanes go from P8 order back to P0 order. */
+#define P8_TO_P0   do { \
+                DECL64(t); \
+                MOV64(t, a01); \
+                MOV64(a01, a11); \
+                MOV64(a11, a43); \
+                MOV64(a43, t); \
+                MOV64(t, a02); \
+                MOV64(a02, a22); \
+                MOV64(a22, a31); \
+                MOV64(a31, t); \
+                MOV64(t, a03); \
+                MOV64(a03, a33); \
+                MOV64(a33, a24); \
+                MOV64(a24, t); \
+                MOV64(t, a04); \
+                MOV64(a04, a44); \
+                MOV64(a44, a12); \
+                MOV64(a12, t); \
+                MOV64(t, a10); \
+                MOV64(a10, a32); \
+                MOV64(a32, a13); \
+                MOV64(a13, t); \
+                MOV64(t, a14); \
+                MOV64(a14, a21); \
+                MOV64(a21, a20); \
+                MOV64(a20, t); \
+                MOV64(t, a23); \
+                MOV64(a23, a42); \
+                MOV64(a42, a40); \
+                MOV64(a40, t); \
+                MOV64(t, a30); \
+                MOV64(a30, a41); \
+                MOV64(a41, a34); \
+                MOV64(a34, t); \
+        } while (0)
+/* Parentheses as macros, so P##r is expanded before THETA/RHO/KHI is invoked. */
+#define LPAR   (
+#define RPAR   )
+/* One round: THETA and RHO on ordering r, KHI on ordering s, IOTA with k. */
+#define KF_ELT(r, s, k)   do { \
+                THETA LPAR P ## r RPAR; \
+                RHO LPAR P ## r RPAR; \
+                KHI LPAR P ## s RPAR; \
+                IOTA(k); \
+        } while (0)
+/* DO() adds the expansion pass that turns THETA LPAR ... RPAR into a call. */
+#define DO(x)   x
+
+#define KECCAK_F_1600   DO(KECCAK_F_1600_)
+/* 24 rounds: three passes of eight unrolled rounds, each ending in P8_TO_P0. */
+#define KECCAK_F_1600_   do { \
+    int j; \
+    for (j = 0; j < 24; j += 8) \
+    { \
+       KF_ELT( 0,  1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
+                                       RC[j + 0], RC[j + 0])) ); \
+       KF_ELT( 1,  2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
+                                       RC[j + 1], RC[j + 1])) ); \
+       KF_ELT( 2,  3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
+                                       RC[j + 2], RC[j + 2])) ); \
+       KF_ELT( 3,  4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
+                                       RC[j + 3], RC[j + 3])) ); \
+       KF_ELT( 4,  5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
+                                       RC[j + 4], RC[j + 4])) ); \
+       KF_ELT( 5,  6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
+                                       RC[j + 5], RC[j + 5])) ); \
+       KF_ELT( 6,  7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
+                                       RC[j + 6], RC[j + 6])) ); \
+       KF_ELT( 7,  8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
+                                       RC[j + 7], RC[j + 7])) ); \
+       P8_TO_P0; \
+    } \
+} while (0)
+
+/* Reset a 4-way context for an out_size-bit digest. */
+static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
+{
+   // Lanes that start out inverted for the "lane complement".
+   static const int complemented[6] = { 1, 2, 8, 12, 17, 20 };
+   const __m256i ones = mm256_neg1;
+   int k;
+
+   for ( k = 0; k < 25; k++ )
+      kc->w[k] = _mm256_setzero_si256();
+   for ( k = 0; k < 6; k++ )
+      kc->w[ complemented[k] ] = ones;
+
+   // Nothing buffered yet; lim is 200 minus a quarter of out_size.
+   kc->ptr = 0;
+   kc->lim = 200 - (out_size >> 2);
+}
+
+/* Absorb len bytes per lane from data (__m256i array); lim is the block size. */
+static void
+keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
+               size_t lim )
+{
+    // len, lim and the fill level count bytes; one __m256i covers 8 (>>3).
+    __m256i *buf = kc->buf;
+    __m256i *src = (__m256i*)data;
+    size_t fill = kc->ptr;
+    DECL_STATE
+
+    // Not enough for a full block: buffer it and return.
+    if ( len < (lim - fill) )
+    {
+        memcpy_m256i( buf + (fill>>3), src, len>>3 );
+        kc->ptr = fill + len;
+        return;
+    }
+
+    // Otherwise top up the buffer and process block by block.
+    READ_STATE( kc );
+    while ( len > 0 )
+    {
+        // Take whatever fits in the rest of the current block.
+        size_t take = ( lim - fill > len ) ? len : lim - fill;
+
+        memcpy_m256i( buf + (fill>>3), src, take>>3 );
+        fill += take;
+        src  += take>>3;
+        len  -= take;
+        if ( fill != lim )
+            continue;
+
+        // Block complete: XOR it into the state and run the permutation.
+        INPUT_BUF( lim );
+        KECCAK_F_1600;
+        fill = 0;
+    }
+    WRITE_STATE( kc );
+    kc->ptr = fill;
+}
+
+/* Pad the buffered data, process the last block, write byte_len bytes/lane. */
+static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
+            size_t lim )
+{
+    /*
+     * Padding block, one __m256i per 8 bytes of a lane.  A fixed size replaces
+     * the variable-length array inside a union (a GNU extension): 144/8
+     * vectors cover any lim up to 144; this file only passes 136 and 72.
+     */
+    __m256i pad[ 144/8 ];
+    const uint64_t first = 0x100 >> 8;            /* first padding bit */
+    const uint64_t last  = 0x8000000000000000;    /* final padding bit */
+    const size_t j = lim - kc->ptr;               /* bytes left in block */
+    const size_t m256_len = byte_len >> 3;
+
+    if ( j == 8 )
+    {
+        /* One word left per lane: both padding bits go into it. */
+        pad[0] = _mm256_set_epi64x( first | last, first | last,
+                                    first | last, first | last );
+    }
+    else
+    {
+        pad[0] = _mm256_set_epi64x( first, first, first, first );
+        memset_zero_m256i( pad + 1, (j>>3) - 2 );
+        pad[ (j>>3) - 1 ] = _mm256_set_epi64x( last, last, last, last );
+    }
+    keccak64_core( kc, pad, j, lim );
+
+    /* Finalize the "lane complement" */
+    NOT64( kc->w[ 1], kc->w[ 1] );
+    NOT64( kc->w[ 2], kc->w[ 2] );
+    NOT64( kc->w[ 8], kc->w[ 8] );
+    NOT64( kc->w[12], kc->w[12] );
+    NOT64( kc->w[17], kc->w[17] );
+    NOT64( kc->w[20], kc->w[20] );
+    memcpy_m256i( dst, kc->w, m256_len );
+}
+/* Keccak-256 init: out_size 256, so keccak64_init sets kc->lim to 136. */
+void keccak256_4way_init( void *kc )
+{
+   keccak64_init( kc, 256 );
+}
+/* Keccak-256 update: absorb len bytes per lane using a 136-byte block. */
+void
+keccak256_4way(void *cc, const void *data, size_t len)
+{
+    keccak64_core(cc, data, len, 136);
+}
+/* Keccak-256 final: pad, then write 32 bytes per lane (4 vectors) to dst. */
+void
+keccak256_4way_close(void *cc, void *dst)
+{
+    keccak64_close(cc, dst, 32, 136);
+}
+/* Keccak-512 init: out_size 512, so keccak64_init sets kc->lim to 72. */
+void keccak512_4way_init( void *kc )
+{
+   keccak64_init( kc, 512 );
+}
+/* Keccak-512 update: absorb len bytes per lane using a 72-byte block. */
+void
+keccak512_4way(void *cc, const void *data, size_t len)
+{
+        keccak64_core(cc, data, len, 72);
+}
+/* Keccak-512 final: pad, then write 64 bytes per lane (8 vectors) to dst. */
+void
+keccak512_4way_close(void *cc, void *dst)
+{
+        keccak64_close(cc, dst, 64, 72);
+}
+
+#endif
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -0,0 +1,94 @@
+/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * 4-way parallel (AVX2) Keccak interface, adapted from sph_keccak.h.
+ * Only the 256-bit and 512-bit output lengths are provided here; the
+ * 224-bit and 384-bit variants of the original are not.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_keccak.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef KECCAK_HASH_4WAY_H__
+#define KECCAK_HASH_4WAY_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef  __AVX2__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_keccak256   256
+
+/**
+ * Output size (in bits) for Keccak-512.
+ */
+#define SPH_SIZE_keccak512   512
+
+/**
+ * This structure is a context for Keccak computations: it contains the
+ * intermediate values and some data from the last entered block. Once a
+ * Keccak computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running Keccak computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+
+typedef struct {
+        __m256i buf[144/8];    /* one block: 8 bytes/lane per vector, max 144 */
+        __m256i w[25];         /* state: 25 lanes, four 64-bit words each */
+        size_t ptr;            /* bytes per lane currently held in buf */
+        size_t lim;            /* block size in bytes, set by the init call */
+} keccak64_ctx_m256i;
+
+typedef keccak64_ctx_m256i keccak256_4way_context;
+typedef keccak64_ctx_m256i keccak512_4way_context;
+/* Keccak-256: init, absorb len bytes per lane, then write the digests to dst. */
+void keccak256_4way_init(void *cc);
+void keccak256_4way(void *cc, const void *data, size_t len);
+void keccak256_4way_close(void *cc, void *dst);
+
+/* Keccak-512: same calling sequence as the 256-bit functions. */
+void keccak512_4way_init(void *cc);
+void keccak512_4way(void *cc, const void *data, size_t len);
+void keccak512_4way_close(void *cc, void *dst);
+void keccak512_4way_addbits_and_close(  /* NOTE(review): not defined in keccak-hash-4way.c */
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/keccak/keccak.c
+++ b/algo/keccak/keccak.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
@@ -51,17 +50,3 @@ int scanhash_keccak(int thr_id, struct work *work,
 	return 0;
 }

-void keccak_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
-}
-
-bool register_keccak_algo( algo_gate_t* gate )
-{
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
-  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&keccak_set_target;
-  return true;
-};
-
--- a/algo/keccak/sph_keccak.c
+++ b/algo/keccak/sph_keccak.c
@@ -955,6 +955,7 @@ static const struct {

 #endif

+
 #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
 		DECL64(tt0); \
 		DECL64(tt1); \
@@ -1643,8 +1644,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
 		for (j = 0; j < d; j += 8) \
 			sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \
 		memcpy(dst, u.tmp, d); \
-		keccak_init(kc, (unsigned)d << 3); \
-	} \
+}

 #else

--- a/algo/lbry.c
+++ b/algo/lbry.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"
 #include <stdlib.h>
 #include <stdint.h>
@@ -6,9 +5,7 @@
 #include <stdio.h>
 #include "ripemd/sph_ripemd.h"
 #include "sha/sph_sha2.h"
-#if defined __SHA__
- #include <openssl/sha.h>
-#endif
+#include <openssl/sha.h>

 #define LBRY_NTIME_INDEX 25
 #define LBRY_NBITS_INDEX 26
@@ -19,18 +16,19 @@

 void lbry_hash(void* output, const void* input)
 {
-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_CTX              ctx_sha256 __attribute__ ((aligned (64)));
+   SHA512_CTX              ctx_sha512 __attribute__ ((aligned (64)));
 #else
   sph_sha256_context      ctx_sha256 __attribute__ ((aligned (64)));
-#endif
   sph_sha512_context      ctx_sha512 __attribute__ ((aligned (64)));
+#endif
   sph_ripemd160_context   ctx_ripemd __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) hashA[16];
   uint32_t _ALIGN(64) hashB[16];
   uint32_t _ALIGN(64) hashC[16];

-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, input, 112 );
   SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
@@ -38,6 +36,10 @@ void lbry_hash(void* output, const void* input)
   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, hashA, 32 );
   SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
+
+   SHA512_Init( &ctx_sha512 );
+   SHA512_Update( &ctx_sha512, hashA, 32 );
+   SHA512_Final( (unsigned char*) hashA, &ctx_sha512 );
 #else
   sph_sha256_init( &ctx_sha256 );
   sph_sha256 ( &ctx_sha256, input, 112 );
@@ -46,11 +48,11 @@ void lbry_hash(void* output, const void* input)
   sph_sha256_init( &ctx_sha256 );
   sph_sha256 ( &ctx_sha256, hashA, 32 );
   sph_sha256_close( &ctx_sha256, hashA );
-#endif

   sph_sha512_init( &ctx_sha512 );
   sph_sha512 ( &ctx_sha512, hashA, 32 );
-   sph_sha512_close( &ctx_sha512, hashA );  
+   sph_sha512_close( &ctx_sha512, hashA );
+#endif

   sph_ripemd160_init( &ctx_ripemd );
   sph_ripemd160 ( &ctx_ripemd, hashA, 32 );
@@ -60,7 +62,7 @@ void lbry_hash(void* output, const void* input)
   sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
   sph_ripemd160_close( &ctx_ripemd, hashC );

-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, hashB, 20 );
   SHA256_Update( &ctx_sha256, hashC, 20 );
@@ -219,7 +221,7 @@ int64_t lbry_get_max64() { return 0x1ffffLL; }

 bool register_lbry_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
  gate->scanhash              = (void*)&scanhash_lbry;
  gate->hash                  = (void*)&lbry_hash;
  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
--- a/algo/luffa/luffa.c
+++ b/algo/luffa/luffa.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -21,8 +21,9 @@
 #define LYRA2_H_

 #include <stdint.h>
+#include "algo/sha/sha3-defs.h"

-typedef unsigned char byte;
+//typedef unsigned char byte;

 //Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
 #define BLOCK_LEN_BLAKE2_SAFE_INT64 8                                   //512 bits (=64 bytes, =8 uint64_t)
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -1,6 +1,5 @@
 #include <memory.h>

-#include "miner.h"
 #include "algo/blake/sph_blake.h"
 #include "algo/groestl/sph_groestl.h"
 #include "algo/skein/sph_skein.h"
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -1,6 +1,5 @@
 #include <memory.h>

-#include "miner.h"
 #include "algo-gate-api.h"

 #include "algo/blake/sph_blake.h"
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -1,5 +1,4 @@
 #include <memory.h>
-#include "miner.h"
 #include "algo-gate-api.h"
 #include "lyra2.h"
 #include "avxdefs.h"
--- a/algo/lyra2/zcoin.c
+++ b/algo/lyra2/zcoin.c
@@ -1,6 +1,5 @@
 #include <memory.h>
 #include <mm_malloc.h>
-#include "miner.h"
 #include "algo-gate-api.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -1,5 +1,4 @@
 #include "cpuminer-config.h"
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <gmp.h>
@@ -14,9 +13,7 @@
 #include "algo/tiger/sph_tiger.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/ripemd/sph_ripemd.h"
-#if defined __SHA__
- #include <openssl/sha.h>
-#endif
+#include <openssl/sha.h>


 #define EPSa DBL_EPSILON
@@ -120,12 +117,13 @@ uint32_t sw2_(int nnounce)
 }

 typedef struct {
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_CTX               sha256;
+    SHA512_CTX               sha512;
 #else
    sph_sha256_context       sha256;
-#endif
    sph_sha512_context       sha512;
+#endif
    sph_keccak512_context    keccak;
    sph_whirlpool_context    whirlpool;
    sph_haval256_5_context   haval;
@@ -137,12 +135,13 @@ m7m_ctx_holder m7m_ctx;

 void init_m7m_ctx()
 {
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_Init( &m7m_ctx.sha256 );
+    SHA512_Init( &m7m_ctx.sha512 );
 #else
    sph_sha256_init( &m7m_ctx.sha256 );
-#endif
    sph_sha512_init( &m7m_ctx.sha512 );
+#endif
    sph_keccak512_init( &m7m_ctx.keccak );
    sph_whirlpool_init( &m7m_ctx.whirlpool );
    sph_haval256_5_init( &m7m_ctx.haval );
@@ -177,7 +176,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

    m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
    memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_CTX         ctxf_sha256;
 #else
    sph_sha256_context ctxf_sha256;
@@ -185,18 +184,20 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

    memcpy(data, pdata, 80);

-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_Update(  &ctx1.sha256,    data, M7_MIDSTATE_LEN );
+    SHA512_Update(  &ctx1.sha512,    data, M7_MIDSTATE_LEN );
 #else
    sph_sha256(     &ctx1.sha256,    data, M7_MIDSTATE_LEN );
-#endif
    sph_sha512(     &ctx1.sha512,    data, M7_MIDSTATE_LEN );
+#endif
    sph_keccak512(  &ctx1.keccak,    data, M7_MIDSTATE_LEN );
    sph_whirlpool(  &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
    sph_haval256_5( &ctx1.haval,     data, M7_MIDSTATE_LEN );
    sph_tiger(      &ctx1.tiger,     data, M7_MIDSTATE_LEN );
    sph_ripemd160(  &ctx1.ripemd,    data, M7_MIDSTATE_LEN );

+// the following calculations can be performed once and the results shared
    mpz_t magipi, magisw, product, bns0, bns1;
    mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
    
@@ -221,16 +222,22 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

        memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );

-#if defined __SHA__
+// with 4 way can a single midstate be shared among lanes?
+// do single round of midstate and interleave for final
+
+#ifndef USE_SPH_SHA
        SHA256_Update(  &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
        SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 );
+
+        SHA512_Update(  &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
+        SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
 #else
        sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_sha256_close( &ctx2.sha256, (void*)(bhash[0]) );
-#endif
+
        sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_sha512_close( &ctx2.sha512, (void*)(bhash[1]) );
-
+#endif
        sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );

@@ -246,6 +253,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );

+// 4 way serial
 	mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
        mpz_set(bns1, bns0);
 	mpz_set(product, bns0);
@@ -261,7 +269,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        bytes = mpz_sizeinbase(product, 256);
        mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);

-#if defined __SHA__
+#ifndef USE_SPH_SHA
        SHA256_Init( &ctxf_sha256 );
        SHA256_Update(  &ctxf_sha256, bdata, bytes );
        SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
@@ -271,6 +279,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_sha256_close( &ctxf_sha256, (void*)(hash) );
 #endif

+// do once and share
        digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
        mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
 	mpf_set_prec_raw(magifpi, prec);
@@ -293,7 +302,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
 	    mpz_set_f(magipi, magifpi);
            mpz_add(magipi,magipi,magisw);
            mpz_add(product,product,magipi);
-			
+// share magipi, product and do serial			
 	    mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
            mpz_add(bns1, bns1, bns0);
            mpz_mul(product,product,bns1);
@@ -303,7 +312,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
            mpzscale=bytes;
            mpz_export(bdata, NULL, -1, 1, 0, 0, product);

-#if defined __SHA__
+#ifndef USE_SPH_SHA
            SHA256_Init( &ctxf_sha256 );
            SHA256_Update(  &ctxf_sha256, bdata, bytes );
            SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
@@ -314,6 +323,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
 #endif
 	}

+// this is the scanhash part
 	const unsigned char *hash_ = (const unsigned char *)hash;
 	const unsigned char *target_ = (const unsigned char *)ptarget;
 	for ( i = 31; i >= 0; i-- )
@@ -343,6 +353,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

     pdata[19] = n;

+// do this in hashm7m
 out:
     mpf_set_prec_raw(magifpi, prec0);
     mpf_set_prec_raw(magifpi0, prec0);
@@ -361,21 +372,17 @@ out:
    return rc;
 }

-void m7m_reverse_endian( struct work *work )
-{
-   swab32_array( work->data, work->data, 20 );
-}
-
 bool register_m7m_algo( algo_gate_t *gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
  init_m7m_ctx();
  gate->scanhash              = (void*)scanhash_m7m_hash;
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->set_target            = (void*)&scrypt_set_target;
  gate->get_max64             = (void*)&get_max64_0x1ffff;
-  gate->set_work_data_endian  = (void*)&m7m_reverse_endian;
-  gate->work_data_size        = 80;
+  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
  return true;
 }

--- a/algo/neoscrypt.c
+++ b/algo/neoscrypt.c
@@ -31,7 +31,6 @@
 #include <string.h>
 #include <unistd.h>

-#include "miner.h"
 #include "algo-gate-api.h"

 #define USE_CUSTOM_BLAKE2S
@@ -1089,7 +1088,9 @@ bool register_neoscrypt_algo( algo_gate_t* gate )
  gate->set_target            = (void*)&scrypt_set_target;
  gate->wait_for_diff         = (void*)&neoscrypt_wait_for_diff;
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
-  gate->set_work_data_endian  = (void*)&swab_work_data;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
+  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
  gate->work_data_size        = 80;
  return true;
 };
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -0,0 +1,178 @@
+#include "nist5-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(NIST5_4WAY)
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+// no improvement with midstate
+//static __thread blake512_4way_context ctx_mid;
+
+// NIST5 for 4 lanes: blake512 -> groestl512 -> jh512 -> keccak512 -> skein512.
+// input: 4x64 interleaved, 80 bytes/lane; output: lane i's 32 bytes at i*32.
+void nist5hash_4way( void *output, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake512_4way_context  ctx_blake;
+     hashState_groestl      ctx_groestl;
+     jh512_4way_context     ctx_jh;
+     skein512_4way_context  ctx_skein;
+     keccak512_4way_context ctx_keccak;
+     char *out = (char*)output;   // ISO C has no arithmetic on void*
+
+     blake512_4way_init( &ctx_blake );
+     blake512_4way( &ctx_blake, input, 80 );
+     blake512_4way_close( &ctx_blake, vhash );
+
+     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
+     // groestl is run on one deinterleaved lane at a time
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash0,
+                               (const char*)hash0, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash1,
+                               (const char*)hash1, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash2,
+                               (const char*)hash2, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash3,
+                               (const char*)hash3, 512 );
+
+     m256_interleave_4x64x( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     jh512_4way_init( &ctx_jh );
+     jh512_4way( &ctx_jh, vhash, 64 );
+     jh512_4way_close( &ctx_jh, vhash );
+
+     keccak512_4way_init( &ctx_keccak );
+     keccak512_4way( &ctx_keccak, vhash, 64 );
+     keccak512_4way_close( &ctx_keccak, vhash );
+
+     skein512_4way_init( &ctx_skein );
+     skein512_4way( &ctx_skein, vhash, 64 );
+     skein512_4way_close( &ctx_skein, vhash );
+
+     m256_deinterleave_4x64x( hash0, hash1, hash2, hash3, vhash, 512 );
+     // only the first 32 bytes of each lane's hash are returned
+     memcpy( out,          hash0, 32 );
+     memcpy( out + 32,     hash1, 32 );
+     memcpy( out + 64,     hash2, 32 );
+     memcpy( out + 96,     hash3, 32 );
+}
+
+// Scan nonces four at a time for a NIST5 hash that meets work->target.
+//
+// thr_id      - index into work_restart[], polled for an abort request
+// work        - data[19] is the first nonce to try; hits are reported in
+//               nfound[lane] / nonces[lane], where lane i tests nonce n+i
+// max_nonce   - scanning stops once the next batch start reaches this value
+// hashes_done - receives the number of hashes computed by this call
+//
+// Returns the number of lanes whose hash met the target (0 if none).
+// On return work->data[19] holds the first nonce of the last batch hashed.
+int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t Htarg = ptarget[7];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     uint32_t mask = 0;
+
+     // nonce word of lanes 0..3 inside the interleaved block
+     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+
+     // If the target's top word is <= htmax[m], a hash can only qualify when
+     // (hash[7] & masks[m]) == 0.
+     uint64_t htmax[] = {          0,
+                                 0xF,
+                                0xFF,
+                               0xFFF,
+                              0xFFFF,
+                          0x10000000 };
+
+     uint32_t masks[] = { 0xFFFFFFFF,
+                          0xFFFFFFF0,
+                          0xFFFFFF00,
+                          0xFFFFF000,
+                          0xFFFF0000,
+                                   0 };
+
+     // we need bigendian data...
+     swab32_array( endiandata, pdata, 20 );
+
+     // interleave four copies of the header (640 bits each)
+     uint64_t *edata = (uint64_t*)endiandata;
+     m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+     // Choose the pre-filter mask for hash[7] from the target's top word.
+     // A target above the last threshold keeps mask 0 (every hash goes to
+     // fulltest) rather than skipping the scan altogether.
+     for ( int m = 0; m < 6; m++ )
+     {
+        if ( Htarg <= htmax[m] )
+        {
+           mask = masks[m];
+           break;
+        }
+     }
+
+     do {
+        found[0] = found[1] = found[2] = found[3] = false;
+        // give each lane its own nonce
+        be32enc( noncep0, n   );
+        be32enc( noncep1, n+1 );
+        be32enc( noncep2, n+2 );
+        be32enc( noncep3, n+3 );
+
+        nist5hash_4way( hash, vdata );
+
+        // record the first nonce of this batch in the work data
+        pdata[19] = n;
+
+        // hash holds 8 words per lane; lane i was hashed with nonce n+i
+        for ( int lane = 0; lane < 4; lane++ )
+        {
+           uint32_t *lane_hash = hash + 8*lane;
+
+           if ( !( lane_hash[7] & mask ) && fulltest( lane_hash, ptarget ) )
+           {
+              found[lane] = true;
+              num_found++;
+              nonces[lane] = n + lane;
+              work_set_target_ratio( work, lane_hash );
+           }
+        }
+
+        n += 4;
+     } while ( ( num_found == 0 ) && ( n < max_nonce )
+               && !work_restart[thr_id].restart );
+
+     // n advanced by 4 per batch, so this is exactly the hashes computed
+     *hashes_done = n - first_nonce;
+     return num_found;
+}
+
+#endif
--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -0,0 +1,17 @@
+#include "nist5-gate.h"
+// Install the NIST5 handlers in the gate: the 4-way pair when NIST5_4WAY is
+// defined, otherwise the scalar pair (after init_nist5_ctx()). Returns true.
+bool register_nist5_algo( algo_gate_t* gate )
+{
+    gate->optimizations = SSE2_OPT | AES_OPT;
+#if defined (NIST5_4WAY)
+    gate->optimizations |= AVX2_OPT;
+    gate->scanhash = (void*)&scanhash_nist5_4way;
+    gate->hash     = (void*)&nist5hash_4way;
+#else
+    init_nist5_ctx();
+    gate->scanhash = (void*)&scanhash_nist5;
+    gate->hash     = (void*)&nist5hash;
+#endif
+    return true;
+}
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -0,0 +1,26 @@
+#ifndef __NIST5_GATE_H__
+#define __NIST5_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+// The 4-way path needs FOUR_WAY, AVX2 and AES-NI (it uses the aes_ni groestl).
+#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+  #define NIST5_4WAY
+#endif
+
+#if defined(NIST5_4WAY)
+// 4-way entry points (nist5-4way.c)
+void nist5hash_4way( void *state, const void *input );
+
+int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#else
+// Single-nonce entry points used by register_nist5_algo(); init_nist5_ctx is presumably defined in nist5.c - confirm.
+void init_nist5_ctx();
+void nist5hash( void *state, const void *input );
+int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+#endif
+
+#endif
--- a/algo/nist5/nist5.c
+++ b/algo/nist5/nist5.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "nist5-gate.h"

 #include <stdlib.h>
 #include <stdint.h>
@@ -148,7 +147,7 @@ int scanhash_nist5(int thr_id, struct work *work,
 	pdata[19] = n;
 	return 0;
 }
-
+/*
 bool register_nist5_algo( algo_gate_t* gate )
 {
    gate->optimizations = SSE2_OPT | AES_OPT;
@@ -157,4 +156,4 @@ bool register_nist5_algo( algo_gate_t* gate )
    gate->hash     = (void*)&nist5hash;
    return true;
 };
-
+*/
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -25,7 +25,6 @@
 */

 #include "cpuminer-config.h"
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
--- a/algo/polytimos/polytimos-gate.c
+++ b/algo/polytimos/polytimos-gate.c
@@ -0,0 +1,12 @@
+#include "polytimos-gate.h"
+// Register polytimos: initialise the shared context and install the scanhash,
+// hash and get_max64 handlers in the gate. Always returns true.
+bool register_polytimos_algo( algo_gate_t* gate )
+{
+  init_polytimos_context();
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->hash      = (void*)&polytimos_hash;
+  gate->scanhash  = (void*)&scanhash_polytimos;
+  gate->get_max64 = (void*)&get_max64_0x3ffff;
+  return true;
+}
--- a/algo/polytimos/polytimos-gate.h
+++ b/algo/polytimos/polytimos-gate.h
@@ -0,0 +1,12 @@
+#ifndef __POLYTIMOS_GATE_H__
+#define __POLYTIMOS_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+// Polytimos entry points: defined in polytimos.c, installed by polytimos-gate.c.
+void polytimos_hash( void *state, const void *input );
+int scanhash_polytimos( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+void init_polytimos_context();   // fills the context template that polytimos_hash() copies
+
+#endif
--- a/algo/polytimos/polytimos.c
+++ b/algo/polytimos/polytimos.c
@@ -0,0 +1,115 @@
+#include "polytimos-gate.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "algo/skein/sph_skein.h"
+#include "algo/echo/sph_echo.h"
+#include "algo/fugue//sph_fugue.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/shabal/sph_shabal.h"
+#include "algo/gost/sph_gost.h"
+#ifndef NO_AES_NI
+  #include "algo/echo/aes_ni/hash_api.h"
+#endif
+// One hashing context per stage of the polytimos chain.
+typedef struct {
+	sph_skein512_context    skein;
+        sph_shabal512_context   shabal;
+#ifdef NO_AES_NI
+	sph_echo512_context		echo;
+#else
+        hashState_echo          echo;
+#endif
+        hashState_luffa         luffa;
+	sph_fugue512_context    fugue;
+	sph_gost512_context     gost;
+} poly_ctx_holder;
+// Template initialised by init_polytimos_context(); polytimos_hash() copies it.
+poly_ctx_holder poly_ctx;
+// Initialise every stage context of the global poly_ctx template.
+void init_polytimos_context()
+{
+        sph_skein512_init( &poly_ctx.skein );
+        sph_shabal512_init( &poly_ctx.shabal );
+#ifndef NO_AES_NI
+        init_echo( &poly_ctx.echo, 512 );
+#else
+        sph_echo512_init( &poly_ctx.echo );
+#endif
+        init_luffa( &poly_ctx.luffa, 512 );
+        sph_fugue512_init( &poly_ctx.fugue );
+        sph_gost512_init( &poly_ctx.gost );
+}
+// polytimos hash of an 80-byte input: skein512 -> shabal512 -> echo512 ->
+// luffa512 -> fugue512 -> gost512; the first 32 bytes are copied to output.
+void polytimos_hash(void *output, const void *input)
+{
+        uint32_t state[16] __attribute__ ((aligned (64)));
+        poly_ctx_holder stage __attribute__ ((aligned (64)));
+        // start every hash from the pre-initialised contexts
+        memcpy( &stage, &poly_ctx, sizeof stage );
+        sph_skein512( &stage.skein, input, 80 );
+        sph_skein512_close( &stage.skein, state );
+
+        sph_shabal512( &stage.shabal, state, 64 );
+        sph_shabal512_close( &stage.shabal, state );
+
+#ifndef NO_AES_NI
+        update_final_echo( &stage.echo, (BitSequence *)state,
+                           (const BitSequence *)state, 512 );
+#else
+        sph_echo512( &stage.echo, state, 64 );
+        sph_echo512_close( &stage.echo, state );
+#endif
+
+        update_and_final_luffa( &stage.luffa, (BitSequence*)state,
+                                (const BitSequence*)state, 64 );
+
+        sph_fugue512( &stage.fugue, state, 64 );
+        sph_fugue512_close( &stage.fugue, state );
+
+        sph_gost512( &stage.gost, state, 64 );
+        sph_gost512_close( &stage.gost, state );
+        memcpy( output, state, 32 );
+}
+// Scan nonces from work->data[19] for a polytimos hash meeting work->target.
+// Returns 1 (nonce stored in data[19]) or 0; *hashes_done = hashes computed.
+int scanhash_polytimos(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+{
+	uint32_t _ALIGN(128) hash[8];
+	uint32_t _ALIGN(128) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	uint32_t nonce = first_nonce;
+	volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+	if (opt_benchmark)
+		ptarget[7] = 0x0cff;
+	// read after the benchmark override so the pre-filter uses the live target
+	const uint32_t Htarg = ptarget[7];
+
+	// we need bigendian data...
+	for (int i=0; i < 19; i++) {
+		be32enc(&endiandata[i], pdata[i]);
+	}
+	do {
+		be32enc(&endiandata[19], nonce);
+		polytimos_hash(hash, endiandata);
+
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+			work_set_target_ratio(work, hash);
+			pdata[19] = nonce;
+			*hashes_done = nonce - first_nonce + 1;   // this nonce was hashed too
+			return 1;
+		}
+		nonce++;
+	} while (nonce < max_nonce && !(*restart));
+
+	pdata[19] = nonce;
+	*hashes_done = nonce - first_nonce;   // nonce itself was not hashed
+	return 0;
+}
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -1,5 +1,4 @@
 #include "cpuminer-config.h"
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
--- a/algo/s3.c
+++ b/algo/s3.c
@@ -1,116 +0,0 @@
-#include "miner.h"
-#include "algo-gate-api.h"
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "algo/skein/sph_skein.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-
-void s3hash(void *output, const void *input)
-{
-
- 	sph_shavite512_context ctx_shavite;
-	sph_simd512_context ctx_simd;
-	sph_skein512_context ctx_skein;
-
-	unsigned char _ALIGN(128) hash[64];
-
-	sph_shavite512_init(&ctx_shavite);
-	sph_shavite512(&ctx_shavite, input, 80);
-	sph_shavite512_close(&ctx_shavite, (void*)hash);
-
-	sph_simd512_init(&ctx_simd);
-	sph_simd512(&ctx_simd, (const void*)hash, 64);
-	sph_simd512_close(&ctx_simd, (void*)hash);
-
-	sph_skein512_init(&ctx_skein);
-	sph_skein512(&ctx_skein, (const void*)hash, 64);
-	sph_skein512_close(&ctx_skein, (void*)hash);
-
-	memcpy(output, hash, 32);
-
-}
-
-int scanhash_s3(int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done)
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-	uint32_t n = pdata[19] - 1;
-	const uint32_t first_nonce = pdata[19];
-	const uint32_t Htarg = ptarget[7];
-
-	uint32_t _ALIGN(32) hash64[8];
-	uint32_t endiandata[32];
-
-
-	uint64_t htmax[] = {
-		0,
-		0xF,
-		0xFF,
-		0xFFF,
-		0xFFFF,
-		0x10000000
-	};
-	uint32_t masks[] = {
-		0xFFFFFFFF,
-		0xFFFFFFF0,
-		0xFFFFFF00,
-		0xFFFFF000,
-		0xFFFF0000,
-		0
-	};
-
-	// we need bigendian data...
-	for (int kk=0; kk < 32; kk++) {
-		be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
-	};
-#ifdef DEBUG_ALGO
-	printf("[%d] Htarg=%X\n", thr_id, Htarg);
-#endif
-	for (int m=0; m < 6; m++) {
-		if (Htarg <= htmax[m]) {
-			uint32_t mask = masks[m];
-			do {
-				pdata[19] = ++n;
-				be32enc(&endiandata[19], n);
-				s3hash(hash64, endiandata);
-#ifndef DEBUG_ALGO
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
-					*hashes_done = n - first_nonce + 1;
-					return true;
-				}
-#else
-				if (!(n % 0x1000) && !thr_id) printf(".");
-				if (!(hash64[7] & mask)) {
-					printf("[%d]",thr_id);
-					if (fulltest(hash64, ptarget)) {
-						*hashes_done = n - first_nonce + 1;
-						return true;
-					}
-				}
-#endif
-			} while (n < max_nonce && !work_restart[thr_id].restart);
-			// see blake.c if else to understand the loop on htmax => mask
-			break;
-		}
-	}
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-	return 0;
-}
-
-bool register_s3_algo( algo_gate_t* gate )
-{
-    algo_not_tested();
-    gate->scanhash = (void*)&scanhash_s3;
-    gate->hash     = (void*)&s3hash;
-//  gate->get_max64 = &s3_get_max64;
-    return true;
-};
-
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -27,7 +27,6 @@
 * online backup system.
 */

-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
@@ -780,7 +779,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
 {
  gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
  gate->scanhash         = (void*)&scanhash_scrypt;
-  gate->hash             = (void*)&scrypt_1024_1_1_256_24way;
+//  gate->hash             = (void*)&scrypt_1024_1_1_256_24way;
  gate->set_target       = (void*)&scrypt_set_target;
  gate->get_max64        = (void*)&scrypt_get_max64;

--- a/algo/scryptjane/scrypt-jane.c
+++ b/algo/scryptjane/scrypt-jane.c
@@ -1,5 +1,3 @@
-#include "miner.h"
-
 #include <stdlib.h>
 #include <string.h>
 #include "inttypes.h"
--- a/algo/groestl/aes_ni/brg_types.h
+++ b/algo/groestl/aes_ni/brg_types.h
--- a/algo/sha/sha2.c
+++ b/algo/sha/sha2.c
@@ -8,7 +8,6 @@
 * any later version.  See COPYING for more details.
 */

-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -1,16 +1,13 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
 #include "sph_sha2.h"
+#include <openssl/sha.h>

-#if defined __SHA__
- #include <openssl/sha.h>
-
+#ifndef USE_SPH_SHA
 static SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
 static __thread SHA256_CTX sha256t_mid  __attribute__ ((aligned (64)));
 #else
@@ -21,7 +18,7 @@
 void sha256t_midstate( const void* input )
 {
    memcpy( &sha256t_mid, &sha256t_ctx, sizeof sha256t_mid );
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_Update( &sha256t_mid, input, 64 );
 #else
    sph_sha256( &sha256t_mid, input, 64 );
@@ -34,7 +31,7 @@ void sha256t_hash(void* output, const void* input,  uint32_t len)
        const int midlen = 64;            // bytes
        const int tail   = 80 - midlen;   // 16

-#if defined __SHA__
+#ifndef USE_SPH_SHA 
        SHA256_CTX ctx_sha256 __attribute__ ((aligned (64)));
        memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );

@@ -150,12 +147,12 @@ void sha256t_set_target( struct work* work, double job_diff )

 bool register_sha256t_algo( algo_gate_t* gate )
 {
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_Init( &sha256t_ctx );
 #else
    sph_sha256_init( &sha256t_ctx );
 #endif
-    gate->optimizations = SSE2_OPT | SHA_OPT;
+    gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t;
    gate->hash       = (void*)&sha256t_hash;
    gate->set_target = (void*)&sha256t_set_target;
--- a/algo/simd/sse2/nist.h
+++ b/algo/simd/sse2/nist.h
@@ -8,28 +8,12 @@
 #define DATA_ALIGN(x) __declspec(align(16)) x
 #endif

-#include "compat.h"
+#include "simd-compat.h"
 #include "algo/sha/sha3-defs.h"
 /*
 * NIST API Specific types.
 */

-//typedef unsigned char BitSequence;
-
-//#ifdef HAS_64
- // typedef u64 DataLength;
-//#else
- // typedef unsigned long DataLength;
-//#endif
-
-// can't find u32 or fft-t
-#include <stdint.h>
-typedef uint32_t u32;
-typedef int fft_t;
-
-
-//typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
-
 typedef struct {
  unsigned int hashbitlen;
  unsigned int blocksize;
--- a/algo/simd/sse2/simd-compat.h
+++ b/algo/simd/sse2/simd-compat.h
@@ -1,5 +1,5 @@
-#ifndef __COMPAT_H__
-#define __COMPAT_H__
+#ifndef __SIMD_COMPAT_H__
+#define __SIMD_COMPAT_H__

 #include <limits.h>

@@ -24,14 +24,7 @@
 */

 #include <stdint.h>
-
-#ifdef UINT32_MAX
-typedef uint32_t u32;
-#else
-typedef uint_fast32_t u32;
-#endif
-
-typedef unsigned long long u64;
+#include "algo/sha/brg_types.h"

 #define C32(x)    ((u32)(x))

--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -0,0 +1,120 @@
+#include "skein-gate.h"
+#include <string.h>
+#include <stdint.h>
+#include <openssl/sha.h>
+#include "skein-hash-4way.h"
+
+#if defined (__AVX2__)
+// 4-way skein hash: skein512 followed by SHA-256, for four lanes at once.
+//
+// input - 4x64-interleaved data, 80 bytes per lane
+// state - receives 4 x 32 bytes; lane i's digest is written at offset 32*i
+//
+// skein512 runs on the interleaved data; SHA-256 is then applied to each
+// deinterleaved lane in turn.
+void skeinhash_4way( void *state, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     uint64_t *lanes[4] = { hash0, hash1, hash2, hash3 };
+     unsigned char *out = (unsigned char*)state;
+
+     skein512_4way_context ctx_skein;
+     SHA256_CTX            ctx_sha256;
+
+     skein512_4way_init( &ctx_skein );
+     skein512_4way( &ctx_skein, input, 80 );
+     skein512_4way_close( &ctx_skein, vhash );
+
+     // split the interleaved skein output into one 64-byte hash per lane
+     m256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     for ( int i = 0; i < 4; i++ )
+     {
+        unsigned char *lane = (unsigned char*)lanes[i];
+
+        // SHA-256 the lane in place, then copy the digest out
+        SHA256_Init( &ctx_sha256 );
+        SHA256_Update( &ctx_sha256, lane, 64 );
+        SHA256_Final( lane, &ctx_sha256 );
+        memcpy( out + 32*i, lane, 32 );
+     }
+}
+// Scan nonces four at a time for a skein hash that meets work->target.
+//
+// thr_id      - index into work_restart[], polled for an abort request
+// work        - data[19] is the first nonce to try; hits are reported in
+//               nfound[lane] / nonces[lane], where lane i tests nonce n+i
+// max_nonce   - scanning stops once the next batch start reaches this value
+// hashes_done - receives the number of hashes computed by this call
+//
+// Returns the number of lanes whose hash met the target (0 if none).
+// On return work->data[19] holds the first nonce of the last batch hashed.
+int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done )
+{
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[20] __attribute__ ((aligned (64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    uint64_t *edata = (uint64_t*)endiandata;
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;
+    // hash is returned deinterleaved
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    int num_found = 0;
+
+    // data is 80 bytes, 20 u32 or 10 u64.
+
+    swab32_array( endiandata, pdata, 20 );
+
+    m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+    uint32_t *noncep1 = vdata + 75;
+    uint32_t *noncep2 = vdata + 77;
+    uint32_t *noncep3 = vdata + 79;
+
+    do
+    {
+        found[0] = found[1] = found[2] = found[3] = false;
+        // give each lane its own nonce
+        be32enc( noncep0, n   );
+        be32enc( noncep1, n+1 );
+        be32enc( noncep2, n+2 );
+        be32enc( noncep3, n+3 );
+
+        skeinhash_4way( hash, vdata );
+
+        // Always put nonce0 in work data for compatibility with non vectored
+        // algos, on every batch, so the position advances without a lane 0 hit.
+        pdata[19] = n;
+
+        for ( int lane = 0; lane < 4; lane++ )
+        {
+            uint32_t *lane_hash = hash + 8*lane;
+
+            // <= : a top word equal to the target's may still pass fulltest
+            if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+            {
+                found[lane] = true;
+                num_found++;
+                nonces[lane] = n + lane;
+            }
+        }
+
+        n += 4;
+    } while ( (num_found == 0) && (n < max_nonce)
+               && !work_restart[thr_id].restart );
+
+    *hashes_done = n - first_nonce;
+    return num_found;
+}
+
+#endif
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -0,0 +1,21 @@
+#include "skein-gate.h"
+#include "sph_skein.h"
+#include "skein-hash-4way.h"
+// Returns the constant 0x7ffff; installed below as gate->get_max64.
+int64_t skein_get_max64() { return 0x7ffffLL; }
+// Install the skein handlers in the gate: the 4-way pair when SKEIN_4WAY is
+// defined, otherwise the scalar pair; get_max64 is shared. Returns true.
+bool register_skein_algo( algo_gate_t* gate )
+{
+    gate->optimizations = SSE2_OPT | SHA_OPT;
+    gate->get_max64 = (void*)&skein_get_max64;
+#if defined (SKEIN_4WAY)
+    gate->optimizations |= AVX2_OPT;
+    gate->scanhash  = (void*)&scanhash_skein_4way;
+    gate->hash      = (void*)&skeinhash_4way;
+#else
+    gate->scanhash  = (void*)&scanhash_skein;
+    gate->hash      = (void*)&skeinhash;
+#endif
+    return true;
+}
--- a/algo/skein/skein-gate.h
+++ b/algo/skein/skein-gate.h
@@ -0,0 +1,23 @@
+#ifndef __SKEIN_GATE_H__
+#define __SKEIN_GATE_H__
+#include <stdint.h>
+#include "algo-gate-api.h"
+// SKEIN_4WAY is defined when both FOUR_WAY and __AVX2__ are defined.
+#if defined(FOUR_WAY) && defined(__AVX2__)
+  #define SKEIN_4WAY
+#endif
+
+#if defined(SKEIN_4WAY)
+// 4-way entry points (skein-4way.c)
+void skeinhash_4way( void *output, const void *input );
+
+int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+// non-4-way entry points; declared regardless of SKEIN_4WAY
+void skeinhash( void *output, const void *input );
+
+int scanhash_skein( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+#endif
--- a/algo/skein/skein-hash-4way.c
+++ b/algo/skein/skein-hash-4way.c
@@ -0,0 +1,598 @@
+/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */
+/*
+ * Skein implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#if defined (__AVX2__)
+
+#include <stddef.h>
+#include <string.h>
+#include "skein-hash-4way.h"
+
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
+ */
+
+#define M9_0_0    0
+#define M9_0_1    1
+#define M9_0_2    2
+#define M9_0_3    3
+#define M9_0_4    4
+#define M9_0_5    5
+#define M9_0_6    6
+#define M9_0_7    7
+
+#define M9_1_0    1
+#define M9_1_1    2
+#define M9_1_2    3
+#define M9_1_3    4
+#define M9_1_4    5
+#define M9_1_5    6
+#define M9_1_6    7
+#define M9_1_7    8
+
+#define M9_2_0    2
+#define M9_2_1    3
+#define M9_2_2    4
+#define M9_2_3    5
+#define M9_2_4    6
+#define M9_2_5    7
+#define M9_2_6    8
+#define M9_2_7    0
+
+#define M9_3_0    3
+#define M9_3_1    4
+#define M9_3_2    5
+#define M9_3_3    6
+#define M9_3_4    7
+#define M9_3_5    8
+#define M9_3_6    0
+#define M9_3_7    1
+
+#define M9_4_0    4
+#define M9_4_1    5
+#define M9_4_2    6
+#define M9_4_3    7
+#define M9_4_4    8
+#define M9_4_5    0
+#define M9_4_6    1
+#define M9_4_7    2
+
+#define M9_5_0    5
+#define M9_5_1    6
+#define M9_5_2    7
+#define M9_5_3    8
+#define M9_5_4    0
+#define M9_5_5    1
+#define M9_5_6    2
+#define M9_5_7    3
+
+#define M9_6_0    6
+#define M9_6_1    7
+#define M9_6_2    8
+#define M9_6_3    0
+#define M9_6_4    1
+#define M9_6_5    2
+#define M9_6_6    3
+#define M9_6_7    4
+
+#define M9_7_0    7
+#define M9_7_1    8
+#define M9_7_2    0
+#define M9_7_3    1
+#define M9_7_4    2
+#define M9_7_5    3
+#define M9_7_6    4
+#define M9_7_7    5
+
+#define M9_8_0    8
+#define M9_8_1    0
+#define M9_8_2    1
+#define M9_8_3    2
+#define M9_8_4    3
+#define M9_8_5    4
+#define M9_8_6    5
+#define M9_8_7    6
+
+#define M9_9_0    0
+#define M9_9_1    1
+#define M9_9_2    2
+#define M9_9_3    3
+#define M9_9_4    4
+#define M9_9_5    5
+#define M9_9_6    6
+#define M9_9_7    7
+
+#define M9_10_0   1
+#define M9_10_1   2
+#define M9_10_2   3
+#define M9_10_3   4
+#define M9_10_4   5
+#define M9_10_5   6
+#define M9_10_6   7
+#define M9_10_7   8
+
+#define M9_11_0   2
+#define M9_11_1   3
+#define M9_11_2   4
+#define M9_11_3   5
+#define M9_11_4   6
+#define M9_11_5   7
+#define M9_11_6   8
+#define M9_11_7   0
+
+#define M9_12_0   3
+#define M9_12_1   4
+#define M9_12_2   5
+#define M9_12_3   6
+#define M9_12_4   7
+#define M9_12_5   8
+#define M9_12_6   0
+#define M9_12_7   1
+
+#define M9_13_0   4
+#define M9_13_1   5
+#define M9_13_2   6
+#define M9_13_3   7
+#define M9_13_4   8
+#define M9_13_5   0
+#define M9_13_6   1
+#define M9_13_7   2
+
+#define M9_14_0   5
+#define M9_14_1   6
+#define M9_14_2   7
+#define M9_14_3   8
+#define M9_14_4   0
+#define M9_14_5   1
+#define M9_14_6   2
+#define M9_14_7   3
+
+#define M9_15_0   6
+#define M9_15_1   7
+#define M9_15_2   8
+#define M9_15_3   0
+#define M9_15_4   1
+#define M9_15_5   2
+#define M9_15_6   3
+#define M9_15_7   4
+
+#define M9_16_0   7
+#define M9_16_1   8
+#define M9_16_2   0
+#define M9_16_3   1
+#define M9_16_4   2
+#define M9_16_5   3
+#define M9_16_6   4
+#define M9_16_7   5
+
+#define M9_17_0   8
+#define M9_17_1   0
+#define M9_17_2   1
+#define M9_17_3   2
+#define M9_17_4   3
+#define M9_17_5   4
+#define M9_17_6   5
+#define M9_17_7   6
+
+#define M9_18_0   0
+#define M9_18_1   1
+#define M9_18_2   2
+#define M9_18_3   3
+#define M9_18_4   4
+#define M9_18_5   5
+#define M9_18_6   6
+#define M9_18_7   7
+
+/*
+ * M3_ ## s ## _ ## i  evaluates to (s+i) mod 3 (0 <= s <= 18, 0 <= i <= 1).
+ *
+ * The names are assembled by SKBT() so that, for a literal s and i, the
+ * matching t0/t1/t2 identifier is selected by the preprocessor.
+ */
+
+#define M3_0_0    0
+#define M3_0_1    1
+#define M3_1_0    1
+#define M3_1_1    2
+#define M3_2_0    2
+#define M3_2_1    0
+#define M3_3_0    0
+#define M3_3_1    1
+#define M3_4_0    1
+#define M3_4_1    2
+#define M3_5_0    2
+#define M3_5_1    0
+#define M3_6_0    0
+#define M3_6_1    1
+#define M3_7_0    1
+#define M3_7_1    2
+#define M3_8_0    2
+#define M3_8_1    0
+#define M3_9_0    0
+#define M3_9_1    1
+#define M3_10_0   1
+#define M3_10_1   2
+#define M3_11_0   2
+#define M3_11_1   0
+#define M3_12_0   0
+#define M3_12_1   1
+#define M3_13_0   1
+#define M3_13_1   2
+#define M3_14_0   2
+#define M3_14_1   0
+#define M3_15_0   0
+#define M3_15_1   1
+#define M3_16_0   1
+#define M3_16_1   2
+#define M3_17_0   2
+#define M3_17_1   0
+#define M3_18_0   0
+#define M3_18_1   1
+
+/*
+ * Two-level token pasting: XCAT() expands its arguments first and then
+ * hands them to XCAT_(), which performs the actual ## concatenation.
+ */
+#define XCAT(x, y)     XCAT_(x, y)
+#define XCAT_(x, y)    x ## y
+
+
+/*
+ * SKBI(k, s, i) builds the name M9_<s>_<i>, lets it expand to its numeric
+ * value and pastes that onto k, e.g. SKBI(h, 10, 7) -> h8.
+ * SKBT(t, s, v) does the same with the M3_<s>_<v> table, e.g.
+ * SKBT(t, 2, 1) -> t0.  s, i and v must be literal numbers.
+ */
+#define SKBI(k, s, i)   XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
+#define SKBT(t, s, v)   XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
+
+// AVX2 all scalar vars are now vectors representing 4 nonces in parallel
+
+/*
+ * Compute the derived key and tweak words: k8 receives the XOR of k0..k7
+ * and the constant 0x1BD11BDAA9FC1A22 (replicated in every 64-bit lane),
+ * t2 receives t0 ^ t1.
+ */
+#define TFBIG_KINIT_4WAY( k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2 ) \
+do { \
+  const __m256i kinit_const_ = \
+     _mm256_set1_epi64x( SPH_C64(0x1BD11BDAA9FC1A22) ); \
+  const __m256i kinit_k0123_ = \
+     _mm256_xor_si256( _mm256_xor_si256( k0, k1 ), \
+                       _mm256_xor_si256( k2, k3 ) ); \
+  const __m256i kinit_k4567_ = \
+     _mm256_xor_si256( _mm256_xor_si256( k4, k5 ), \
+                       _mm256_xor_si256( k6, k7 ) ); \
+  k8 = _mm256_xor_si256( _mm256_xor_si256( kinit_k0123_, kinit_k4567_ ), \
+                         kinit_const_ ); \
+  t2 = t0 ^ t1; \
+} while (0)
+
+/*
+ * Add subkey number s to the eight state words w0..w7 (64-bit lane-wise
+ * addition).  Word i receives key word k<(s+i) mod 9>; in addition w5 and
+ * w6 receive the scalar words t<s mod 3> and t<(s+1) mod 3>, and w7
+ * receives s itself, each replicated across the four lanes.
+ * s must be a literal number because it is used for token pasting.
+ */
+#define TFBIG_ADDKEY_4WAY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) \
+do { \
+  w0 = _mm256_add_epi64( w0, SKBI(k,s,0) ); \
+  w1 = _mm256_add_epi64( w1, SKBI(k,s,1) ); \
+  w2 = _mm256_add_epi64( w2, SKBI(k,s,2) ); \
+  w3 = _mm256_add_epi64( w3, SKBI(k,s,3) ); \
+  w4 = _mm256_add_epi64( w4, SKBI(k,s,4) ); \
+  w5 = _mm256_add_epi64( _mm256_add_epi64( w5, SKBI(k,s,5) ), \
+                         _mm256_set1_epi64x( SKBT(t,s,0) ) ); \
+  w6 = _mm256_add_epi64( _mm256_add_epi64( w6, SKBI(k,s,6) ), \
+                         _mm256_set1_epi64x( SKBT(t,s,1) ) ); \
+  w7 = _mm256_add_epi64( _mm256_add_epi64( w7, SKBI(k,s,7) ), \
+                         _mm256_set1_epi64x( s ) ); \
+} while (0)
+
+
+/*
+ * One mix step on a pair of vectors: x0 += x1 (64-bit lanes), then
+ * x1 = rotl64(x1, rc) ^ x0 using the updated x0.
+ */
+#define TFBIG_MIX_4WAY(x0, x1, rc) \
+do { \
+  x0 = _mm256_add_epi64( x0, x1 ); \
+  x1 = _mm256_xor_si256( x0, mm256_rotl_64( x1, rc ) ); \
+} while (0)
+ 
+
+/*
+ * Apply TFBIG_MIX_4WAY to the four word pairs (w0,w1), (w2,w3), (w4,w5)
+ * and (w6,w7) with rotation counts rc0..rc3 respectively.
+ */
+#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) \
+do { \
+  TFBIG_MIX_4WAY(w0, w1, rc0); \
+  TFBIG_MIX_4WAY(w2, w3, rc1); \
+  TFBIG_MIX_4WAY(w4, w5, rc2); \
+  TFBIG_MIX_4WAY(w6, w7, rc3); \
+} while (0)
+
+
+/*
+ * Group of four mix passes used for even values of s: adds subkey s to
+ * p0..p7 (key words h*, scalar words t*), then runs four TFBIG_MIX8
+ * passes with a different word ordering and fixed rotation counts each.
+ * Relies on p0..p7, h0..h8 and t0..t2 being visible where it expands.
+ */
+#define TFBIG_4e(s)   do { \
+		TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44,  9, 54, 56); \
+	} while (0)
+
+/*
+ * Counterpart of TFBIG_4e for odd values of s: same structure (subkey
+ * addition followed by four TFBIG_MIX8 passes) with a different set of
+ * rotation counts.  Relies on p0..p7, h0..h8 and t0..t2 being visible
+ * where it expands.
+ */
+#define TFBIG_4o(s)   do { \
+		TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \
+		TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \
+		TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \
+		TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \
+		TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3,  8, 35, 56, 22); \
+	} while (0)
+
+
+/*
+ * Process the block currently held in buf[0..7] and fold it into h0..h7.
+ *
+ * Expects buf (array of __m256i), bcount and h0..h7 to be visible where
+ * the macro expands.  etype and extra are combined with bcount into the
+ * scalar words t0/t1:
+ *    t0 = SPH_T64(bcount << 6) + extra
+ *    t1 = (bcount >> 58) + (etype << 55)
+ * The block words are copied to p0..p7, run through TFBIG_4e/TFBIG_4o for
+ * s = 0..17 followed by a final subkey addition with s = 18, and the new
+ * h<i> is m<i> XOR p<i>.
+ *
+ * The block is read from buf as ready-made vectors; the explicit
+ * little-endian decoding is left commented out below.
+ */
+#define UBI_BIG_4WAY(etype, extra) \
+do { \
+  sph_u64 t0, t1, t2; \
+  __m256i h8; \
+/* can LE be assumed? \
+   dec64le does nothing when SPH_LITTLE endian is set, as it is. \
+  __m256i m0 = _mm256_dec64le( buf ); \
+  __m256i m1 = _mm256_dec64le( buf +  8*4 ); \
+  __m256i m2 = _mm256_dec64le( buf + 16*4 ); \
+  __m256i m3 = _mm256_dec64le( buf + 24*4 ); \
+  __m256i m4 = _mm256_dec64le( buf + 32*4 ); \
+  __m256i m5 = _mm256_dec64le( buf + 40*4 ); \
+  __m256i m6 = _mm256_dec64le( buf + 48*4 ); \
+  __m256i m7 = _mm256_dec64le( buf + 56*4 ); \
+*/ \
+  __m256i m0 =  buf[0]; \
+  __m256i m1 =  buf[1]; \
+  __m256i m2 =  buf[2]; \
+  __m256i m3 =  buf[3]; \
+  __m256i m4 =  buf[4]; \
+  __m256i m5 =  buf[5]; \
+  __m256i m6 =  buf[6]; \
+  __m256i m7 =  buf[7]; \
+\
+  __m256i p0 = m0; \
+  __m256i p1 = m1; \
+  __m256i p2 = m2; \
+  __m256i p3 = m3; \
+  __m256i p4 = m4; \
+  __m256i p5 = m5; \
+  __m256i p6 = m6; \
+  __m256i p7 = m7; \
+  t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \
+  t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \
+  TFBIG_KINIT_4WAY(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \
+  TFBIG_4e(0); \
+  TFBIG_4o(1); \
+  TFBIG_4e(2); \
+  TFBIG_4o(3); \
+  TFBIG_4e(4); \
+  TFBIG_4o(5); \
+  TFBIG_4e(6); \
+  TFBIG_4o(7); \
+  TFBIG_4e(8); \
+  TFBIG_4o(9); \
+  TFBIG_4e(10); \
+  TFBIG_4o(11); \
+  TFBIG_4e(12); \
+  TFBIG_4o(13); \
+  TFBIG_4e(14); \
+  TFBIG_4o(15); \
+  TFBIG_4e(16); \
+  TFBIG_4o(17); \
+  TFBIG_ADDKEY_4WAY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \
+  h0 = _mm256_xor_si256( m0, p0 );\
+  h1 = _mm256_xor_si256( m1, p1 );\
+  h2 = _mm256_xor_si256( m2, p2 );\
+  h3 = _mm256_xor_si256( m3, p3 );\
+  h4 = _mm256_xor_si256( m4, p4 );\
+  h5 = _mm256_xor_si256( m5, p5 );\
+  h6 = _mm256_xor_si256( m6, p6 );\
+  h7 = _mm256_xor_si256( m7, p7 );\
+} while (0)
+
+
+/*
+ * Declares the local working copy of the hash state (h0..h7 and bcount)
+ * used by READ_STATE_BIG, WRITE_STATE_BIG and UBI_BIG_4WAY.  The expansion
+ * already ends with ';', so it is used without a trailing semicolon.
+ */
+#define DECL_STATE_BIG_4WAY \
+  __m256i h0, h1, h2, h3, h4, h5, h6, h7; \
+  sph_u64 bcount;
+
+/*
+ * Load h0..h7 and bcount from the context pointed to by sc into the
+ * locals declared by DECL_STATE_BIG_4WAY.  Every use of the argument is
+ * parenthesized so that an expression such as (ctx + 1) expands safely.
+ */
+#define READ_STATE_BIG(sc)   do { \
+		h0 = (sc)->h0; \
+		h1 = (sc)->h1; \
+		h2 = (sc)->h2; \
+		h3 = (sc)->h3; \
+		h4 = (sc)->h4; \
+		h5 = (sc)->h5; \
+		h6 = (sc)->h6; \
+		h7 = (sc)->h7; \
+		bcount = (sc)->bcount; \
+	} while (0)
+
+/*
+ * Store the local h0..h7 and bcount back into the context pointed to by
+ * sc.  Every use of the argument is parenthesized so that an expression
+ * such as (ctx + 1) expands safely.
+ */
+#define WRITE_STATE_BIG(sc)   do { \
+		(sc)->h0 = h0; \
+		(sc)->h1 = h1; \
+		(sc)->h2 = h2; \
+		(sc)->h3 = h3; \
+		(sc)->h4 = h4; \
+		(sc)->h5 = h5; \
+		(sc)->h6 = h6; \
+		(sc)->h7 = h7; \
+		(sc)->bcount = bcount; \
+	} while (0)
+
+
+/*
+ * Reset a 4-way context: every chaining word h<i> gets iv[i] replicated
+ * in all four 64-bit lanes, and the block counter and buffer fill level
+ * are cleared.  iv must provide at least eight words.
+ */
+static void
+skein_big_init_4way( skein512_4way_context *sc, const sph_u64 *iv )
+{
+        sc->ptr    = 0;
+        sc->bcount = 0;
+
+        sc->h0 = _mm256_set1_epi64x( iv[0] );
+        sc->h1 = _mm256_set1_epi64x( iv[1] );
+        sc->h2 = _mm256_set1_epi64x( iv[2] );
+        sc->h3 = _mm256_set1_epi64x( iv[3] );
+        sc->h4 = _mm256_set1_epi64x( iv[4] );
+        sc->h5 = _mm256_set1_epi64x( iv[5] );
+        sc->h6 = _mm256_set1_epi64x( iv[6] );
+        sc->h7 = _mm256_set1_epi64x( iv[7] );
+}
+
+/*
+ * Absorb len bytes per lane of 4-way interleaved input.
+ *
+ * data is read as an array of __m256i, one vector per 8 input bytes.
+ * Input is accumulated in sc->buf; a full buffer is only compressed once
+ * more input arrives, so the last block always stays buffered.
+ * Lengths are converted to vector counts with >> 3, so len is presumably
+ * expected to be a multiple of 8 -- TODO confirm against callers.
+ */
+static void
+skein_big_core_4way( skein512_4way_context *sc, const void *data,
+                     size_t len )
+{
+   const int block_bytes = 64;      /* bytes per lane in one block */
+   __m256i *src = (__m256i*)data;
+   __m256i *buf = sc->buf;          /* name is used by UBI_BIG_4WAY */
+   size_t fill = sc->ptr;
+   unsigned first_flag;
+   DECL_STATE_BIG_4WAY
+
+   /* Everything still fits in the buffer: just append. */
+   if ( len <= block_bytes - fill )
+   {
+       memcpy_m256i( buf + (fill>>3), src, len>>3 );
+       sc->ptr = fill + len;
+       return;
+   }
+
+   READ_STATE_BIG( sc );
+   first_flag = ( bcount == 0 ) ? 128 : 0;
+
+   /* len is at least 1 here, so this runs at least once. */
+   while ( len > 0 )
+   {
+       if ( fill == block_bytes )
+       {
+            bcount ++;
+            UBI_BIG_4WAY( 96 + first_flag, 0 );
+            first_flag = 0;
+            fill = 0;
+       }
+
+       size_t chunk = block_bytes - fill;
+       if ( chunk > len )
+            chunk = len;
+
+       memcpy_m256i( buf + (fill>>3), src, chunk>>3 );
+       fill += chunk;
+       src  += (chunk>>3);
+       len  -= chunk;
+   }
+
+   WRITE_STATE_BIG( sc );
+   sc->ptr = fill;
+}
+
+/*
+ * Finish the hash and write out_len bytes per lane (as out_len >> 3
+ * interleaved vectors) to dst.
+ *
+ * The buffered tail is zero-padded to a full block and compressed, then
+ * an all-zero block is compressed with bcount reset to 0 to produce the
+ * output.  The result is staged in sc->buf before being copied to dst;
+ * the chaining state in *sc is not written back.
+ * ub and n are accepted but not used.
+ */
+static void
+skein_big_close_4way( skein512_4way_context *sc, unsigned ub, unsigned n,
+                      void *dst, size_t out_len )
+{
+	const int block_bytes = 64;      /* bytes per lane in one block */
+	__m256i *buf = sc->buf;          /* name is used by UBI_BIG_4WAY */
+	size_t tail = sc->ptr;
+	unsigned final_type;
+	DECL_STATE_BIG_4WAY
+
+	READ_STATE_BIG(sc);
+
+	/* Last message block: pad with zeros and compress. */
+	memset_zero_m256i( buf + (tail>>3), (block_bytes - tail) >> 3 );
+	final_type = ( bcount == 0 ) ? 352 + 128 : 352;
+	UBI_BIG_4WAY( final_type, tail );
+
+	/* Output block: all zeros, counter restarted. */
+	memset_zero_m256i( buf, block_bytes >> 3 );
+	bcount = 0;
+	UBI_BIG_4WAY( 510, 8 );
+
+	buf[0] = h0;
+	buf[1] = h1;
+	buf[2] = h2;
+	buf[3] = h3;
+	buf[4] = h4;
+	buf[5] = h5;
+	buf[6] = h6;
+	buf[7] = h7;
+
+	memcpy_m256i( dst, buf, out_len >> 3 );
+}
+
+/*
+ * Eight initial chaining words handed to skein_big_init_4way() by
+ * skein256_4way_init(); each word is replicated into all four lanes.
+ */
+static const sph_u64 IV256[] = {
+	SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB),
+	SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB),
+	SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251),
+	SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13)
+};
+
+/*
+ * Eight initial chaining words handed to skein_big_init_4way() by
+ * skein512_4way_init(); each word is replicated into all four lanes.
+ */
+static const sph_u64 IV512[] = {
+	SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03),
+	SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1),
+	SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4),
+	SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33)
+};
+
+
+/* Initialize the 4-way context at cc with the IV256 chaining values. */
+void
+skein256_4way_init(void *cc)
+{
+	skein512_4way_context *ctx = cc;
+
+	skein_big_init_4way(ctx, IV256);
+}
+
+/* Feed len bytes per lane of interleaved input into the context at cc. */
+void
+skein256_4way(void *cc, const void *data, size_t len)
+{
+	skein512_4way_context *ctx = cc;
+
+	skein_big_core_4way(ctx, data, len);
+}
+
+/* Finish the hash at cc and write 32 bytes per lane, interleaved, to dst. */
+void
+skein256_4way_close(void *cc, void *dst)
+{
+	skein512_4way_context *ctx = cc;
+
+	skein_big_close_4way(ctx, 0, 0, dst, 32);
+}
+
+/* Initialize the 4-way context at cc with the IV512 chaining values. */
+void
+skein512_4way_init(void *cc)
+{
+	skein512_4way_context *ctx = cc;
+
+	skein_big_init_4way(ctx, IV512);
+}
+
+/* Feed len bytes per lane of interleaved input into the context at cc. */
+void
+skein512_4way(void *cc, const void *data, size_t len)
+{
+	skein512_4way_context *ctx = cc;
+
+	skein_big_core_4way(ctx, data, len);
+}
+
+/* Finish the hash at cc and write 64 bytes per lane, interleaved, to dst. */
+void
+skein512_4way_close(void *cc, void *dst)
+{
+	skein512_4way_context *ctx = cc;
+
+	skein_big_close_4way(ctx, 0, 0, dst, 64);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/skein/skein-hash-4way.h
+++ b/algo/skein/skein-hash-4way.h
@@ -0,0 +1,93 @@
+/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */
+/**
+ * Skein interface. The Skein specification defines three main
+ * functions, called Skein-256, Skein-512 and Skein-1024, which can be
+ * further parameterized with an output length. For the SHA-3
+ * competition, Skein-512 is used for output sizes of 224, 256, 384 and
+ * 512 bits; this is what this code implements. Thus, we hereafter call
+ * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein
+ * specification defines as Skein-512-224, Skein-512-256, Skein-512-384
+ * and Skein-512-512, respectively.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_skein.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef __SKEIN_HASH_4WAY_H__
+#define __SKEIN_HASH_4WAY_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+// Output size in bits
+#define SPH_SIZE_skein256   256
+#define SPH_SIZE_skein512   512
+
+#ifdef __AVX2__
+
+/*
+ * State for hashing four messages in parallel; each __m256i holds one
+ * 64-bit word from each of the four lanes.
+ */
+typedef struct {
+        /* pending input: 8 vectors, i.e. one 64-byte block per lane */
+        __m256i buf[8] __attribute__ ((aligned (32)));
+        /* chaining value words */
+        __m256i h0, h1, h2, h3, h4, h5, h6, h7;
+        /* number of bytes per lane currently held in buf */
+        size_t ptr;
+	/* block counter, shared by all lanes */
+	sph_u64 bcount;
+} skein512_4way_context;
+
+/* cc points to a skein512_4way_context; data and dst are interleaved. */
+void skein512_4way_init(void *cc);
+void skein512_4way(void *cc, const void *data, size_t len);
+void skein512_4way_close(void *cc, void *dst);
+//void sph_skein512_addbits_and_close(
+//        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __AVX2__
+
+/*
+ * The skein256 4-way functions run on exactly the same state as the
+ * skein512 ones (same 256-bit vectors, same buffer size); only the
+ * initial values and the output length differ.  The context therefore
+ * has to be the very same type: a separate struct built from __m128i
+ * would be only half as large as what the implementation writes.
+ * The implementation needs AVX2, hence the AVX2 guard.
+ */
+typedef skein512_4way_context skein256_4way_context;
+
+/* cc points to a skein256_4way_context; data and dst are interleaved. */
+void skein256_4way_init(void *cc);
+void skein256_4way(void *cc, const void *data, size_t len);
+void skein256_4way_close(void *cc, void *dst);
+//void sph_skein256_addbits_and_close(
+//	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/algo/skein/skein.c
+++ b/algo/skein/skein.c
@@ -1,53 +1,32 @@
-#include "miner.h"
 #include "algo-gate-api.h"
-
 #include <string.h>
 #include <stdint.h>
-
 #include "sph_skein.h"
-
-#if defined __SHA__
 #include <openssl/sha.h>
-#else
 #include "algo/sha/sph_sha2.h"
-#endif
-
-typedef struct {
-   sph_skein512_context skein;
-#if defined __SHA__
-   SHA256_CTX         sha256;
-#else
-   sph_sha256_context sha256;
-#endif
-} skein_ctx_holder;
-
-skein_ctx_holder skein_ctx;
-
-void init_skein_ctx()
-{
-   sph_skein512_init( &skein_ctx.skein );
-#if defined __SHA__
-   SHA256_Init( &skein_ctx.sha256 );
-#else
-   sph_sha256_init( &skein_ctx.sha256 );
-#endif
-}

 void skeinhash(void *state, const void *input)
 {
-     skein_ctx_holder ctx __attribute__ ((aligned (64)));
-     memcpy( &ctx, &skein_ctx, sizeof(skein_ctx) );
     uint32_t hash[16] __attribute__ ((aligned (64)));
-	
-     sph_skein512( &ctx.skein, input, 80 );
-     sph_skein512_close( &ctx.skein, hash );
-
-#if defined __SHA__
-     SHA256_Update( &ctx.sha256, hash, 64 );
-     SHA256_Final( (unsigned char*) hash, &ctx.sha256 );
+     sph_skein512_context ctx_skein;
+#ifndef USE_SPH_SHA
+     SHA256_CTX           ctx_sha256;
 #else
-     sph_sha256( &ctx.sha256, hash, 64 );
-     sph_sha256_close( &ctx.sha256, hash );
+     sph_sha256_context   ctx_sha256;
+#endif
+
+     sph_skein512_init( &ctx_skein );
+     sph_skein512( &ctx_skein, input, 80 );
+     sph_skein512_close( &ctx_skein, hash );
+
+#ifndef USE_SPH_SHA
+     SHA256_Init( &ctx_sha256 );
+     SHA256_Update( &ctx_sha256, (unsigned char*)hash, 64 );
+     SHA256_Final( (unsigned char*) hash, &ctx_sha256 );
+#else
+     sph_sha256_init( &ctx_sha256 );
+     sph_sha256( &ctx_sha256, hash, 64 );
+     sph_sha256_close( &ctx_sha256, hash );
 #endif

     memcpy(state, hash, 32);
@@ -84,15 +63,3 @@ int scanhash_skein(int thr_id, struct work *work,
 	return 0;
 }

-int64_t skein_get_max64() { return 0x7ffffLL; }
-
-bool register_skein_algo( algo_gate_t* gate )
-{
-    init_skein_ctx();
-    gate->optimizations = SSE2_OPT | SHA_OPT;
-    gate->scanhash  = (void*)&scanhash_skein;
-    gate->hash      = (void*)&skeinhash;
-    gate->get_max64 = (void*)&skein_get_max64;
-    return true;
-};
-
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -0,0 +1,93 @@
+#include "skein2-gate.h"
+#include <string.h>
+#include <stdint.h>
+#include "skein-hash-4way.h"
+
+#if defined(SKEIN2_4WAY)
+
+/*
+ * Double Skein-512 over four interleaved 80-byte inputs.
+ *
+ * input  : 4x64-bit interleaved data, 80 bytes per lane.
+ * output : four consecutive 32-byte results, one per lane (the first
+ *          256 bits of each lane's second hash).
+ */
+void skein2hash_4way( void *output, const void *input )
+{
+   uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+   uint64_t *lane0 = (uint64_t*)output;
+   skein512_4way_context ctx;
+
+   /* first pass: hash the 80-byte inputs */
+   skein512_4way_init( &ctx );
+   skein512_4way( &ctx, input, 80 );
+   skein512_4way_close( &ctx, vhash );
+
+   /* second pass: hash the 64-byte results of the first pass */
+   skein512_4way_init( &ctx );
+   skein512_4way( &ctx, vhash, 64 );
+   skein512_4way_close( &ctx, vhash );
+
+   /* split the lanes, keeping 256 bits of each */
+   m256_deinterleave_4x64( lane0, lane0+4, lane0+8, lane0+12, vhash, 256 );
+}
+
+/*
+ * Scan nonces four at a time with the 4-way skein2 hash.
+ *
+ * The block header is byte-swapped, replicated into four interleaved
+ * lanes, and each iteration writes nonces n..n+3 into the lanes before
+ * hashing.  For every lane whose hash meets the target, work->nfound[lane]
+ * is set and the nonce is stored in work->nonces[lane].
+ *
+ * Returns the number of lanes that produced a valid hash in the last
+ * iteration; *hashes_done receives the number of nonces covered.
+ */
+int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[20] __attribute__ ((aligned (64)));
+    uint64_t *edata = (uint64_t*)endiandata;
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;
+    // hash is returned deinterleaved
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    int num_found = 0;
+
+    swab32_array( endiandata, pdata, 20 );
+
+    m256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+    // The nonce is the upper half of 64-bit word 9 of each lane:
+    // 32-bit index 9*8 + 2*lane + 1.
+    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+    uint32_t *noncep1 = vdata + 75;
+    uint32_t *noncep2 = vdata + 77;
+    uint32_t *noncep3 = vdata + 79;
+
+    do
+    {
+       found[0] = found[1] = found[2] = found[3] = false;
+       be32enc( noncep0, n   );
+       be32enc( noncep1, n+1 );
+       be32enc( noncep2, n+2 );
+       be32enc( noncep3, n+3 );
+
+       // vdata is interleaved, so the 4-way hash must be used here; the
+       // scalar skein2hash would fill only the first lane of hash.
+       skein2hash_4way( hash, vdata );
+
+       for ( int lane = 0; lane < 4; lane++ )
+       {
+           uint32_t *lane_hash = hash + 8*lane;
+
+           // <= so that a hash whose top word equals the target's is
+           // still passed on to the full comparison.
+           if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+           {
+               found[lane] = true;
+               num_found++;
+               nonces[lane] = n + lane;
+           }
+       }
+       n += 4;
+    } while ( (num_found == 0) && (n < max_nonce)
+             &&  !work_restart[thr_id].restart );
+
+    *hashes_done = n - first_nonce + 1;
+    return num_found;
+}
+
+#endif
--- a/algo/skein/skein2-gate.c
+++ b/algo/skein/skein2-gate.c
@@ -0,0 +1,24 @@
+#include "skein2-gate.h"
+#include <stdint.h>
+#include "sph_skein.h"
+
+/* Value installed as the get_max64 callback of the skein2 algorithm. */
+int64_t skein2_get_max64 ()
+{
+  const int64_t max64 = 0x7ffffLL;
+
+  return max64;
+}
+
+/*
+ * Install the skein2 callbacks into gate.  The 4-way versions are chosen
+ * when SKEIN2_4WAY is defined by skein2-gate.h (the same switch that
+ * enables their definitions), otherwise the scalar versions are used.
+ * Always returns true.
+ */
+bool register_skein2_algo( algo_gate_t* gate )
+{
+#if defined(SKEIN2_4WAY)
+  gate->optimizations = SSE2_OPT | AVX2_OPT;
+  gate->scanhash  = (void*)&scanhash_skein2_4way;
+  gate->hash      = (void*)&skein2hash_4way;
+#else
+  gate->optimizations = SSE2_OPT;
+  gate->scanhash  = (void*)&scanhash_skein2;
+  gate->hash      = (void*)&skein2hash;
+#endif
+  gate->get_max64 = (void*)&skein2_get_max64;
+  return true;
+}
+
--- a/algo/skein/skein2-gate.h
+++ b/algo/skein/skein2-gate.h
@@ -0,0 +1,20 @@
+/* Declarations shared by the scalar and 4-way skein2 implementations. */
+#ifndef __SKEIN2_GATE_H__
+#define __SKEIN2_GATE_H__
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+/* The 4-way implementation needs both the build option and AVX2. */
+#if defined(FOUR_WAY) && defined(__AVX2__)
+  #define SKEIN2_4WAY
+#endif
+
+#if defined(SKEIN2_4WAY)
+void skein2hash_4way( void *output, const void *input );
+int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t* hashes_done );
+#endif
+
+void skein2hash( void *output, const void *input );
+int scanhash_skein2( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+#endif
+
--- a/algo/skein/skein2.c
+++ b/algo/skein/skein2.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"
 #include <string.h>
 #include <stdint.h>
@@ -66,16 +65,4 @@ int scanhash_skein2(int thr_id, struct work *work,
 	return 0;
 }

-int64_t skein2_get_max64 ()
-{
-  return 0x7ffffLL;
-}
-
-bool register_skein2_algo( algo_gate_t* gate )
-{
-  gate->scanhash  = (void*)&scanhash_skein2;
-  gate->hash      = (void*)&skein2hash;
-  gate->get_max64 = (void*)&skein2_get_max64;
-  return true;
-};

--- a/algo/skein/sph_skein.c
+++ b/algo/skein/sph_skein.c
@@ -39,7 +39,6 @@
 extern "C"{
 #endif

-
 #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN
 #define SPH_SMALL_FOOTPRINT_SKEIN   1
 #endif
@@ -883,6 +882,7 @@ skein_big_core(sph_skein_big_context *sc, const void *data, size_t len)
 	}

 	READ_STATE_BIG(sc);
+
 	first = (bcount == 0) << 7;
 	do {
 		size_t clen;
--- a/algo/skunk.c
+++ b/algo/skunk.c
@@ -0,0 +1,101 @@
+#include "algo-gate-api.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "algo/gost/sph_gost.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+
+/* One context per hash function used by skunkhash(). */
+typedef struct {
+    sph_skein512_context  skein;
+    cubehashParam         cube;
+    sph_fugue512_context  fugue;
+    sph_gost512_context   gost;
+} skunk_ctx_holder;
+
+/*
+ * Per-thread template contexts.  skunk_thread_init() initializes all of
+ * them, scanhash_skunk() re-initializes skein and feeds it the first 64
+ * header bytes, and skunkhash() works on a private copy of the whole set.
+ */
+static __thread skunk_ctx_holder skunk_ctx;
+
+/*
+ * Skunk hash: skein512 -> cubehash -> fugue512 -> gost512, truncated to
+ * 32 bytes.
+ *
+ * input  : 80-byte block header.  Only bytes 64..79 are hashed here; the
+ *          skein state in the thread's skunk_ctx must already have
+ *          absorbed the first 64 bytes (scanhash_skunk() does that).
+ * output : receives the first 32 bytes of the final hash.
+ */
+void skunkhash( void *output, const void *input )
+{
+     unsigned char hash[128] __attribute__ ((aligned (64)));
+     // byte pointer: arithmetic on void* is a GNU extension, not ISO C
+     const unsigned char *in = (const unsigned char*)input;
+
+     // work on a copy so the prepared per-thread contexts stay reusable
+     skunk_ctx_holder ctx __attribute__ ((aligned (64)));
+     memcpy( &ctx, &skunk_ctx, sizeof(skunk_ctx) );
+
+     sph_skein512( &ctx.skein, in + 64, 16 );
+     sph_skein512_close( &ctx.skein, (void*) hash );
+
+     cubehashUpdateDigest( &ctx.cube, (byte*) hash, (const byte*)hash, 64 );
+
+     sph_fugue512( &ctx.fugue, hash, 64 );
+     sph_fugue512_close( &ctx.fugue, hash );
+
+     sph_gost512( &ctx.gost, hash, 64 );
+     sph_gost512_close( &ctx.gost, hash );
+
+     memcpy(output, hash, 32);
+}
+
+/*
+ * Scan nonces starting at work->data[19] until a hash meets the target,
+ * max_nonce is reached or a restart is requested for this thread.
+ *
+ * Returns 1 with the winning nonce stored in work->data[19], or 0 with
+ * the next untested nonce stored there.  *hashes_done is updated in
+ * both cases.
+ */
+int scanhash_skunk( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done )
+{
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	volatile uint8_t *restart = &(work_restart[thr_id].restart);
+	const uint32_t first_nonce = pdata[19];
+	uint32_t nonce = first_nonce;
+
+	if ( opt_benchmark )
+		ptarget[7] = 0x0cff;
+
+	for ( int k = 0; k < 19; k++ )
+		be32enc( &endiandata[k], pdata[k] );
+
+	// precalc midstate: the first 64 bytes do not depend on the nonce
+	sph_skein512_init( &skunk_ctx.skein );
+	sph_skein512( &skunk_ctx.skein, endiandata, 64 );
+
+	const uint32_t Htarg = ptarget[7];
+
+	for (;;)
+	{
+		uint32_t hash[8];
+
+		be32enc( &endiandata[19], nonce );
+		skunkhash( hash, endiandata );
+
+		if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+		{
+			pdata[19] = nonce;
+			*hashes_done = nonce - first_nonce;
+			return 1;
+		}
+
+		nonce++;
+		if ( nonce >= max_nonce || *restart )
+			break;
+	}
+
+	pdata[19] = nonce;
+	*hashes_done = nonce - first_nonce + 1;
+	return 0;
+}
+
+/*
+ * Prepare the calling thread's template contexts for all four hash
+ * functions.  Always returns true.
+ */
+bool skunk_thread_init()
+{
+   skunk_ctx_holder *tmpl = &skunk_ctx;
+
+   sph_skein512_init( &tmpl->skein );
+   cubehashInit( &tmpl->cube, 512, 16, 32 );
+   sph_fugue512_init( &tmpl->fugue );
+   sph_gost512_init( &tmpl->gost );
+
+   return true;
+}
+
+/*
+ * Install the skunk callbacks and optimization flags into gate.
+ * Always returns true.
+ */
+bool register_skunk_algo( algo_gate_t* gate )
+{
+    gate->hash              = (void*)&skunkhash;
+    gate->scanhash          = (void*)&scanhash_skunk;
+    gate->miner_thread_init = (void*)&skunk_thread_init;
+    gate->optimizations     = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+    return true;
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	4b57ac0eb9	v3.7.4	2017-11-28 16:32:04 -05:00
Jay D Dee	6d1361c87f	v3.7.3	2017-11-20 21:19:15 -05:00
Jay D Dee	ab39e88318	v3.7.2	2017-11-01 11:03:23 -04:00
Jay D Dee	8ff52e7ad6	v3.7.1	2017-10-31 00:25:24 -04:00
Jay D Dee	aaa48599ad	v3.7.0	2017-10-17 11:38:59 -04:00
Jay D Dee	c76574b2cd	v3.6.11	2017-10-12 15:14:37 -04:00
Jay D Dee	989fb42d20	v3.6.10	2017-10-12 11:49:40 -04:00
Jay D Dee	710c852f05	v3.6.9	2017-10-09 21:45:27 -04:00
Jay D Dee	39f089d3dc	v3.6.8	2017-07-31 20:02:45 -04:00
Jay D Dee	ec4f6028a2	v3.6.7	2017-07-24 21:38:32 -04:00
Jay D Dee	f8907677f6	v3.6.6	2017-07-01 14:37:11 -04:00
Jay D Dee	7544cb956c	v3.6.5	2017-05-19 16:38:26 -04:00