v3.7.9

v3.7.8
v3.7.7
2025-09-17 23:44:27 +00:00 · 2018-01-08 22:04:43 -05:00 · 2017-12-30 19:19:46 -05:00 · 2017-12-17 12:00:42 -05:00 · 2017-12-14 18:28:51 -05:00 · 2017-12-08 15:39:28 -05:00
204 changed files with 29142 additions and 3146 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,6 @@ autom4te.cache
 Makefile
 Makefile.in
 INSTALL
-configure
 configure.lineno
 depcomp
 missing
--- a/12
+++ b/12
@@ -16,4 +16,16 @@ LucasJones

 tpruvot@github

+elmad
+
+djm34
+
+palmd
+
+ig0tik3d
+
+Wolf0
+
+Optiminer
+
 Jay D Dee
--- a/34
+++ b/34
@@ -5,19 +5,31 @@
 # ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3
 #

-FROM ubuntu:16.04
-RUN BUILD_DEPS="build-essential \
-    libssl-dev \
-	  libgmp-dev \
-	  libcurl4-openssl-dev \
-	  libjansson-dev \
-	  automake" && \
+# Build
+FROM ubuntu:16.04 as builder

-	  apt-get update && \
-	  apt-get install -y ${BUILD_DEPS}
+RUN apt-get update \
+  && apt-get install -y \
+    build-essential \
+    libssl-dev \
+    libgmp-dev \
+    libcurl4-openssl-dev \
+    libjansson-dev \
+    automake \
+  && rm -rf /var/lib/apt/lists/*

 COPY . /app/
-RUN	cd /app/ && ./build.sh
+RUN cd /app/ && ./build.sh

-ENTRYPOINT ["/app/cpuminer"]
+# App
+FROM ubuntu:16.04
+
+RUN apt-get update \
+  && apt-get install -y \
+    libcurl3 \
+    libjansson4 \
+  && rm -rf /var/lib/apt/lists/*
+
+COPY --from=builder /app/cpuminer .
+ENTRYPOINT ["./cpuminer"]
 CMD ["-h"]
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,29 +22,6 @@ cpuminer_SOURCES = \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
-  algo/groestl/sph_groestl.c \
-  algo/skein/sph_skein.c \
-  algo/bmw/sph_bmw.c \
-  algo/shavite/sph_shavite.c \
-  algo/shavite/shavite.c \
-  algo/echo/sph_echo.c \
-  algo/blake/sph_blake.c \
-  algo/blake/sph_blake2b.c \
-  algo/heavy/sph_hefty1.c \
-  algo/blake/mod_blakecoin.c \
-  algo/luffa/sph_luffa.c \
-  algo/cubehash/sph_cubehash.c \
-  algo/simd/sph_simd.c \
-  algo/hamsi/sph_hamsi.c \
-  algo/fugue/sph_fugue.c \
-  algo/gost/sph_gost.c \
-  algo/jh/sph_jh.c \
-  algo/keccak/sph_keccak.c \
-  algo/keccak/keccak.c\
-  algo/sha/sph_sha2.c \
-  algo/sha/sph_sha2big.c \
-  algo/shabal/sph_shabal.c \
-  algo/whirlpool/sph_whirlpool.c\
  crypto/blake2s.c \
  crypto/oaes_lib.c \
  crypto/c_keccak.c \
@@ -61,48 +38,80 @@ cpuminer_SOURCES = \
  algo/argon2/ar2/cores.c \
  algo/argon2/ar2/ar2-scrypt-jane.c \
  algo/argon2/ar2/blake2b.c \
-  algo/axiom.c \
+  algo/blake/sph_blake.c \
+  algo/blake/blake-hash-4way.c \
+  algo/blake/blake-gate.c \
  algo/blake/blake.c \
+  algo/blake/blake-4way.c \
+  algo/blake/sph_blake2b.c \
  algo/blake/blake2b.c \
  algo/blake/blake2s.c \
+  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
+  algo/blake/decred-gate.c \
  algo/blake/decred.c \
+  algo/blake/decred-4way.c \
+  algo/blake/pentablake-gate.c \
+  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
+  algo/bmw/sph_bmw.c \
+  algo/bmw/bmw-hash-4way.c \
  algo/bmw/bmw256.c \
-  algo/cubehash/sse2/cubehash_sse2.c\
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
  algo/cryptonight/cryptonight-aesni.c\
  algo/cryptonight/cryptonight.c\
-  algo/drop.c \
+  algo/cubehash/sph_cubehash.c \
+  algo/cubehash/sse2/cubehash_sse2.c\
+  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
-  algo/fresh.c \
+  algo/gost/sph_gost.c \
+  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
  algo/groestl/myr-groestl.c \
  algo/groestl/aes_ni/hash-groestl.c \
  algo/groestl/aes_ni/hash-groestl256.c \
+  algo/fugue/sph_fugue.c \
+  algo/hamsi/sph_hamsi.c \
  algo/haval/haval.c\
+  algo/heavy/sph_hefty1.c \
  algo/heavy/heavy.c \
  algo/heavy/bastion.c \
-  algo/hmq1725.c \
  algo/hodl/aes.c \
  algo/hodl/hodl-gate.c \
  algo/hodl/hodl-wolf.c \
  algo/hodl/sha512_avx.c \
  algo/hodl/sha512_avx2.c \
+  algo/jh/sph_jh.c \
+  algo/jh/jh-hash-4way.c \
+  algo/jh/jha-gate.c \
+  algo/jh/jha-4way.c \
+  algo/jh/jha.c \
+  algo/keccak/sph_keccak.c \
+  algo/keccak/keccak.c\
+  algo/keccak/keccak-hash-4way.c \
+  algo/keccak/keccak-4way.c\
+  algo/keccak/keccak-gate.c \
+  algo/keccak/sse2/keccak.c \
  algo/lbry.c \
+  algo/luffa/sph_luffa.c \
  algo/luffa/luffa.c \
  algo/luffa/sse2/luffa_for_sse2.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
  algo/lyra2/lyra2rev2.c \
  algo/lyra2/lyra2re.c \
-  algo/lyra2/zcoin.c \
+  algo/lyra2/lyra2z-gate.c \
+  algo/lyra2/lyra2z.c \
+  algo/lyra2/lyra2z-4way.c \
  algo/lyra2/lyra2z330.c \
-  algo/keccak/sse2/keccak.c \
+  algo/lyra2/lyra2h.c \
  algo/m7m.c \
-  algo/neoscrypt.c \
-  algo/nist5.c \
+  algo/neoscrypt/neoscrypt.c \
+  algo/nist5/nist5-gate.c \
+  algo/nist5/nist5-4way.c \
+  algo/nist5/nist5.c \
+  algo/nist5/zr5.c \
  algo/pluck.c \
  algo/quark/quark.c \
  algo/qubit/qubit.c \
@@ -110,33 +119,86 @@ cpuminer_SOURCES = \
  algo/ripemd/sph_ripemd.c \
  algo/scrypt.c \
  algo/scryptjane/scrypt-jane.c \
+  algo/sha/sph_sha2.c \
+  algo/sha/sph_sha2big.c \
  algo/sha/sha2.c \
  algo/sha/sha256t.c \
+  algo/shabal/sph_shabal.c \
+  algo/shabal/shabal-hash-4way.c \
+  algo/shavite/sph_shavite.c \
+  algo/shavite/sph-shavite-aesni.c \
+  algo/shavite/shavite.c \
+  algo/simd/sph_simd.c \
  algo/simd/sse2/nist.c \
  algo/simd/sse2/vector.c \
+  algo/skein/sph_skein.c \
+  algo/skein/skein-hash-4way.c \
  algo/skein/skein.c \
+  algo/skein/skein-4way.c \
+  algo/skein/skein-gate.c \
  algo/skein/skein2.c \
-  algo/s3.c \
+  algo/skein/skein2-4way.c \
+  algo/skein/skein2-gate.c \
+  algo/sm3/sm3.c \
  algo/tiger/sph_tiger.c \
  algo/timetravel.c \
-  algo/veltor.c \
+  algo/timetravel10.c \
+  algo/whirlpool/sph_whirlpool.c \
+  algo/whirlpool/whirlpool-hash-4way.c \
+  algo/whirlpool/whirlpool-gate.c \
+  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
+  algo/x11/x11-gate.c \
  algo/x11/x11.c \
-  algo/x11/x11evo.c \
+  algo/x11/x11-4way.c \
+  algo/x11/x11gost-gate.c \
  algo/x11/x11gost.c \
+  algo/x11/x11gost-4way.c \
+  algo/x11/c11-gate.c \
  algo/x11/c11.c \
+  algo/x11/c11-4way.c \
+  algo/x11/tribus-gate.c \
+  algo/x11/tribus.c \
+  algo/x11/tribus-4way.c \
+  algo/x11/fresh.c \
+  algo/x11/x11evo.c \
+  algo/x13/x13-gate.c \
  algo/x13/x13.c \
+  algo/x13/x13-4way.c \
+  algo/x13/x13sm3-gate.c \
+  algo/x13/x13sm3.c \
+  algo/x13/x13sm3-4way.c \
+  algo/x13/phi1612-gate.c \
+  algo/x13/phi1612.c \
+  algo/x13/phi1612-4way.c \
+  algo/x13/skunk-gate.c \
+  algo/x13/skunk-4way.c \
+  algo/x13/skunk.c \
+  algo/x13/drop.c \
+  algo/x14/x14-gate.c \
  algo/x14/x14.c \
+  algo/x14/x14-4way.c \
+  algo/x14/veltor-gate.c \
+  algo/x14/veltor.c \
+  algo/x14/veltor-4way.c \
+  algo/x14/polytimos-gate.c \
+  algo/x14/polytimos.c \
+  algo/x14/polytimos-4way.c \
+  algo/x14/axiom.c \
+  algo/x15/x15-gate.c \
  algo/x15/x15.c \
+  algo/x15/x15-4way.c \
+  algo/x17/x17-gate.c \
  algo/x17/x17.c \
-  algo/xevan.c \
+  algo/x17/x17-4way.c \
+  algo/x17/xevan-gate.c \
+  algo/x17/xevan.c \
+  algo/x17/xevan-4way.c \
+  algo/x17/hmq1725.c \
  algo/yescrypt/yescrypt.c \
-  algo/yescrypt/yescrypt-common.c \
-  algo/yescrypt/sha256_Y.c\
-  algo/yescrypt/yescrypt-simd.c\
-  algo/zr5.c
-
+  algo/yescrypt/sha256_Y.c \
+  algo/yescrypt/yescrypt-simd.c

 disable_flags =

--- a/README.md
+++ b/README.md
@@ -28,25 +28,31 @@ Supported Algorithms
                          cryptonight  cryptonote, Monero (XMR)
                          decred
                          deep         Deepcoin (DCN)
+                          dmd-gr       Diamond-Groestl
                          drop         Dropcoin
                          fresh        Fresh
-                          groestl      dmd-gr, Groestl coin
+                          groestl      Groestl coin
                          heavy        Heavy
                          hmq1725      Espers
                          hodl         Hodlcoin
-                          keccak       Keccak
+                          jha          Jackpotcoin
+                          keccak       Maxcoin
+                          keccakc      Creative coin
                          lbry         LBC, LBRY Credits
                          luffa        Luffa
+                          lyra2h       Hppcoin
                          lyra2re      lyra2
-                          lyra2rev2    lyrav2, Vertcoin
+                          lyra2rev2    lyra2v2, Vertcoin
                          lyra2z       Zcoin (XZC)
                          lyra2z330    Lyra2 330 rows, Zoin (ZOI)
                          m7m          Magi (XMG)
                          myr-gr       Myriad-Groestl
                          neoscrypt    NeoScrypt(128, 2, 1)
                          nist5        Nist5
-                          pluck        Pluck:128 (Supcoin)
                          pentablake   Pentablake
+                          phi1612      phi, LUX coin
+                          pluck        Pluck:128 (Supcoin)
+                          polytimos    Ninja
                          quark        Quark
                          qubit        Qubit
                          scrypt       scrypt(1024, 1, 1) (default)
@@ -57,20 +63,26 @@ Supported Algorithms
                          shavite3     Shavite3
                          skein        Skein+Sha (Skeincoin)
                          skein2       Double Skein (Woodcoin)
+                          skunk        Signatum (SIGT)
                          timetravel   Machinecoin (MAC)
+                          timetravel10 Bitcore
+                          tribus       Denarius (DNR)
                          vanilla      blake256r8vnl (VCash)
-                          veltor
+                          veltor       (VLT)
                          whirlpool
                          whirlpoolx
                          x11          Dash
                          x11evo       Revolvercoin
                          x11gost      sib (SibCoin)
                          x13          X13
+                          x13sm3       hsr (Hshare)
                          x14          X14
                          x15          X15
                          x17
                          xevan        Bitsend
-                          yescrypt
+                          yescrypt     Globalboost-Y (BSTY)
+                          yescryptr8   BitZeny (ZNY)
+                          yescryptr16  Yenten (YTN)
                          zr5          Ziftr

 Requirements
@@ -85,13 +97,16 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
 Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
 performance.

+ARM CPUs are not supported.
+
 2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
 Centos are known to work and have all dependencies in their repositories.
 Others may work but may require more effort.
 64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

-3. Stratum pool, cpuminer-opt only supports stratum minning. Some algos
-may work wallet mining but there are no guarantees.
+MacOS (OS X) is not supported.
+
+3. Stratum pool. Some algos may work wallet mining using getwork.

 Errata
 ------
@@ -114,6 +129,10 @@ forum at:

 https://bitcointalk.org/index.php?topic=1326803.0

+All problem reports must be accompanied by a proper definition.
+This should include how the problem occurred, the command line and
+output from the miner showing the startup and any errors.
+
 Donations
 ---------

--- a/README.txt
+++ b/README.txt
@@ -1,6 +1,9 @@
 This file is included in the Windows binary package. Compile instructions
 for Linux and Windows can be found in RELEASE_NOTES.

+cpuminer is a console program that is executed from a DOS command prompt.
+There is no GUI and no mouse support.
+
 Choose the exe that best matches you CPU's features or use trial and
 error to find the fastest one that doesn't crash. Pay attention to
 the features listed at cpuminer startup to ensure you are mining at
@@ -8,15 +11,27 @@ optimum speed using all the available features.

 Architecture names and compile options used are only provided for Intel
 Core series. Pentium and Celeron often have fewer features.
-AMD is YMMV, see previous paragraph.

-Exe name                  Compile opts       Arch name
+AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
+supported by cpuminer-opt due to an incompatible implementation of SSE2 on
+these CPUs. Some algos may crash the miner with an invalid instruction.
+Users are recommended to use an unoptimized miner such as cpuminer-multi.

-cpuminer-sse2.exe         -march=core2,      Core2   
-cpuminer-sse42.exe        -march=corei7,     Nehalem
-cpuminer-aes-sse42.exe    -maes -msse4.2     Westmere
-cpuminer-aes-avx.exe      -march=corei7-avx, Sandybridge, Ivybridge
-cpuminer-aes-avx2.exe     -march=core-avx2,  Haswell, Broadwell, Skylake, Kabylake
+Exe name                Compile flags              Arch name

+cpuminer-sse2.exe      "-march=core2"              Core2   
+cpuminer-sse42.exe     "-march=corei7"             Nehalem
+cpuminer-aes-sse42.exe "-maes -msse4.2"            Westmere
+cpuminer-avx.exe       "-march=corei7-avx"         Sandybridge, Ivybridge
+cpuminer-avx2.exe      "-march=core-avx2"          Haswell...
+cpuminer-avx-sha.exe   "-march=corei7-avx -msha"   Ryzen...
+cpuminer-4way.exe      "-march=core-avx2 -DFOUR_WAY"       same as avx2
+cpuminer-4way-sha.exe  "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha

+4way requires a CPU with AES and AVX2. It is still under development and
+only a few algos are supported. See change log in RELEASE_NOTES in source
+package for supported algos.
+
+Ryzen CPUs perform better with AVX than AVX2, therefore an avx-sha build
+is provided. Four way still uses AVX2. 

--- a/178
+++ b/178
@@ -6,9 +6,31 @@ compile flag.
 HW SHA support is only available when compiled from source, Windows binaries
 are not yet available.

+cpuminer-opt is a console program, if you're using a mouse you're doing it
+wrong.
+
+Security warning
+----------------
+
+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive, they are flagged simply because they are miners. The source
+code is open for anyone to inspect. If you don't trust the software, don't use
+it.
+
+The cryptographic code has been taken from trusted sources but has been
+modified for speed at the expense of accepted security practices. This
+code should not be imported into applications where secure cryptography is
+required.
+
 Compile Instructions
 --------------------

+Requirements:
+
+Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
+supported.
+64 bit Linux or Windows operating system. Apple is not supported.
+
 Building on linux prerequisites:

 It is assumed users know how to install packages on their system and
@@ -25,14 +47,11 @@ are some of the ones that may not be in the default install and need to
 be installed manually. There may be others, read the error messages they
 will give a clue as to the missing package.

-The folliwing command should install everything you need on Debian based
-packages:
+The following command should install everything you need on Debian based
+distributions such as Ubuntu:

 sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake

-Building on Linux, see below for Windows.
-
-Dependencies

 build-essential  (for Ubuntu, Development Tools package group on Fedora)
 automake
@@ -44,9 +63,16 @@ pthreads
 zlib

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and openssl 1.1
-or higher. Additional compile options may also be required such as
+or higher. Reports of improved performance on Ryzen when using openssl 1.0.2
+have been due to AVX and AVX2 optimizations added to that version.
+Additional improvements are expected on Ryzen with openssl 1.1. Additional compile options may also be required such as
 "-march-znver1" or "-msha".

+Additional instructions for static compilation can be found here:
+https://lxadm.com/Static_compilation_of_cpuminer
+Static builds should only be considered in a homogeneous HW and SW environment.
+Local builds will always have the best performance and compatibility.
+
 Extract cpuminer source.

 tar xvzf cpuminer-opt-x.y.z.tar.gz
@@ -58,9 +84,28 @@ Run ./build.sh to build on Linux or execute the following commands.
 CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
 make

+Additional optional compile flags, add the following to CFLAGS to activate:
+
+-DUSE_SPH_SHA
+
+SPH may give slightly better performance on algos that use sha256 when using
+openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
+better than SPH.
+
+-DFOUR_WAY
+
+4 way will give much better performance on supported algos with CPUs
+that have AVX2 and should only be used on CPUs with AVX2. 4 way algo
+support will be added incrementally, see change log below for supported algos.
+ 
 Start mining.

-./cpuminer -a algo ...
+./cpuminer -a algo -o url -u username -p password
+
+Windows
+
+The following is how the Windows binary releases are built. It's old and
+not very good but it works, for me anyway.

 Building on Windows prerequisites:

@@ -98,6 +143,10 @@ Run winbuild.sh to build on Windows or execute the following commands.
 CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
 make

+Start mining
+
+cpuminer.exe -a algo -o url -u user -p password
+
 The following tips may be useful for older AMD CPUs.

 AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
@@ -116,6 +165,121 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.7.9
+
+Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
+Additional 4way optimizations for X algos.
+New algo yescryptr8 for BitZeny, not to be confused with original
+yescrypt Globalboost-Y.
+
+v3.7.8
+
+Partial 4way optimization for most X algos including c11, xevan, phi, hsr
+
+v3.7.7
+
+Fixed regression caused by 64 CPU support.
+Fixed lyra2h.
+
+v3.7.6
+
+Added lyra2h algo for Hppcoin.
+Added support for more than 64 CPUs.
+Optimized shavite512 with AES, improves x11 etc.
+
+v3.7.5
+
+New algo keccakc for Creative coin with 4way optimizations
+
+Rewrote some AVX/AVX2 code for more consistent implementation and some
+optimizing.
+
+Enhanced capabilities check to support 4way, more precise reporting of
+features (not all algos use SSE2), and better error messages when using
+an incompatible pre-built version (Windows users).
+
+v3.7.4
+
+Removed unnecessary build options.
+
+Added 4way support for tribus and nist5.
+
+v3.7.3
+
+Added polytimos algo.
+
+Introducing 4-way AVX2 optimization giving up to 4x performance improvement
+on many compute bound algos. First supported algos: skein, skein2, blake &
+keccak. This feature is only available when compiled from source. See above
+for instructions on how to enable 4-way during compilation.
+
+Updated Dockerfile.
+
+v3.7.2
+
+Fixed yescryptr16
+Changed default sha256 and sha512 to openssl. This should be used when
+compiling with openssl 1.0.2 or higher (Ubuntu 16.04).
+This should increase the hashrate for yescrypt, yescryptr16, m7m, xevan, skein,
+myr-gr & others  when openssl 1.0.2 is installed.
+Users with openssl 1.0.1 (Ubuntu 14.04) may get better performance by adding
+"-DUSE_SPH_SHA" to CFLAGS.
+Windows binaries are compiled with -DUSE_SPH_SHA and won't get the speedup.
+
+v3.7.1
+
+Added yescryptr16 algo for Yenten coin
+Added SHA support to yescrypt and yescryptr16
+Small code cleanup
+
+v3.7.0
+
+Fixed x14 misalignment bug.
+Fixed decred stake version bug.
+Getwork fixes for algos that use big endian data encoding: m7m, zr5, neoscrypt,
+decred.
+
+v3.6.10
+
+Fixed misalignment bug in hsr.
+
+v3.6.9
+
+Added phi1612 algo for LUX coin
+Added x13sm3 algo, alias hsr, for Hshare coin
+
+v3.6.8
+
+Fixed timetravel10 on Windows.
+
+v3.6.7
+
+Skunk algo added.
+Tribus a little faster.
+Minor restructuring.
+
+v3.6.6
+
+Added tribus algo for Denarius (DNR)
+
+configure removed from .gitignore. This should allow git clone to compile
+on Windows/mingw.
+
+Fixed CPU temperature monitoring on some CPUs (Linux only).
+
+Fixed a compile error on FreeBSD (unsupported YMMV).
+
+v3.6.5
+
+Cryptonight a little faster.
+Added jha algo (Jackpotcoin) with AES optimizations.
+
+v3.6.4
+
+Added support for Bitcore (BTX) using the timetravel10 algo, optimized for
+AES and AVX2. 
+"-a bitcore" works as an alias and is less typing than "-a timetravel10".
+
 v3.6.3

 Fixed all known issues with SHA support on AMD Ryzen CPUs, still no
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -77,6 +77,12 @@ void algo_not_tested()
  applog(LOG_WARNING,"and bad things may happen. Use at your own risk.");
 }

+void four_way_not_tested()
+{
+  applog( LOG_WARNING,"Algo %s has not been tested using 4way. It may not", algo_names[opt_algo] );
+  applog( LOG_WARNING,"work or may be slower. Please report your results.");
+}
+
 void algo_not_implemented()
 {
  applog(LOG_ERR,"Algo %s has not been Implemented.",algo_names[opt_algo]);
@@ -114,8 +120,8 @@ void init_algo_gate( algo_gate_t* gate )
   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
   gate->set_target              = (void*)&std_set_target;
-   gate->work_decode             = (void*)&std_work_decode;
-   gate->submit_getwork_result   = (void*)&std_submit_getwork_result;
+   gate->work_decode             = (void*)&std_le_work_decode;
+   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
   gate->calc_network_diff       = (void*)&std_calc_network_diff;
@@ -124,7 +130,7 @@ void init_algo_gate( algo_gate_t* gate )
   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
   gate->stratum_handle_response = (void*)&std_stratum_handle_response;
-   gate->optimizations           = SSE2_OPT;
+   gate->optimizations           = EMPTY_SET;
   gate->ntime_index             = STD_NTIME_INDEX;
   gate->nbits_index             = STD_NBITS_INDEX;
   gate->nonce_index             = STD_NONCE_INDEX;
@@ -132,6 +138,10 @@ void init_algo_gate( algo_gate_t* gate )
   gate->work_cmp_size           = STD_WORK_CMP_SIZE;
 }

+// Ignore warnings for not yet defined register functions
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
+
 // called by each thread that uses the gate
 bool register_algo_gate( int algo, algo_gate_t *gate )
 {
@@ -145,72 +155,73 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   switch (algo)
   {
-
-// Ignore warnings for not yet defined register fucntions
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
-
-     case ALGO_ARGON2:      register_argon2_algo     ( gate ); break;
-     case ALGO_AXIOM:       register_axiom_algo      ( gate ); break;
-     case ALGO_BASTION:     register_bastion_algo    ( gate ); break;
-     case ALGO_BLAKE:       register_blake_algo      ( gate ); break;
-     case ALGO_BLAKECOIN:   register_blakecoin_algo  ( gate ); break;
-//     case ALGO_BLAKE2B:     register_blake2b_algo    ( gate ); break;
-     case ALGO_BLAKE2S:     register_blake2s_algo    ( gate ); break;
-     case ALGO_C11:         register_c11_algo        ( gate ); break;
-     case ALGO_CRYPTOLIGHT: register_cryptolight_algo( gate ); break;
-     case ALGO_CRYPTONIGHT: register_cryptonight_algo( gate ); break;
-     case ALGO_DECRED:      register_decred_algo     ( gate ); break;
-     case ALGO_DEEP:        register_deep_algo       ( gate ); break;
-     case ALGO_DMD_GR:      register_dmd_gr_algo     ( gate ); break;
-     case ALGO_DROP:        register_drop_algo       ( gate ); break;
-     case ALGO_FRESH:       register_fresh_algo      ( gate ); break;
-     case ALGO_GROESTL:     register_groestl_algo    ( gate ); break;
-     case ALGO_HEAVY:       register_heavy_algo      ( gate ); break;
-     case ALGO_HMQ1725:     register_hmq1725_algo    ( gate ); break;
-     case ALGO_HODL:        register_hodl_algo       ( gate ); break;
-     case ALGO_KECCAK:      register_keccak_algo     ( gate ); break;
-     case ALGO_LBRY:        register_lbry_algo       ( gate ); break;
-     case ALGO_LUFFA:       register_luffa_algo      ( gate ); break;
-     case ALGO_LYRA2RE:     register_lyra2re_algo    ( gate ); break;
-     case ALGO_LYRA2REV2:   register_lyra2rev2_algo  ( gate ); break;
-     case ALGO_LYRA2Z:      register_zcoin_algo      ( gate ); break;
-     case ALGO_LYRA2Z330:   register_lyra2z330_algo  ( gate ); break;
-     case ALGO_M7M:         register_m7m_algo        ( gate ); break;
-     case ALGO_MYR_GR:      register_myriad_algo     ( gate ); break;
-     case ALGO_NEOSCRYPT:   register_neoscrypt_algo  ( gate ); break;
-     case ALGO_NIST5:       register_nist5_algo      ( gate ); break;
-     case ALGO_PENTABLAKE:  register_pentablake_algo ( gate ); break;
-     case ALGO_PLUCK:       register_pluck_algo      ( gate ); break;
-     case ALGO_QUARK:       register_quark_algo      ( gate ); break;
-     case ALGO_QUBIT:       register_qubit_algo      ( gate ); break;
-     case ALGO_SCRYPT:      register_scrypt_algo     ( gate ); break;
-     case ALGO_SCRYPTJANE:  register_scryptjane_algo ( gate ); break;
-     case ALGO_SHA256D:     register_sha256d_algo    ( gate ); break;
-     case ALGO_SHA256T:     register_sha256t_algo    ( gate ); break;
-     case ALGO_SHAVITE3:    register_shavite_algo    ( gate ); break;
-     case ALGO_SKEIN:       register_skein_algo      ( gate ); break;
-     case ALGO_SKEIN2:      register_skein2_algo     ( gate ); break;
-     case ALGO_S3:          register_s3_algo         ( gate ); break;
-     case ALGO_TIMETRAVEL:  register_timetravel_algo ( gate ); break;
-     case ALGO_VANILLA:     register_vanilla_algo    ( gate ); break;
-     case ALGO_VELTOR:      register_veltor_algo     ( gate ); break;
-     case ALGO_WHIRLPOOL:   register_whirlpool_algo  ( gate ); break;
-     case ALGO_WHIRLPOOLX:  register_whirlpoolx_algo ( gate ); break;
-     case ALGO_X11:         register_x11_algo        ( gate ); break;
-     case ALGO_X11EVO:      register_x11evo_algo     ( gate ); break;
-     case ALGO_X11GOST:     register_sib_algo        ( gate ); break;
-     case ALGO_X13:         register_x13_algo        ( gate ); break;
-     case ALGO_X14:         register_x14_algo        ( gate ); break;
-     case ALGO_X15:         register_x15_algo        ( gate ); break;
-     case ALGO_X17:         register_x17_algo        ( gate ); break;
-     case ALGO_XEVAN:       register_xevan_algo      ( gate ); break;
-     case ALGO_YESCRYPT:    register_yescrypt_algo   ( gate ); break;
-     case ALGO_ZR5:         register_zr5_algo        ( gate ); break;
-
-// restore warnings
-#pragma GCC diagnostic pop
-
+     case ALGO_ARGON2:       register_argon2_algo      ( gate ); break;
+     case ALGO_AXIOM:        register_axiom_algo       ( gate ); break;
+     case ALGO_BASTION:      register_bastion_algo     ( gate ); break;
+     case ALGO_BLAKE:        register_blake_algo       ( gate ); break;
+     case ALGO_BLAKECOIN:    register_blakecoin_algo   ( gate ); break;
+//     case ALGO_BLAKE2B:      register_blake2b_algo    ( gate ); break;
+     case ALGO_BLAKE2S:      register_blake2s_algo     ( gate ); break;
+     case ALGO_C11:          register_c11_algo         ( gate ); break;
+     case ALGO_CRYPTOLIGHT:  register_cryptolight_algo ( gate ); break;
+     case ALGO_CRYPTONIGHT:  register_cryptonight_algo ( gate ); break;
+     case ALGO_DECRED:       register_decred_algo      ( gate ); break;
+     case ALGO_DEEP:         register_deep_algo        ( gate ); break;
+     case ALGO_DMD_GR:       register_dmd_gr_algo      ( gate ); break;
+     case ALGO_DROP:         register_drop_algo        ( gate ); break;
+     case ALGO_FRESH:        register_fresh_algo       ( gate ); break;
+     case ALGO_GROESTL:      register_groestl_algo     ( gate ); break;
+     case ALGO_HEAVY:        register_heavy_algo       ( gate ); break;
+     case ALGO_HMQ1725:      register_hmq1725_algo     ( gate ); break;
+     case ALGO_HODL:         register_hodl_algo        ( gate ); break;
+     case ALGO_JHA:          register_jha_algo         ( gate ); break;
+     case ALGO_KECCAK:       register_keccak_algo      ( gate ); break;
+     case ALGO_KECCAKC:      register_keccakc_algo     ( gate ); break;
+     case ALGO_LBRY:         register_lbry_algo        ( gate ); break;
+     case ALGO_LUFFA:        register_luffa_algo       ( gate ); break;
+     case ALGO_LYRA2H:       register_lyra2h_algo      ( gate ); break;
+     case ALGO_LYRA2RE:      register_lyra2re_algo     ( gate ); break;
+     case ALGO_LYRA2REV2:    register_lyra2rev2_algo   ( gate ); break;
+     case ALGO_LYRA2Z:       register_lyra2z_algo      ( gate ); break;
+     case ALGO_LYRA2Z330:    register_lyra2z330_algo   ( gate ); break;
+     case ALGO_M7M:          register_m7m_algo         ( gate ); break;
+     case ALGO_MYR_GR:       register_myriad_algo      ( gate ); break;
+     case ALGO_NEOSCRYPT:    register_neoscrypt_algo   ( gate ); break;
+     case ALGO_NIST5:        register_nist5_algo       ( gate ); break;
+     case ALGO_PENTABLAKE:   register_pentablake_algo  ( gate ); break;
+     case ALGO_PHI1612:      register_phi1612_algo     ( gate ); break;
+     case ALGO_PLUCK:        register_pluck_algo       ( gate ); break;
+     case ALGO_POLYTIMOS:    register_polytimos_algo   ( gate ); break;
+     case ALGO_QUARK:        register_quark_algo       ( gate ); break;
+     case ALGO_QUBIT:        register_qubit_algo       ( gate ); break;
+     case ALGO_SCRYPT:       register_scrypt_algo      ( gate ); break;
+     case ALGO_SCRYPTJANE:   register_scryptjane_algo  ( gate ); break;
+     case ALGO_SHA256D:      register_sha256d_algo     ( gate ); break;
+     case ALGO_SHA256T:      register_sha256t_algo     ( gate ); break;
+     case ALGO_SHAVITE3:     register_shavite_algo     ( gate ); break;
+     case ALGO_SKEIN:        register_skein_algo       ( gate ); break;
+     case ALGO_SKEIN2:       register_skein2_algo      ( gate ); break;
+     case ALGO_SKUNK:        register_skunk_algo       ( gate ); break;
+     case ALGO_TIMETRAVEL:   register_timetravel_algo  ( gate ); break;
+     case ALGO_TIMETRAVEL10: register_timetravel10_algo( gate ); break;
+     case ALGO_TRIBUS:       register_tribus_algo      ( gate ); break;
+     case ALGO_VANILLA:      register_vanilla_algo     ( gate ); break;
+     case ALGO_VELTOR:       register_veltor_algo      ( gate ); break;
+     case ALGO_WHIRLPOOL:    register_whirlpool_algo   ( gate ); break;
+     case ALGO_WHIRLPOOLX:   register_whirlpoolx_algo  ( gate ); break;
+     case ALGO_X11:          register_x11_algo         ( gate ); break;
+     case ALGO_X11EVO:       register_x11evo_algo      ( gate ); break;
+     case ALGO_X11GOST:      register_x11gost_algo     ( gate ); break;
+     case ALGO_X13:          register_x13_algo         ( gate ); break;
+     case ALGO_X13SM3:       register_x13sm3_algo      ( gate ); break;
+     case ALGO_X14:          register_x14_algo         ( gate ); break;
+     case ALGO_X15:          register_x15_algo         ( gate ); break;
+     case ALGO_X17:          register_x17_algo         ( gate ); break;
+     case ALGO_XEVAN:        register_xevan_algo       ( gate ); break;
+     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
+     case ALGO_YESCRYPTR8:   register_yescryptr8_algo  ( gate ); break;
+     case ALGO_YESCRYPTR16:  register_yescryptr16_algo ( gate ); break;
+     case ALGO_ZR5:          register_zr5_algo         ( gate ); break;
    default:
        applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
        return false;
@@ -225,6 +236,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
  return true;
 }

+// restore warnings
+#pragma GCC diagnostic pop
+
 // override std defaults with jr2 defaults
 bool register_json_rpc2( algo_gate_t *gate )
 {
@@ -253,42 +267,47 @@ void exec_hash_function( int algo, void *output, const void *pdata )
  gate.hash( output, pdata, 0 );  
 }

-// an algo can have multiple aliases but the aliases must be unique
-
 #define PROPER (1)
 #define ALIAS  (0)

 // The only difference between the alias and the proper algo name is the
-// proper name is the one that is defined in ALGO_NAMES, there may be
+// proper name is the one that is defined in ALGO_NAMES. There may be
 // multiple aliases that map to the same proper name.
 // New aliases can be added anywhere in the array as long as NULL is last.
 // Alphabetic order of alias is recommended.
 const char* const algo_alias_map[][2] =
 {
 //   alias                proper
-  { "blake256r8",        "blakecoin"   },
-  { "blake256r8vnl",     "vanilla"     },
-  { "sia",               "blake2b"     },
-  { "blake256r14",       "blake"       },
-  { "blake256r14dcr",    "decred"      },
-  { "cryptonote",        "cryptonight" },
-  { "cryptonight-light", "cryptolight" },
-  { "diamond",           "dmd-gr"      },
-  { "droplp",            "drop"        },
-  { "espers",            "hmq1725"     },
-  { "flax",              "c11"         },
-  { "jane",              "scryptjane"  }, 
-  { "lyra2",             "lyra2re"     },
-  { "lyra2v2",           "lyra2rev2"   },
-  { "lyra2zoin",         "lyra2z330"   },
-  { "myriad",            "myr-gr"      },
-  { "neo",               "neoscrypt"   },
-  { "sib",               "x11gost"     },
-  { "yes",               "yescrypt"    },
-  { "ziftr",             "zr5"         },
-  { "zcoin",             "lyra2z"      },
-  { "zoin",              "lyra2z330"   },
-  { NULL,                NULL          }   
+  { "bitcore",           "timetravel10" },
+  { "bitzeny",           "yescryptr8"   },
+  { "blake256r8",        "blakecoin"    },
+  { "blake256r8vnl",     "vanilla"      },
+  { "blake256r14",       "blake"        },
+  { "blake256r14dcr",    "decred"       },
+  { "cryptonote",        "cryptonight"  },
+  { "cryptonight-light", "cryptolight"  },
+  { "diamond",           "dmd-gr"       },
+  { "droplp",            "drop"         },
+  { "espers",            "hmq1725"      },
+  { "flax",              "c11"          },
+  { "hsr",               "x13sm3"       },
+  { "jackpot",           "jha"          },
+  { "jane",              "scryptjane"   }, 
+  { "lyra2",             "lyra2re"      },
+  { "lyra2v2",           "lyra2rev2"    },
+  { "lyra2zoin",         "lyra2z330"    },
+  { "myriad",            "myr-gr"       },
+  { "neo",               "neoscrypt"    },
+  { "phi",               "phi1612"      },
+//  { "sia",               "blake2b"      },
+  { "sib",               "x11gost"      },
+  { "timetravel8",       "timetravel"   },
+  { "yenten",            "yescryptr16"  },
+  { "yescryptr8k",       "yescrypt"     },
+  { "zcoin",             "lyra2z"       },
+  { "ziftr",             "zr5"          },
+  { "zoin",              "lyra2z330"    },
+  { NULL,                NULL           }   
 };

 // if arg is a valid alias for a known algo it is updated with the proper name.
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -85,12 +85,13 @@

 typedef  uint32_t set_t;

-#define EMPTY_SET 0
-#define SSE2_OPT  1
-#define AES_OPT   2
-#define AVX_OPT   4
-#define AVX2_OPT  8
-#define SHA_OPT  16
+#define EMPTY_SET       0
+#define SSE2_OPT        1
+#define AES_OPT         2  
+#define AVX_OPT         4
+#define AVX2_OPT        8
+#define SHA_OPT      0x10
+#define FOUR_WAY_OPT 0x20

 // return set containing all elements from sets a & b
 inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -156,7 +157,7 @@ bool return_false();
 void *return_null();
 void algo_not_tested();
 void algo_not_implemented();
-
+void four_way_not_tested();

 // Warning: algo_gate.nonce_index should only be used in targetted code
 // due to different behaviours by different targets. The JR2 index uses an
@@ -215,18 +216,20 @@ int64_t get_max64_0xffffLL();
 void std_set_target   ( struct work *work, double job_diff );
 void scrypt_set_target( struct work *work, double job_diff );

-bool std_work_decode( const json_t *val, struct work *work );
+bool std_le_work_decode( const json_t *val, struct work *work );
+bool std_be_work_decode( const json_t *val, struct work *work );
 bool jr2_work_decode( const json_t *val, struct work *work );

-bool std_submit_getwork_result( CURL *curl, struct work *work );
+bool std_le_submit_getwork_result( CURL *curl, struct work *work );
+bool std_be_submit_getwork_result( CURL *curl, struct work *work );
 bool jr2_submit_getwork_result( CURL *curl, struct work *work );

 void std_le_build_stratum_request( char *req, struct work *work );
 void std_be_build_stratum_request( char *req, struct work *work );
 void jr2_build_stratum_request   ( char *req, struct work *work );

-// set_work_data_endian target, default is do_nothing;
-void swab_work_data( struct work *work );
+// Default is do_nothing (assumed LE)
+void set_work_data_big_endian( struct work *work );

 double std_calc_network_diff( struct work *work );

--- a/algo/argon2/argon2a.c
+++ b/algo/argon2/argon2a.c
@@ -1,5 +1,3 @@
-#include "miner.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -0,0 +1,112 @@
+#include "blake-gate.h"
+#include "sph_blake.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+#if defined (BLAKE_4WAY)
+
+// Hash four interleaved 80-byte block headers with 4-way BLAKE-256 and store
+// the four 32-byte digests back to back in state (lane i at state + 32*i).
+void blakehash_4way(void *state, const void *input)
+{
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     uint32_t hash0[8] __attribute__ ((aligned (32)));
+     uint32_t hash1[8] __attribute__ ((aligned (32)));
+     uint32_t hash2[8] __attribute__ ((aligned (32)));
+     uint32_t hash3[8] __attribute__ ((aligned (32)));
+     blake256_4way_context ctx;
+     // no midstate here: each lane hashes its whole 80-byte header
+     blake256_4way_init( &ctx );
+     blake256_4way( &ctx, input, 80 );
+     blake256_4way_close( &ctx, vhash );
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     memcpy( state,    hash0, 32 );
+     memcpy( state+32, hash1, 32 );
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+
+// Scan nonces four at a time with blakehash_4way. Returns the number of lanes
+// whose hash passed fulltest(); hits are flagged in work->nfound / work->nonces.
+int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   // word 19 (the nonce) of lanes 0..3 sits at vdata[76..79]
+   uint32_t *noncep = vdata + 76;   // 19*4
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+
+      blakehash_4way( hash, vdata );
+
+      // lane i's 8-word digest starts at hash + 8*i; test each lane's own hash
+      if ( hash[7] == 0 )
+      {
+         if ( fulltest( hash, ptarget ) )
+         {
+             found[0] = true;
+             num_found++;
+             nonces[0] = n;
+             pdata[19] = n;
+         }
+      }
+      if ( (hash+8)[7] == 0 )
+      {
+         if ( fulltest( hash+8, ptarget ) )
+         {
+             found[1] = true;
+             num_found++;
+             nonces[1] = n+1;
+         }
+      }
+      if ( (hash+16)[7] == 0 )
+      {
+         if ( fulltest( hash+16, ptarget ) )
+         {
+             found[2] = true;
+             num_found++;
+             nonces[2] = n+2;
+         }
+      }
+      if ( (hash+24)[7] == 0 )
+      {
+         if ( fulltest( hash+24, ptarget ) )
+         {
+             found[3] = true;
+             num_found++;
+             nonces[3] = n+3;
+         }
+      }
+      n += 4;
+      *hashes_done = n - first_nonce + 1;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -0,0 +1,26 @@
+#include "blake-gate.h"
+// Installed by register_blake_algo below as gate->get_max64.
+int64_t blake_get_max64 ()
+{
+  return 0x7ffffLL;
+}
+// Fill the algo gate with the blake hooks; always returns true.
+bool register_blake_algo( algo_gate_t* gate )
+{
+  gate->get_max64 = (void*)&blake_get_max64;
+//#if defined (__AVX2__) && defined (FOUR_WAY)     (disabled 8-way variant)
+//   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
+//  gate->scanhash  = (void*)&scanhash_blake_8way;
+//  gate->hash      = (void*)&blakehash_8way;
+#if defined(BLAKE_4WAY)   // set in blake-gate.h when FOUR_WAY and __AVX__ are defined
+  four_way_not_tested();
+  gate->optimizations = FOUR_WAY_OPT;
+  gate->scanhash  = (void*)&scanhash_blake_4way;
+  gate->hash      = (void*)&blakehash_4way;
+#else                     // scalar implementation from blake.c
+  gate->scanhash  = (void*)&scanhash_blake;
+  gate->hash      = (void*)&blakehash;
+#endif
+  return true;
+}
+
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -0,0 +1,21 @@
+#ifndef __BLAKE_GATE_H__
+#define __BLAKE_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define BLAKE_4WAY
+#endif
+
+#if defined (BLAKE_4WAY)
+void blakehash_4way(void *state, const void *input);
+int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
+void blakehash( void *state, const void *input );
+int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -0,0 +1,105 @@
+/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
+/**
+ * BLAKE interface. BLAKE is a family of functions which differ by their
+ * output size; this implementation defines BLAKE for output sizes 224,
+ * 256, 384 and 512 bits. This implementation conforms to the "third
+ * round" specification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_blake.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef __BLAKE_HASH_4WAY__
+#define __BLAKE_HASH_4WAY__   // must spell the #ifndef name exactly or the guard never triggers
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+/**
+ * Output size (in bits) for BLAKE-256.
+ */
+#define SPH_SIZE_blake256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BLAKE-512.
+ */
+#define SPH_SIZE_blake512   512
+
+#endif
+
+#ifdef __AVX__
+typedef struct {   // aliased below as blake256_4way_context; passed as `cc` to blake256_4way_*
+        __m128i buf[16] __attribute__ ((aligned (64)));   // 64-byte aligned buffer
+        __m128i H[8];   // presumably the chaining value, one lane per 32-bit element - TODO confirm
+        __m128i S[4];   // presumably the salt words - TODO confirm
+        size_t ptr;     // presumably the fill position within buf - TODO confirm
+	sph_u32 T0, T1; // presumably the bit counter shared by all lanes - TODO confirm
+} blake_4way_small_context;
+
+typedef blake_4way_small_context blake256_4way_context;
+
+void blake256_4way_init(void *cc);
+void blake256_4way(void *cc, const void *data, size_t len);
+void blake256_4way_close(void *cc, void *dst);
+void blake256_4way_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __AVX2__
+
+typedef struct {   // aliased below as blake512_4way_context; passed as `cc` to blake512_4way_*
+        __m256i buf[16] __attribute__ ((aligned (64)));   // 64-byte aligned buffer
+        __m256i H[8];   // presumably the chaining value, one lane per 64-bit element - TODO confirm
+        __m256i S[4];   // presumably the salt words - TODO confirm
+        size_t ptr;     // presumably the fill position within buf - TODO confirm
+	sph_u64 T0, T1; // presumably the bit counter shared by all lanes - TODO confirm
+} blake_4way_big_context;
+
+typedef blake_4way_big_context blake512_4way_context;
+
+void blake512_4way_init(void *cc);
+void blake512_4way(void *cc, const void *data, size_t len);
+void blake512_4way_close(void *cc, void *dst);
+void blake512_4way_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/blake/blake.c
+++ b/algo/blake/blake.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"
 #include "sph_blake.h"

@@ -90,18 +89,3 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
-int64_t blake_get_max64 ()
-{
-  return 0x7ffffLL;
-}
-
-bool register_blake_algo( algo_gate_t* gate )
-{
-  gate->scanhash  = (void*)&scanhash_blake;
-  gate->hash      = (void*)&blakehash;
-  gate->get_max64 = (void*)&blake_get_max64;
-  return true;
-}
-
-
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -3,16 +3,13 @@
 * tpruvot@github 2015-2016
 */

-#include "miner.h"
 #include "algo-gate-api.h"
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/sph_blake2b.h"

-
-static __thread sph_blake2b_ctx s_midstate;
-static __thread sph_blake2b_ctx s_ctx;
+//static __thread sph_blake2b_ctx s_midstate;
+//static __thread sph_blake2b_ctx s_ctx;
 #define MIDLEN 76
 #define A 64

@@ -28,6 +25,7 @@ void blake2b_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

+/*
 static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
 {
 	s_ctx.outlen = MIDLEN;
@@ -35,6 +33,7 @@ static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
 	sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
 	sph_blake2b_final(&s_ctx, (uint8_t*) output);
 }
+*/

 int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done )
@@ -220,6 +219,8 @@ bool register_blake2b_algo( algo_gate_t* gate )
  gate->hash                  = (void*)&blake2b_hash;
  gate->calc_network_diff     = (void*)&blake2b_calc_network_diff;
  gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->build_extraheader     = (void*)&blake2b_build_extraheader;
  gate->get_new_work          = (void*)&blake2b_get_new_work;
  gate->get_max64             = (void*)&blake2b_get_max64;
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"
 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -0,0 +1,157 @@
+#include "decred-gate.h"
+#include "sph_blake.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+#include <unistd.h>
+
+#if defined (DECRED_4WAY)
+
+static __thread blake256_4way_context blake_mid;
+static __thread bool ctx_midstate_done = false;
+
+// Finish the 4-way BLAKE-256 hash of four interleaved 180-byte decred headers.
+// Resumes from blake_mid, the midstate over the first DECRED_MIDSTATE_LEN bytes
+// built by scanhash_decred_4way, and writes four 32-byte digests to state.
+void decred_hash_4way( void *state, const void *input )
+{
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     uint32_t hash0[8] __attribute__ ((aligned (32)));
+     uint32_t hash1[8] __attribute__ ((aligned (32)));
+     uint32_t hash2[8] __attribute__ ((aligned (32)));
+     uint32_t hash3[8] __attribute__ ((aligned (32)));
+     blake256_4way_context ctx __attribute__ ((aligned (64)));
+
+     // The scalar cross-check buffers (ctx2, hash, sin0..3) are no longer built
+     // on every call; re-declare them before enabling the disabled blocks below.
+     // Lanes are interleaved word by word, so the byte offset is scaled by 4.
+     const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
+     const int tail_len = 180 - DECRED_MIDSTATE_LEN;
+
+     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
+     blake256_4way( &ctx, tail, tail_len );
+     blake256_4way_close( &ctx, vhash );
+/*
+     sph_blake256_init( &ctx2 );
+     sph_blake256( &ctx2, sin0, 180 );
+     sph_blake256_close( &ctx2, hash );
+*/
+/*
+     blake256_4way_init( &ctx );
+     blake256_4way( &ctx, input, 180 );
+     blake256_4way_close( &ctx, vhash );
+*/
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+/*
+        for ( int i = 0; i < 8; i++ )
+          if ( hash[i] != hash0[i] )
+            printf(" hash mismatch, i = %u\n",i);
+
+printf("hash:  %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
+                             *(hash+2), *(hash+3) );
+printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
+                             *(hash0+2), *(hash0+3) );
+printf("\n");
+*/
+
+     memcpy( state,    hash0, 32 );
+     memcpy( state+32, hash1, 32 );
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+
+//     memcpy( state, hash, 32 );
+
+}
+// Scan decred nonces with the 4-way midstate hash; returns the number of lanes found.
+int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done)
+{
+   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+        uint32_t _ALIGN(64) edata[48];
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
+        uint32_t n = first_nonce;
+        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   // copy the 180-byte header as is (no byte swap), then replicate it into 4 lanes
+        ctx_midstate_done = false;   // NOTE(review): never read in the 4-way code shown here
+        memcpy( edata, pdata, 180 );
+
+   // use the old way until  new way updated for size.
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
+   // hash the first DECRED_MIDSTATE_LEN bytes once; the nonce lies beyond them
+   blake256_4way_init( &blake_mid );
+   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
+   // lane j's copy of header word i is vdata[4*i + j]
+   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      * noncep    = n;
+      *(noncep+1) = n+1;
+      *(noncep+2) = n+2;
+      *(noncep+3) = n+3;
+
+      decred_hash_4way( hash, vdata );
+
+      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      {
+          work_set_target_ratio( work, hash );
+          found[0] = true;
+          num_found++;
+          nonces[0] = n;
+          pdata[DECRED_NONCE_INDEX] = n;
+      }
+/* NOTE(review): lane 1 (nonce n+1) is hashed but never tested; disabled code follows
+      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
+      {
+printf("found 1\n");          
+
+printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
+printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
+printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
+printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
+
+          work_set_target_ratio( work, hash+8 );
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+      }
+*/
+      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
+      {
+          work_set_target_ratio( work, hash+16 );
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+      }
+/* NOTE(review): lane 3 (nonce n+3) is hashed but never tested; disabled code follows
+      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
+      {
+printf("found 3\n");          
+
+printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
+printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
+printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
+printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
+
+          work_set_target_ratio( work, hash+24 );
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+      }
+*/
+      n += 2;   // NOTE(review): lane 0 of the next pass re-tests the nonce lane 2 just tested
+//      n += 4;
+  } while ( (num_found == 0) && (n < max_nonce) 
+            && !work_restart[thr_id].restart );
+
+  *hashes_done = n - first_nonce + 1;
+  return num_found;
+}
+
+#endif
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -0,0 +1,174 @@
+#include "decred-gate.h"
+#include <unistd.h>
+#include <memory.h>
+#include <string.h>
+// Return the address of the nonce word inside a decred work data array.
+uint32_t *decred_get_nonceptr( uint32_t *work_data )
+{
+   return work_data + DECRED_NONCE_INDEX;
+}
+
+// Compute the network difficulty from the header's nbits word and return it.
+double decred_calc_network_diff( struct work* work )
+{
+   // sample for diff 43.281 : 1c05ea29
+   // todo: endian reversed on longpoll could be zr5 specific...
+   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
+   uint32_t bits = ( nbits & 0xffffff );
+   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
+   int m;
+   double d = (double)0x0000ffff / (double)bits;
+   for ( m = shift; m < 29; m++ )
+       d *= 256.0;
+   for ( m = 29; m < shift; m++ )
+       d /= 256.0;
+   if ( shift == 28 )
+       d *= 256.0; // testnet
+   if ( opt_debug_diff )
+       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
+                           shift, bits );
+   return d;   // was net_diff: the computed value was logged but never returned
+}
+
+// Stamp random extradata into the work and log when a new block height appears.
+void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
+{
+   char netinfo[64] = { 0 };   // written only through bounded snprintf
+   // some random extradata to make the work unique (unsigned: no int overflow)
+   work->data[ DECRED_XNONCE_INDEX ] = (uint32_t)rand() * 4;
+   work->height = work->data[32];
+   if ( have_longpoll || work->height <= *net_blocks + 1 )
+      return;
+   if ( opt_showdiff && net_diff > 0. )
+   {
+      if ( net_diff != work->targetdiff )
+         snprintf( netinfo, sizeof(netinfo), ", diff %.3f, target %.1f",
+                   net_diff, work->targetdiff );
+      else
+         snprintf( netinfo, sizeof(netinfo), ", diff %.3f", net_diff );
+   }
+   applog( LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
+                     netinfo );
+   *net_blocks = work->height - 1;
+}
+// Format the stratum "mining.submit" request for a decred share into req.
+void decred_be_build_stratum_request( char *req, struct work *work,
+                                      struct stratum_ctx *sctx )
+{
+   unsigned char *xnonce2str;
+   uint32_t ntime, nonce;
+   char ntimestr[9], noncestr[9];   // presumably 8 hex digits + NUL each - TODO confirm
+   // ntime and nonce are stored big-endian, then rendered as hex text
+   be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
+   be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
+   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
+   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
+   xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),
+                                     sctx->xnonce1_size );
+   snprintf( req, JSON_BUF_LEN,
+        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
+         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
+   free(xnonce2str);   // the string returned by abin2hex is released here
+}
+// Build the decred block header in g_work->data from the current stratum job.
+// The job's coinbase supplies the merkle root (first 32 bytes) and then the
+// remaining header fields; sctx->xnonce1 supplies the extradata words.
+void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
+{
+   uchar merkle_root[64] = { 0 };
+   uint32_t extraheader[32] = { 0 };
+   int headersize;
+   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
+   size_t t;
+   int i;
+
+   // getwork over stratum, getwork merkle + header passed in coinb1
+   memcpy(merkle_root, sctx->job.coinbase, 32);
+   // Clamp in signed arithmetic. Comparing a negative int with sizeof() turns
+   // it into a huge unsigned value, which used to select the full 128 bytes.
+   headersize = (int)sctx->job.coinbase_size - 32;
+   if ( headersize < 0 )
+      headersize = 0;
+   else if ( headersize > (int)sizeof(extraheader) )
+      headersize = (int)sizeof(extraheader);
+   memcpy( extraheader, &sctx->job.coinbase[32], headersize );
+
+   // Increment extranonce2
+   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
+
+   // Assemble block header
+   memset( g_work->data, 0, sizeof(g_work->data) );
+   g_work->data[0] = le32dec( sctx->job.version );
+   for ( i = 0; i < 8; i++ )
+      g_work->data[1 + i] = swab32(
+                              le32dec( (uint32_t *) sctx->job.prevhash + i ) );
+   for ( i = 0; i < 8; i++ )
+      g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
+
+   for ( i = 0; i < headersize/4; i++ ) // header
+      g_work->data[17 + i] = extraheader[i];
+   // extradata
+   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
+      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
+   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
+      g_work->data[i] = 0;
+   // unsigned arithmetic: rand()*4 and the shift could overflow a signed int
+   g_work->data[37] = ( (uint32_t)rand() * 4 ) << 8;
+   // block header suffix from coinb2 (stake version)
+   memcpy( &g_work->data[44],
+           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
+   sctx->bloc_height = g_work->data[32];
+   //applog_hex(work->data, 180);
+   //applog_hex(&work->data[36], 36);
+}
+
+// Installed as gate->ready_to_mine; returns false while the work is stale or empty.
+bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
+                           int thr_id )
+{
+   if ( have_stratum && strcmp(stratum->job.job_id, work->job_id)  )
+      // need to regen g_work..
+      return false;
+   if ( have_stratum && !work->data[0] && !opt_benchmark )   // work data still zeroed
+   {
+      sleep(1);   // back off for a second before reporting not ready
+      return false;
+   }
+   // extradata: prevent duplicates
+   work->data[ DECRED_XNONCE_INDEX     ] += 1;
+   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;   // mix the thread id into the extradata
+   return true;
+}
+
+// Fill the algo gate with the decred hooks and work layout; always returns true.
+bool register_decred_algo( algo_gate_t* gate )
+{
+#if defined(DECRED_4WAY)   // set in decred-gate.h when FOUR_WAY and __AVX__ are defined
+  four_way_not_tested();
+  gate->optimizations = FOUR_WAY_OPT;
+  gate->scanhash  = (void*)&scanhash_decred_4way;
+  gate->hash      = (void*)&decred_hash_4way;
+#else                      // scalar implementation from decred.c
+  gate->optimizations = SSE2_OPT;
+  gate->scanhash  = (void*)&scanhash_decred;
+  gate->hash      = (void*)&decred_hash;
+#endif
+  // decred-specific hooks and work-data layout, common to both variants
+  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
+  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
+  gate->display_extra_data    = (void*)&decred_decode_extradata;
+  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
+  gate->build_extraheader     = (void*)&decred_build_extraheader;
+  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
+  gate->nbits_index           = DECRED_NBITS_INDEX;
+  gate->ntime_index           = DECRED_NTIME_INDEX;
+  gate->nonce_index           = DECRED_NONCE_INDEX;
+  gate->work_data_size        = DECRED_DATA_SIZE;
+  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE;
+  allow_mininginfo            = false;   // registration also clears these two globals
+  have_gbt                    = false;
+  return true;
+}
+
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -0,0 +1,36 @@
+#ifndef __DECRED_GATE_H__
+#define __DECRED_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#define DECRED_NBITS_INDEX 29    // work->data word read as nbits by decred_calc_network_diff
+#define DECRED_NTIME_INDEX 34    // work->data word sent as ntime in the stratum submit
+#define DECRED_NONCE_INDEX 35    // work->data word holding the nonce
+#define DECRED_XNONCE_INDEX 36   // first work->data word of the extradata
+#define DECRED_DATA_SIZE 192     // installed as gate->work_data_size
+#define DECRED_WORK_COMPARE_SIZE 140   // installed as gate->work_cmp_size
+#define DECRED_MIDSTATE_LEN 128  // bytes per lane hashed once into the BLAKE midstate
+
+#if defined (__AVX2__) 
+//void blakehash_84way(void *state, const void *input);
+//int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+//                         uint64_t *hashes_done );
+#endif
+
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define DECRED_4WAY
+#endif
+
+#if defined (DECRED_4WAY)
+void decred_hash_4way(void *state, const void *input);
+int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+#endif
+
+void decred_hash( void *state, const void *input );
+int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+#endif
+
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "decred-gate.h"
 #include "sph_blake.h"

 #include <string.h>
@@ -15,33 +14,33 @@
 #define max(a,b) (a<b ? b : a)
 #endif
 */
-
+/*
 #define DECRED_NBITS_INDEX 29
 #define DECRED_NTIME_INDEX 34
 #define DECRED_NONCE_INDEX 35
 #define DECRED_XNONCE_INDEX 36
 #define DECRED_DATA_SIZE 192
 #define DECRED_WORK_COMPARE_SIZE 140
-
+*/
 static __thread sph_blake256_context blake_mid;
 static __thread bool ctx_midstate_done = false;

 void decred_hash(void *state, const void *input)
 {
-        #define MIDSTATE_LEN 128
+//        #define MIDSTATE_LEN 128
        sph_blake256_context ctx __attribute__ ((aligned (64)));

        uint8_t *ending = (uint8_t*) input;
-        ending += MIDSTATE_LEN;
+        ending += DECRED_MIDSTATE_LEN;

        if (!ctx_midstate_done) {
                sph_blake256_init(&blake_mid);
-                sph_blake256(&blake_mid, input, MIDSTATE_LEN);
+                sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
                ctx_midstate_done = true;
        }
        memcpy(&ctx, &blake_mid, sizeof(blake_mid));

-        sph_blake256(&ctx, ending, (180 - MIDSTATE_LEN));
+        sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
        sph_blake256_close(&ctx, state);
 }

@@ -60,9 +59,9 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

-        #define DCR_NONCE_OFT32 35
+//        #define DCR_NONCE_OFT32 35

-        const uint32_t first_nonce = pdata[DCR_NONCE_OFT32];
+        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];

        uint32_t n = first_nonce;
@@ -82,7 +81,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t

        do {
                //be32enc(&endiandata[DCR_NONCE_OFT32], n);
-                endiandata[DCR_NONCE_OFT32] = n;
+                endiandata[DECRED_NONCE_INDEX] = n;
                decred_hash(hash32, endiandata);

                if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
@@ -93,7 +92,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
                        applog_hash(ptarget);
                        applog_compare_hash(hash32, ptarget);
 #endif
-                        pdata[DCR_NONCE_OFT32] = n;
+                        pdata[DECRED_NONCE_INDEX] = n;
                        return 1;
                }

@@ -102,24 +101,17 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
        } while (n < max_nonce && !work_restart[thr_id].restart);

        *hashes_done = n - first_nonce + 1;
-        pdata[DCR_NONCE_OFT32] = n;
+        pdata[DECRED_NONCE_INDEX] = n;
        return 0;
 }

+/*
 uint32_t *decred_get_nonceptr( uint32_t *work_data )
 {
   return &work_data[ DECRED_NONCE_INDEX ];
 }

-// does decred need a custom stratum_get_g_work to fix nicehash
-//  bad extranonce2 size?
-// 
-// does decred need a custom init_nonce?
-// does it need to increment nonce, seems not because gen_work_now always
-// returns true
-
 double decred_calc_network_diff( struct work* work )
-//void decred_calc_network_diff( struct work* work )
 {
   // sample for diff 43.281 : 1c05ea29
   // todo: endian reversed on longpoll could be zr5 specific...
@@ -181,7 +173,7 @@ void decred_be_build_stratum_request( char *req, struct work *work,
         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
   free(xnonce2str);
 }
-
+*/
 /*
 // data shared between gen_merkle_root and build_extraheader.
 __thread uint32_t decred_extraheader[32] = { 0 };
@@ -197,7 +189,7 @@ void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
 }
 */

-
+/*
 #define min(a,b) (a>b ? (b) :(a))

 void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -235,11 +227,15 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
   for ( i = 0; i < headersize/4; i++ ) // header
      g_work->data[17 + i] = extraheader[i];
   // extradata
+
   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
      g_work->data[i] = 0;
   g_work->data[37] = (rand()*4) << 8;
+   // block header suffix from coinb2 (stake version)
+   memcpy( &g_work->data[44],
+           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
   sctx->bloc_height = g_work->data[32];
   //applog_hex(work->data, 180);
   //applog_hex(&work->data[36], 36);
@@ -274,6 +270,8 @@ bool register_decred_algo( algo_gate_t* gate )
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
  gate->display_extra_data    = (void*)&decred_decode_extradata;
  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->build_extraheader     = (void*)&decred_build_extraheader;
  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
  gate->nbits_index           = DECRED_NBITS_INDEX;
@@ -285,4 +283,4 @@ bool register_decred_algo( algo_gate_t* gate )
  have_gbt                    = false;
  return true;
 }
-
+*/
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -0,0 +1,206 @@
+#include "pentablake-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake-hash-4way.h"
+#include "sph_blake.h"
+
+//#define DEBUG_ALGO
+
+#ifdef PENTABLAKE_4WAY
+
+// Pentablake, four lanes at a time: five chained BLAKE-512 passes.
+//
+// input:  four 80 byte block headers interleaved in 64 bit words, i.e. the
+//         layout scanhash_pentablake_4way builds with mm256_interleave_4x64.
+// output: 4 x 32 bytes. The first 32 bytes of each lane's final digest are
+//         stored lane after lane (lane 0 at offset 0, lane 1 at 32, ...).
+//
+// Define DEBUG_ALGO to cross-check lane 0 of the first pass against the
+// scalar sph_blake512 implementation. The check costs a full extra hash per
+// call, so it is compiled out by default.
+void pentablakehash_4way( void *output, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     // byte pointer: arithmetic on void* is a compiler extension
+     uint8_t *out = (uint8_t*)output;
+     blake512_4way_context ctx;
+
+     // pass 1 consumes the 80 byte header
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, input, 80 );
+     blake512_4way_close( &ctx, vhash );
+
+#ifdef DEBUG_ALGO
+     {
+        unsigned char _ALIGN(32) hash[128];
+        uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
+        sph_blake512_context ctx2_blake;
+
+        // recompute lane 0 with the scalar code and compare the digests
+        mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
+        sph_blake512_init( &ctx2_blake );
+        sph_blake512( &ctx2_blake, sin0, 80 );
+        sph_blake512_close( &ctx2_blake, (void*)hash );
+
+        mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+        const uint64_t *hash64 = (const uint64_t*)hash;
+        for ( unsigned int i = 0; i < 8; i++ )
+        {
+           if ( hash0[i] != hash64[i] )
+              printf("hash mismatch %u\n",i);
+        }
+     }
+#endif
+
+     // passes 2..5 each re-hash the 64 byte digest of the previous pass
+     for ( int pass = 0; pass < 4; pass++ )
+     {
+        blake512_4way_init( &ctx );
+        blake512_4way( &ctx, vhash, 64 );
+        blake512_4way_close( &ctx, vhash );
+     }
+
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     memcpy( out,      hash0, 32 );
+     memcpy( out + 32, hash1, 32 );
+     memcpy( out + 64, hash2, 32 );
+     memcpy( out + 96, hash3, 32 );
+}
+
+// Scan nonces starting at work->data[19], four per iteration, with
+// pentablakehash_4way.
+//
+// Returns 1 as soon as one lane yields a hash that passes the quick mask
+// test and fulltest() against work->target. In that case work->nfound[lane]
+// is set, and work->nonces[lane] and work->data[19] hold the nonce that lane
+// actually hashed (n + lane).
+// Returns 0 when n reaches max_nonce or a work restart is flagged for this
+// thread; work->data[19] is then the next untested nonce.
+// *hashes_done receives the number of nonces hashed by this call.
+int scanhash_pentablake_4way( int thr_id, struct work *work,
+                              uint32_t max_nonce, uint64_t *hashes_done )
+{
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[32] __attribute__ ((aligned (64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t Htarg = ptarget[7];
+    // start at the first assigned nonce; the loop below does not pre-increment
+    uint32_t n = first_nonce;
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    // Location of each lane's nonce (header word 19) in the interleaved
+    // data: 32 bit index 9*8 + 1 for lane 0, then every second word.
+    uint32_t *noncep[4] = { vdata + 73, vdata + 75, vdata + 77, vdata + 79 };
+
+    // Quick reject: for the first htmax[m] >= Htarg, a candidate must have
+    // all bits of masks[m] clear in hash word 7 before fulltest() is run.
+    static const uint64_t htmax[] = {
+        0,
+        0xF,
+        0xFF,
+        0xFFF,
+        0xFFFF,
+        0x10000000
+    };
+    static const uint32_t masks[] = {
+        0xFFFFFFFF,
+        0xFFFFFFF0,
+        0xFFFFFF00,
+        0xFFFFF000,
+        0xFFFF0000,
+        0
+    };
+
+    // we need bigendian data...
+    swab32_array( endiandata, pdata, 20 );
+
+    // the same header goes into all four lanes, only the nonce differs
+    uint64_t *edata = (uint64_t*)endiandata;
+    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+    for ( int m = 0; m < 6; m++ )
+    {
+        if ( Htarg > htmax[m] )
+           continue;
+
+        const uint32_t mask = masks[m];
+        do {
+           for ( int lane = 0; lane < 4; lane++ )
+           {
+              found[lane] = false;
+              be32enc( noncep[lane], n + lane );
+           }
+
+           pentablakehash_4way( hash, vdata );
+
+           // return immediately on nonce found, only one submit
+           for ( int lane = 0; lane < 4; lane++ )
+           {
+              uint32_t *lane_hash = hash + 8*lane;
+              if ( !( lane_hash[7] & mask ) && fulltest( lane_hash, ptarget ) )
+              {
+                 found[lane] = true;
+                 // report the nonce this lane hashed, not the batch base
+                 nonces[lane] = n + lane;
+                 pdata[19] = n + lane;
+                 // the whole batch of four has been hashed
+                 *hashes_done = n - first_nonce + 4;
+                 return 1;
+              }
+           }
+           n += 4;
+
+        } while ( n < max_nonce && !work_restart[thr_id].restart );
+        break;
+    }
+
+    *hashes_done = n - first_nonce;
+    pdata[19] = n;
+    return 0;
+}
+
+#endif
--- a/algo/blake/pentablake-gate.c
+++ b/algo/blake/pentablake-gate.c
@@ -0,0 +1,16 @@
+#include "pentablake-gate.h"
+
+// Install the pentablake entry points into the algo gate.
+// The 4 way scanhash/hash pair is used when PENTABLAKE_4WAY is defined, the
+// scalar pair otherwise. Always returns true.
+bool register_pentablake_algo( algo_gate_t* gate )
+{
+#if defined (PENTABLAKE_4WAY)
+    gate->scanhash  = (void*)&scanhash_pentablake_4way;
+    gate->hash      = (void*)&pentablakehash_4way;
+#else
+    gate->scanhash  = (void*)&scanhash_pentablake;
+    gate->hash      = (void*)&pentablakehash;
+#endif
+    gate->optimizations = FOUR_WAY_OPT;
+    gate->get_max64 = (void*)&get_max64_0x3ffff;
+    return true;
+}
+
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -0,0 +1,21 @@
+// Declarations shared by the scalar and 4 way pentablake sources and by the
+// gate registration code.
+#ifndef __PENTABLAKE_GATE_H__
+#define __PENTABLAKE_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// The 4 way code path is compiled only when the build requests FOUR_WAY and
+// the compiler targets AVX.
+// NOTE(review): the 4 way sources call mm256_* 64 bit integer helpers;
+// confirm that __AVX__ (rather than __AVX2__) is a sufficient requirement.
+#if defined(FOUR_WAY) && defined(__AVX__)
+  #define PENTABLAKE_4WAY
+#endif
+
+#if defined(PENTABLAKE_4WAY)
+// 4 way hash: `input` is four interleaved 80 byte headers, `state` receives
+// 4 x 32 bytes of output, one lane after the other.
+void pentablakehash_4way( void *state, const void *input );
+// 4 way nonce scan; returns 1 when a nonce was found, 0 otherwise.
+int scanhash_pentablake_4way( int thr_id, struct work *work,
+                              uint32_t max_nonce, uint64_t *hashes_done );
+#endif
+
+// Scalar hash and nonce scan, always available.
+void pentablakehash( void *state, const void *input );
+int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "pentablake-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -111,11 +110,3 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 } 

-bool register_pentablake_algo( algo_gate_t* gate )
-{
-    gate->scanhash  = (void*)&scanhash_pentablake;
-    gate->hash      = (void*)&pentablakehash;
-    gate->get_max64 = (void*)&get_max64_0x3ffff;
-    return true;
-};
-
--- a/algo/blake/sph_blake.c
+++ b/algo/blake/sph_blake.c
@@ -813,6 +813,7 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)

 	buf = sc->buf;
 	ptr = sc->ptr;
+
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
@@ -871,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
 	} else {
 		sc->T0 -= 512 - bit_len;
 	}
+
 	if (bit_len <= 446) {
 		memset(u.buf + ptr + 1, 0, 55 - ptr);
 		if (out_size_w32 == 8)
@@ -890,9 +892,9 @@ blake32_close(sph_blake_small_context *sc,
 		sph_enc32be_aligned(u.buf + 60, tl);
 		blake32(sc, u.buf, 64);
 	}
-	out = dst;
-	for (k = 0; k < out_size_w32; k ++)
-		sph_enc32be(out + (k << 2), sc->H[k]);
+        out = dst;
+        for (k = 0; k < out_size_w32; k ++)
+                sph_enc32be(out + (k << 2), sc->H[k]);
 }

 #if SPH_64
@@ -982,9 +984,11 @@ blake64_close(sph_blake_big_context *sc,
 			u.buf[111] |= 1;
 		sph_enc64be_aligned(u.buf + 112, th);
 		sph_enc64be_aligned(u.buf + 120, tl);
+
 		blake64(sc, u.buf + ptr, 128 - ptr);
 	} else {
 		memset(u.buf + ptr + 1, 0, 127 - ptr);
+
 		blake64(sc, u.buf + ptr, 128 - ptr);
 		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
 		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
@@ -993,6 +997,7 @@ blake64_close(sph_blake_big_context *sc,
 			u.buf[111] = 1;
 		sph_enc64be_aligned(u.buf + 112, th);
 		sph_enc64be_aligned(u.buf + 120, tl);
+
 		blake64(sc, u.buf, 128);
 	}
 	out = dst;
--- a/algo/bmw/bmw-hash-4way.c
+++ b/algo/bmw/bmw-hash-4way.c
@@ -0,0 +1,969 @@
+/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * BMW implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+#include "bmw-hash-4way.h"
+
+#if defined(__AVX2__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+//#include "sph_bmw.h"
+
+//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
+#define SPH_SMALL_FOOTPRINT_BMW   1
+//#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+//#undef SPH_ROTL64
+//#define SPH_ROTL64(x,n)  (((x) << (n)) | ((x) >> (64 - (n))))
+//#define SPH_ROTL64(x,n)  mm256_rotl_64(x,n)
+
+// Sixteen 32 bit words holding the byte values 0x40..0x7F in ascending
+// order, four bytes per word, most significant byte first.
+// Presumably the initial chaining value of the 256 bit variant; it is not
+// referenced by the 64 bit code visible in this part of the file.
+static const sph_u32 IV256[] = {
+	SPH_C32(0x40414243), SPH_C32(0x44454647),
+	SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
+	SPH_C32(0x50515253), SPH_C32(0x54555657),
+	SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
+	SPH_C32(0x60616263), SPH_C32(0x64656667),
+	SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
+	SPH_C32(0x70717273), SPH_C32(0x74757677),
+	SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
+};
+
+#if SPH_64
+
+// Sixteen 64 bit words holding the byte values 0x80..0xFF in ascending
+// order, eight bytes per word, most significant byte first.
+// Presumably the initial chaining value of the 512 bit variant.
+static const sph_u64 IV512[] = {
+	SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
+	SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
+	SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
+	SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
+	SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
+	SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
+	SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
+	SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+};
+
+#endif
+
+// Preprocessor helpers. XCAT pastes two tokens after macro-expanding them
+// (the extra XCAT_ level forces the expansion). LPAR expands to a bare '('
+// so that a macro invocation can be assembled from argument lists that are
+// themselves macros. In this part of the file LPAR is referenced only by
+// the commented-out 32 bit expand macros below.
+#define XCAT(x, y)    XCAT_(x, y)
+#define XCAT_(x, y)   x ## y
+
+#define LPAR   (
+
+/*
+#define ss0(x)    (((x) >> 1) ^ SPH_T32((x) << 3) \
+                  ^ SPH_ROTL32(x,  4) ^ SPH_ROTL32(x, 19))
+#define ss1(x)    (((x) >> 1) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x,  8) ^ SPH_ROTL32(x, 23))
+#define ss2(x)    (((x) >> 2) ^ SPH_T32((x) << 1) \
+                  ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25))
+#define ss3(x)    (((x) >> 2) ^ SPH_T32((x) << 2) \
+                  ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29))
+#define ss4(x)    (((x) >> 1) ^ (x))
+#define ss5(x)    (((x) >> 2) ^ (x))
+#define rs1(x)    SPH_ROTL32(x,  3)
+#define rs2(x)    SPH_ROTL32(x,  7)
+#define rs3(x)    SPH_ROTL32(x, 13)
+#define rs4(x)    SPH_ROTL32(x, 16)
+#define rs5(x)    SPH_ROTL32(x, 19)
+#define rs6(x)    SPH_ROTL32(x, 23)
+#define rs7(x)    SPH_ROTL32(x, 27)
+
+#define Ks(j)   SPH_T32((sph_u32)(j) * SPH_C32(0x05555555))
+
+#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \
+	(SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \
+		- SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m))
+
+#define expand1s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \
+		+ ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \
+		+ ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \
+		+ ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand1s(qf, mf, hf, i16) \
+	expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand1s_(qf, mf, hf, i16, ix, iy) \
+	expand1s_inner LPAR qf, mf, hf, i16, ix, iy)
+
+#define expand2s_inner(qf, mf, hf, i16, \
+		i0, i1, i2, i3, i4, i5, i6, i7, i8, \
+		i9, i10, i11, i12, i13, i14, i15, \
+		i0m, i1m, i3m, i4m, i7m, i10m, i11m) \
+	SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \
+		+ qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \
+		+ qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \
+		+ qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \
+		+ add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16))
+
+#define expand2s(qf, mf, hf, i16) \
+	expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16)
+#define expand2s_(qf, mf, hf, i16, ix, iy) \
+	expand2s_inner LPAR qf, mf, hf, i16, ix, iy)
+*/
+#if SPH_64
+
+// 64 bit mixing functions applied to every 64 bit element of a __m256i.
+// mm256_rotl_64 is defined elsewhere; presumably a per-element rotate left.
+// All of these macros evaluate their argument more than once, so pass only
+// side-effect free expressions.
+
+// sb0(x) = (x >> 1) ^ (x << 3) ^ rotl(x, 4) ^ rotl(x, 37)
+#define sb0(x) \
+   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
+                                       _mm256_slli_epi64( (x), 3) ), \
+                     _mm256_xor_si256( mm256_rotl_64( (x), 4), \
+                                       mm256_rotl_64( (x), 37) ) )
+
+// sb1(x) = (x >> 1) ^ (x << 2) ^ rotl(x, 13) ^ rotl(x, 43)
+#define sb1(x) \
+   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 1), \
+                                       _mm256_slli_epi64( (x), 2) ), \
+                     _mm256_xor_si256( mm256_rotl_64( (x), 13), \
+                                       mm256_rotl_64( (x), 43) ) )
+
+// sb2(x) = (x >> 2) ^ (x << 1) ^ rotl(x, 19) ^ rotl(x, 53)
+#define sb2(x) \
+   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
+                                       _mm256_slli_epi64( (x), 1) ), \
+                     _mm256_xor_si256( mm256_rotl_64( (x), 19), \
+                                       mm256_rotl_64( (x), 53) ) )
+
+// sb3(x) = (x >> 2) ^ (x << 2) ^ rotl(x, 28) ^ rotl(x, 59)
+#define sb3(x) \
+   _mm256_xor_si256( _mm256_xor_si256( _mm256_srli_epi64( (x), 2), \
+                                       _mm256_slli_epi64( (x), 2) ), \
+                     _mm256_xor_si256( mm256_rotl_64( (x), 28), \
+                                       mm256_rotl_64( (x), 59) ) )
+
+// sb4(x) = x ^ (x >> 1)
+#define sb4(x) \
+  _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 1 ) )
+
+// sb5(x) = x ^ (x >> 2)
+#define sb5(x) \
+  _mm256_xor_si256( (x), _mm256_srli_epi64( (x), 2 ) )
+
+// rb1..rb7: rotations by 5, 11, 27, 32, 37, 43 and 53 bits.
+#define rb1(x)    mm256_rotl_64( x,  5 ) 
+#define rb2(x)    mm256_rotl_64( x, 11 ) 
+#define rb3(x)    mm256_rotl_64( x, 27 ) 
+#define rb4(x)    mm256_rotl_64( x, 32 ) 
+#define rb5(x)    mm256_rotl_64( x, 37 ) 
+#define rb6(x)    mm256_rotl_64( x, 43 ) 
+#define rb7(x)    mm256_rotl_64( x, 53 ) 
+
+// rol_off( M, j, off ): with k = (j + off) mod 16, yields M[k] rotated by
+// k + 1 bits, so the rotation count ranges over 1..16.
+#define rol_off( M, j, off ) \
+   mm256_rotl_64( M[ ( (j) + (off) ) & 15 ] , \
+                   ( ( (j) + (off) ) & 15 ) + 1 )
+
+// add_elt_b( M, H, j ) =
+//    ( rol_off(M,j,0) + rol_off(M,j,3) - rol_off(M,j,10)
+//      + (j + 16) * 0x0555555555555555 ) ^ H[ (j + 7) mod 16 ]
+// The constant is broadcast to all four 64 bit elements; additions and
+// subtractions are per 64 bit element.
+#define add_elt_b( M, H, j ) \
+   _mm256_xor_si256( \
+      _mm256_add_epi64( \
+            _mm256_sub_epi64( _mm256_add_epi64( rol_off( M, j, 0 ), \
+                                                rol_off( M, j, 3 ) ), \
+                             rol_off( M, j, 10 ) ), \
+            _mm256_set1_epi64x( ( (j) + 16 ) * 0x0555555555555555ULL ) ), \
+       H[ ( (j)+7 ) & 15 ] )
+          
+// expand1b( qt, M, H, i ): value for qt[i] computed from the sixteen
+// preceding entries qt[i-16] .. qt[i-1]. The entries are passed through
+// sb1, sb2, sb3, sb0 in a repeating cycle, summed per 64 bit element as a
+// balanced tree, and add_elt_b( M, H, i-16 ) is added to the total.
+// Requires i >= 16.
+#define expand1b( qt, M, H, i ) \
+   _mm256_add_epi64( \
+      _mm256_add_epi64( \
+         _mm256_add_epi64( \
+             _mm256_add_epi64( \
+                _mm256_add_epi64( sb1( qt[ (i)-16 ] ), \
+                                  sb2( qt[ (i)-15 ] ) ), \
+                _mm256_add_epi64( sb3( qt[ (i)-14 ] ), \
+                                  sb0( qt[ (i)-13 ] ) ) ), \
+             _mm256_add_epi64( \
+                _mm256_add_epi64( sb1( qt[ (i)-12 ] ), \
+                                  sb2( qt[ (i)-11 ] ) ), \
+                _mm256_add_epi64( sb3( qt[ (i)-10 ] ), \
+                                  sb0( qt[ (i)- 9 ] ) ) ) ), \
+         _mm256_add_epi64( \
+             _mm256_add_epi64( \
+                _mm256_add_epi64( sb1( qt[ (i)- 8 ] ), \
+                                  sb2( qt[ (i)- 7 ] ) ), \
+                _mm256_add_epi64( sb3( qt[ (i)- 6 ] ), \
+                                  sb0( qt[ (i)- 5 ] ) ) ), \
+             _mm256_add_epi64( \
+                _mm256_add_epi64( sb1( qt[ (i)- 4 ] ), \
+                                  sb2( qt[ (i)- 3 ] ) ), \
+                _mm256_add_epi64( sb3( qt[ (i)- 2 ] ), \
+                                  sb0( qt[ (i)- 1 ] ) ) ) ) ), \
+      add_elt_b( M, H, (i)-16 ) )
+
+// expand2b( qt, M, H, i ): cheaper variant of expand1b. Entries at even
+// distance (qt[i-16], qt[i-14], ... qt[i-4]) are added unchanged, the odd
+// ones qt[i-15] .. qt[i-3] go through rb1 .. rb7 in order, the last two use
+// sb4( qt[i-2] ) and sb5( qt[i-1] ); add_elt_b( M, H, i-16 ) is added to
+// the total. Requires i >= 16.
+#define expand2b( qt, M, H, i) \
+   _mm256_add_epi64( \
+      _mm256_add_epi64( \
+         _mm256_add_epi64( \
+             _mm256_add_epi64( \
+                _mm256_add_epi64( qt[ (i)-16 ], rb1( qt[ (i)-15 ] ) ), \
+                _mm256_add_epi64( qt[ (i)-14 ], rb2( qt[ (i)-13 ] ) ) ), \
+             _mm256_add_epi64( \
+                _mm256_add_epi64( qt[ (i)-12 ], rb3( qt[ (i)-11 ] ) ), \
+                _mm256_add_epi64( qt[ (i)-10 ], rb4( qt[ (i)- 9 ] ) ) ) ), \
+         _mm256_add_epi64( \
+             _mm256_add_epi64( \
+                _mm256_add_epi64( qt[ (i)- 8 ], rb5( qt[ (i)- 7 ] ) ), \
+                _mm256_add_epi64( qt[ (i)- 6 ], rb6( qt[ (i)- 5 ] ) ) ), \
+             _mm256_add_epi64( \
+                _mm256_add_epi64( qt[ (i)- 4 ], rb7( qt[ (i)- 3 ] ) ), \
+                _mm256_add_epi64( sb4( qt[ (i)- 2 ] ), \
+                                  sb5( qt[ (i)- 1 ] ) ) ) ) ), \
+      add_elt_b( M, H, (i)-16 ) )
+
+#endif
+
+/*
+#define MAKE_W( i0, op01, i1, op12, i2, op23, i3, op34, i4) \
+        ((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \
+        op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4)))
+*/
+
+/*
+#define Ws0    MAKE_W(SPH_T32,  5, -,  7, +, 10, +, 13, +, 14)
+#define Ws1    MAKE_W(SPH_T32,  6, -,  8, +, 11, +, 14, -, 15)
+#define Ws2    MAKE_W(SPH_T32,  0, +,  7, +,  9, -, 12, +, 15)
+#define Ws3    MAKE_W(SPH_T32,  0, -,  1, +,  8, -, 10, +, 13)
+#define Ws4    MAKE_W(SPH_T32,  1, +,  2, +,  9, -, 11, -, 14)
+#define Ws5    MAKE_W(SPH_T32,  3, -,  2, +, 10, -, 12, +, 15)
+#define Ws6    MAKE_W(SPH_T32,  4, -,  0, -,  3, -, 11, +, 13)
+#define Ws7    MAKE_W(SPH_T32,  1, -,  4, -,  5, -, 12, -, 14)
+#define Ws8    MAKE_W(SPH_T32,  2, -,  5, -,  6, +, 13, -, 15)
+#define Ws9    MAKE_W(SPH_T32,  0, -,  3, +,  6, -,  7, +, 14)
+#define Ws10   MAKE_W(SPH_T32,  8, -,  1, -,  4, -,  7, +, 15)
+#define Ws11   MAKE_W(SPH_T32,  8, -,  0, -,  2, -,  5, +,  9)
+#define Ws12   MAKE_W(SPH_T32,  1, +,  3, -,  6, -,  9, +, 10)
+#define Ws13   MAKE_W(SPH_T32,  2, +,  4, +,  7, +, 10, +, 11)
+#define Ws14   MAKE_W(SPH_T32,  3, -,  5, +,  8, -, 11, -, 12)
+#define Ws15   MAKE_W(SPH_T32, 12, -,  4, -,  6, -,  9, +, 13)
+
+#if SPH_SMALL_FOOTPRINT_BMW
+
+#define MAKE_Qas   do { \
+		unsigned u; \
+		sph_u32 Ws[16]; \
+		Ws[ 0] = Ws0; \
+		Ws[ 1] = Ws1; \
+		Ws[ 2] = Ws2; \
+		Ws[ 3] = Ws3; \
+		Ws[ 4] = Ws4; \
+		Ws[ 5] = Ws5; \
+		Ws[ 6] = Ws6; \
+		Ws[ 7] = Ws7; \
+		Ws[ 8] = Ws8; \
+		Ws[ 9] = Ws9; \
+		Ws[10] = Ws10; \
+		Ws[11] = Ws11; \
+		Ws[12] = Ws12; \
+		Ws[13] = Ws13; \
+		Ws[14] = Ws14; \
+		Ws[15] = Ws15; \
+		for (u = 0; u < 15; u += 5) { \
+			qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \
+			qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \
+			qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \
+			qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \
+			qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \
+		} \
+		qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \
+	} while (0)
+
+#define MAKE_Qbs   do { \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+#else
+
+#define MAKE_Qas   do { \
+		qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \
+		qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \
+		qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \
+		qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \
+		qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \
+		qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \
+		qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \
+		qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \
+		qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \
+		qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \
+		qt[10] = SPH_T32(ss0(Ws10) + H(11)); \
+		qt[11] = SPH_T32(ss1(Ws11) + H(12)); \
+		qt[12] = SPH_T32(ss2(Ws12) + H(13)); \
+		qt[13] = SPH_T32(ss3(Ws13) + H(14)); \
+		qt[14] = SPH_T32(ss4(Ws14) + H(15)); \
+		qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \
+	} while (0)
+
+#define MAKE_Qbs   do { \
+		qt[16] = expand1s(Qs, M, H, 16); \
+		qt[17] = expand1s(Qs, M, H, 17); \
+		qt[18] = expand2s(Qs, M, H, 18); \
+		qt[19] = expand2s(Qs, M, H, 19); \
+		qt[20] = expand2s(Qs, M, H, 20); \
+		qt[21] = expand2s(Qs, M, H, 21); \
+		qt[22] = expand2s(Qs, M, H, 22); \
+		qt[23] = expand2s(Qs, M, H, 23); \
+		qt[24] = expand2s(Qs, M, H, 24); \
+		qt[25] = expand2s(Qs, M, H, 25); \
+		qt[26] = expand2s(Qs, M, H, 26); \
+		qt[27] = expand2s(Qs, M, H, 27); \
+		qt[28] = expand2s(Qs, M, H, 28); \
+		qt[29] = expand2s(Qs, M, H, 29); \
+		qt[30] = expand2s(Qs, M, H, 30); \
+		qt[31] = expand2s(Qs, M, H, 31); \
+	} while (0)
+
+#endif
+
+#define MAKE_Qs   do { \
+		MAKE_Qas; \
+		MAKE_Qbs; \
+	} while (0)
+
+#define Qs(j)   (qt[j])
+*/
+#if SPH_64
+
+// Wb0 .. Wb15: signed five-term sums of X[k] = M[k] ^ H[k], evaluated per
+// 64 bit element. They expand in a scope where `M` and `H` name arrays of
+// __m256i (see compress_big). The sign patterns are the same as in the
+// commented-out 32 bit Ws0 .. Ws15 table above.
+
+// Wb0 = X5 - X7 + X10 + X13 + X14
+#define Wb0 \
+   _mm256_add_epi64( \
+       _mm256_add_epi64( \
+          _mm256_add_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 5], H[ 5] ), \
+                               _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+             _mm256_xor_si256( M[10], H[10] ) ), \
+          _mm256_xor_si256( M[13], H[13] ) ), \
+       _mm256_xor_si256( M[14], H[14] ) )
+
+// Wb1 = X6 - X8 + X11 + X14 - X15
+#define Wb1 \
+   _mm256_sub_epi64( \
+       _mm256_add_epi64( \
+          _mm256_add_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 6], H[ 6] ), \
+                               _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+             _mm256_xor_si256( M[11], H[11] ) ), \
+          _mm256_xor_si256( M[14], H[14] ) ), \
+       _mm256_xor_si256( M[15], H[15] ) )
+
+// Wb2 = X0 + X7 + X9 - X12 + X15
+#define Wb2 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_add_epi64( \
+             _mm256_add_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                               _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+             _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+          _mm256_xor_si256( M[12], H[12] ) ), \
+       _mm256_xor_si256( M[15], H[15] ) )
+
+// Wb3 = X0 - X1 + X8 - X10 + X13
+#define Wb3 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_add_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                               _mm256_xor_si256( M[ 1], H[ 1] ) ), \
+             _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+          _mm256_xor_si256( M[10], H[10] ) ), \
+       _mm256_xor_si256( M[13], H[13] ) )
+
+// Wb4 = X1 + X2 + X9 - X11 - X14
+#define Wb4 \
+   _mm256_sub_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_add_epi64( \
+             _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                               _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+             _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+          _mm256_xor_si256( M[11], H[11] ) ), \
+       _mm256_xor_si256( M[14], H[14] ) )
+
+// Wb5 = X3 - X2 + X10 - X12 + X15
+#define Wb5 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_add_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
+                               _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+             _mm256_xor_si256( M[10], H[10] ) ), \
+          _mm256_xor_si256( M[12], H[12] ) ), \
+       _mm256_xor_si256( M[15], H[15] ) )
+
+// Wb6 = X4 - X0 - X3 - X11 + X13
+#define Wb6 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_sub_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 4], H[ 4] ), \
+                               _mm256_xor_si256( M[ 0], H[ 0] ) ), \
+             _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+          _mm256_xor_si256( M[11], H[11] ) ), \
+       _mm256_xor_si256( M[13], H[13] ) )
+
+// Wb7 = X1 - X4 - X5 - X12 - X14
+#define Wb7 \
+   _mm256_sub_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_sub_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                               _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+             _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+          _mm256_xor_si256( M[12], H[12] ) ), \
+       _mm256_xor_si256( M[14], H[14] ) )
+
+// Wb8 = X2 - X5 - X6 + X13 - X15
+#define Wb8 \
+   _mm256_sub_epi64( \
+       _mm256_add_epi64( \
+          _mm256_sub_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
+                               _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+          _mm256_xor_si256( M[13], H[13] ) ), \
+       _mm256_xor_si256( M[15], H[15] ) )
+
+// Wb9 = X0 - X3 + X6 - X7 + X14
+#define Wb9 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_add_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 0], H[ 0] ), \
+                               _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+          _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+       _mm256_xor_si256( M[14], H[14] ) )
+
+// Wb10 = X8 - X1 - X4 - X7 + X15
+#define Wb10 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_sub_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
+                               _mm256_xor_si256( M[ 1], H[ 1] ) ), \
+             _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+          _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+       _mm256_xor_si256( M[15], H[15] ) )
+
+// Wb11 = X8 - X0 - X2 - X5 + X9
+#define Wb11 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_sub_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 8], H[ 8] ), \
+                               _mm256_xor_si256( M[ 0], H[ 0] ) ), \
+             _mm256_xor_si256( M[ 2], H[ 2] ) ), \
+          _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+       _mm256_xor_si256( M[ 9], H[ 9] ) )
+
+// Wb12 = X1 + X3 - X6 - X9 + X10
+#define Wb12 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_sub_epi64( \
+             _mm256_add_epi64( _mm256_xor_si256( M[ 1], H[ 1] ), \
+                               _mm256_xor_si256( M[ 3], H[ 3] ) ), \
+             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+          _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+       _mm256_xor_si256( M[10], H[10] ) )
+
+// Wb13 = X2 + X4 + X7 + X10 + X11
+#define Wb13 \
+   _mm256_add_epi64( \
+       _mm256_add_epi64( \
+          _mm256_add_epi64( \
+             _mm256_add_epi64( _mm256_xor_si256( M[ 2], H[ 2] ), \
+                               _mm256_xor_si256( M[ 4], H[ 4] ) ), \
+             _mm256_xor_si256( M[ 7], H[ 7] ) ), \
+          _mm256_xor_si256( M[10], H[10] ) ), \
+       _mm256_xor_si256( M[11], H[11] ) )
+
+// Wb14 = X3 - X5 + X8 - X11 - X12
+#define Wb14 \
+   _mm256_sub_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_add_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[ 3], H[ 3] ), \
+                               _mm256_xor_si256( M[ 5], H[ 5] ) ), \
+             _mm256_xor_si256( M[ 8], H[ 8] ) ), \
+          _mm256_xor_si256( M[11], H[11] ) ), \
+       _mm256_xor_si256( M[12], H[12] ) )
+
+// Wb15 = X12 - X4 - X6 - X9 + X13
+#define Wb15 \
+   _mm256_add_epi64( \
+       _mm256_sub_epi64( \
+          _mm256_sub_epi64( \
+             _mm256_sub_epi64( _mm256_xor_si256( M[12], H[12] ), \
+                               _mm256_xor_si256( M[ 4], H[4] ) ), \
+             _mm256_xor_si256( M[ 6], H[ 6] ) ), \
+          _mm256_xor_si256( M[ 9], H[ 9] ) ), \
+       _mm256_xor_si256( M[13], H[13] ) )
+
+void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] )
+{  // One compression step on four 64-bit lanes at once: reads M and H, writes dH.
+   __m256i qt[32], xl, xh;   // qt: expanded words, xl/xh: XOR folds of qt[16..]
+
+   qt[ 0] = sb0( Wb0 ) + H[ 1];   // qt[i] = sb<i%5>( Wb<i> ) + H[(i+1)%16]
+   qt[ 1] = sb1( Wb1 ) + H[ 2];
+   qt[ 2] = sb2( Wb2 ) + H[ 3];
+   qt[ 3] = sb3( Wb3 ) + H[ 4];
+   qt[ 4] = sb4( Wb4 ) + H[ 5];
+   qt[ 5] = sb0( Wb5 ) + H[ 6];
+   qt[ 6] = sb1( Wb6 ) + H[ 7];
+   qt[ 7] = sb2( Wb7 ) + H[ 8];
+   qt[ 8] = sb3( Wb8 ) + H[ 9];
+   qt[ 9] = sb4( Wb9 ) + H[10];
+   qt[10] = sb0( Wb10) + H[11];
+   qt[11] = sb1( Wb11) + H[12];
+   qt[12] = sb2( Wb12) + H[13];
+   qt[13] = sb3( Wb13) + H[14];
+   qt[14] = sb4( Wb14) + H[15];
+   qt[15] = sb0( Wb15) + H[ 0];
+   qt[16] = expand1b( qt, M, H, 16 );   // words 16,17 use expand1b
+   qt[17] = expand1b( qt, M, H, 17 );
+   qt[18] = expand2b( qt, M, H, 18 );   // words 18..31 use expand2b
+   qt[19] = expand2b( qt, M, H, 19 );
+   qt[20] = expand2b( qt, M, H, 20 );
+   qt[21] = expand2b( qt, M, H, 21 );
+   qt[22] = expand2b( qt, M, H, 22 );
+   qt[23] = expand2b( qt, M, H, 23 );
+   qt[24] = expand2b( qt, M, H, 24 );
+   qt[25] = expand2b( qt, M, H, 25 );
+   qt[26] = expand2b( qt, M, H, 26 );
+   qt[27] = expand2b( qt, M, H, 27 );
+   qt[28] = expand2b( qt, M, H, 28 );
+   qt[29] = expand2b( qt, M, H, 29 );
+   qt[30] = expand2b( qt, M, H, 30 );
+   qt[31] = expand2b( qt, M, H, 31 );
+   xl = _mm256_xor_si256(   // xl = qt[16] ^ ... ^ qt[23]
+              _mm256_xor_si256( _mm256_xor_si256( qt[16], qt[17] ),
+                                _mm256_xor_si256( qt[18], qt[19] ) ),
+              _mm256_xor_si256( _mm256_xor_si256( qt[20], qt[21] ),
+                                _mm256_xor_si256( qt[22], qt[23] ) ) );
+   xh = _mm256_xor_si256( xl,   // xh = xl ^ qt[24] ^ ... ^ qt[31]
+             _mm256_xor_si256(
+                 _mm256_xor_si256( _mm256_xor_si256( qt[24], qt[25] ),
+                                   _mm256_xor_si256( qt[26], qt[27] ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( qt[28], qt[29] ),
+                                   _mm256_xor_si256( qt[30], qt[31] ) )));
+   dH[ 0] = _mm256_add_epi64(   // dH[0..7]: (M[i] ^ shifted xh ^ shifted qt[16+i]) + (xl ^ qt[24+i] ^ qt[i])
+                 _mm256_xor_si256( M[0],
+                      _mm256_xor_si256( _mm256_slli_epi64( xh, 5 ),
+                                        _mm256_srli_epi64( qt[16], 5 ) ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[24] ), qt[ 0] ));
+   dH[ 1] = _mm256_add_epi64(
+                 _mm256_xor_si256( M[1],
+                      _mm256_xor_si256( _mm256_srli_epi64( xh, 7 ),
+                                        _mm256_slli_epi64( qt[17], 8 ) ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[25] ), qt[ 1] ));
+   dH[ 2] = _mm256_add_epi64(
+                 _mm256_xor_si256( M[2],
+                      _mm256_xor_si256( _mm256_srli_epi64( xh, 5 ),
+                                        _mm256_slli_epi64( qt[18], 5 ) ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[26] ), qt[ 2] ));
+   dH[ 3] = _mm256_add_epi64(
+                 _mm256_xor_si256( M[3],
+                      _mm256_xor_si256( _mm256_srli_epi64( xh, 1 ),
+                                        _mm256_slli_epi64( qt[19], 5 ) ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[27] ), qt[ 3] ));
+   dH[ 4] = _mm256_add_epi64(
+                 _mm256_xor_si256( M[4],
+                      _mm256_xor_si256( _mm256_srli_epi64( xh, 3 ),
+                                        _mm256_slli_epi64( qt[20], 0 ) ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[28] ), qt[ 4] ));
+   dH[ 5] = _mm256_add_epi64(
+                 _mm256_xor_si256( M[5],
+                      _mm256_xor_si256( _mm256_slli_epi64( xh, 6 ),
+                                        _mm256_srli_epi64( qt[21], 6 ) ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[29] ), qt[ 5] ));
+   dH[ 6] = _mm256_add_epi64(
+                 _mm256_xor_si256( M[6],
+                      _mm256_xor_si256( _mm256_srli_epi64( xh, 4 ),
+                                        _mm256_slli_epi64( qt[22], 6 ) ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[30] ), qt[ 6] ));
+   dH[ 7] = _mm256_add_epi64(
+                 _mm256_xor_si256( M[7],
+                      _mm256_xor_si256( _mm256_srli_epi64( xh, 11 ),
+                                        _mm256_slli_epi64( qt[23], 2 ) ) ),
+                 _mm256_xor_si256( _mm256_xor_si256( xl, qt[31] ), qt[ 7] ));
+   dH[ 8] = _mm256_add_epi64( _mm256_add_epi64(   // dH[8..15] also fold in a rotated dH[(i+4)%8]
+                 mm256_rotl_64( dH[4], 9 ),
+                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[24] ), M[ 8] )),
+                 _mm256_xor_si256( _mm256_slli_epi64( xl, 8 ),
+                                   _mm256_xor_si256( qt[23], qt[ 8] ) ) );
+   dH[ 9] = _mm256_add_epi64( _mm256_add_epi64(
+                 mm256_rotl_64( dH[5], 10 ),
+                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[25] ), M[ 9] )),
+                 _mm256_xor_si256( _mm256_srli_epi64( xl, 6 ),
+                                   _mm256_xor_si256( qt[16], qt[ 9] ) ) );
+   dH[10] = _mm256_add_epi64( _mm256_add_epi64(
+                 mm256_rotl_64( dH[6], 11 ),
+                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[26] ), M[10] )),
+                 _mm256_xor_si256( _mm256_slli_epi64( xl, 6 ),
+                                   _mm256_xor_si256( qt[17], qt[10] ) ) );
+   dH[11] = _mm256_add_epi64( _mm256_add_epi64(
+                 mm256_rotl_64( dH[7], 12 ),
+                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[27] ), M[11] )),
+                 _mm256_xor_si256( _mm256_slli_epi64( xl, 4 ),
+                                   _mm256_xor_si256( qt[18], qt[11] ) ) );
+   dH[12] = _mm256_add_epi64( _mm256_add_epi64(
+                 mm256_rotl_64( dH[0], 13 ),
+                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[28] ), M[12] )),
+                 _mm256_xor_si256( _mm256_srli_epi64( xl, 3 ),
+                                   _mm256_xor_si256( qt[19], qt[12] ) ) );
+   dH[13] = _mm256_add_epi64( _mm256_add_epi64(
+                 mm256_rotl_64( dH[1], 14 ),
+                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[29] ), M[13] )),
+                 _mm256_xor_si256( _mm256_srli_epi64( xl, 4 ),
+                                   _mm256_xor_si256( qt[20], qt[13] ) ) );
+   dH[14] = _mm256_add_epi64( _mm256_add_epi64(
+                 mm256_rotl_64( dH[2], 15 ),
+                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[30] ), M[14] )),
+                 _mm256_xor_si256( _mm256_srli_epi64( xl, 7 ),
+                                   _mm256_xor_si256( qt[21], qt[14] ) ) );
+   dH[15] = _mm256_add_epi64( _mm256_add_epi64(
+                 mm256_rotl_64( dH[3], 16 ),
+                 _mm256_xor_si256( _mm256_xor_si256( xh, qt[31] ), M[15] )),
+                 _mm256_xor_si256( _mm256_srli_epi64( xl, 2 ),
+                                   _mm256_xor_si256( qt[22], qt[15] ) ) );
+}
+
+#endif  // 64
+
+//#define FOLDs   FOLD(sph_u32, MAKE_Qs, SPH_ROTL32, M, Qs, dH)
+
+
+/*
+static void
+compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16])
+{
+#define M(x)    sph_dec32le_aligned(data + 4 * (x))
+#define H(x)    (h[x])
+#define dH(x)   (dh[x])
+
+	FOLDs;
+
+#undef M
+#undef H
+#undef dH
+}
+
+static const sph_u32 final_s[16] = {
+	SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2),
+	SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5),
+	SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8),
+	SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab),
+	SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae),
+	SPH_C32(0xaaaaaaaf)
+};
+
+static void
+bmw32_4way_init(bmw_4way_small_context *sc, const sph_u32 *iv)
+{
+	memcpy(sc->H, iv, sizeof sc->H);
+	sc->ptr = 0;
+#if SPH_64
+	sc->bit_count = 0;
+#else
+	sc->bit_count_high = 0;
+	sc->bit_count_low = 0;
+#endif
+}
+
+static void
+bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	sph_u32 htmp[16];
+	sph_u32 *h1, *h2;
+#if !SPH_64
+	sph_u32 tmp;
+#endif
+
+#if SPH_64
+	sc->bit_count += (sph_u64)len << 3;
+#else
+	tmp = sc->bit_count_low;
+	sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3));
+	if (sc->bit_count_low < tmp)
+		sc->bit_count_high ++;
+	sc->bit_count_high += len >> 29;
+#endif
+	buf = sc->buf;
+	ptr = sc->ptr;
+	h1 = sc->H;
+	h2 = htmp;
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		ptr += clen;
+		if (ptr == sizeof sc->buf) {
+			sph_u32 *ht;
+
+			compress_small(buf, h1, h2);
+			ht = h1;
+			h1 = h2;
+			h2 = ht;
+			ptr = 0;
+		}
+	}
+	sc->ptr = ptr;
+	if (h1 != sc->H)
+		memcpy(sc->H, h1, sizeof sc->H);
+}
+
+static void
+bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w32)
+{
+	unsigned char *buf, *out;
+	size_t ptr, u, v;
+	unsigned z;
+	sph_u32 h1[16], h2[16], *h;
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	z = 0x80 >> n;
+	buf[ptr ++] = ((ub & -z) | z) & 0xFF;
+	h = sc->H;
+	if (ptr > (sizeof sc->buf) - 8) {
+		memset(buf + ptr, 0, (sizeof sc->buf) - ptr);
+		compress_small(buf, h, h1);
+		ptr = 0;
+		h = h1;
+	}
+	memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr);
+#if SPH_64
+	sph_enc64le_aligned(buf + (sizeof sc->buf) - 8,
+		SPH_T64(sc->bit_count + n));
+#else
+	sph_enc32le_aligned(buf + (sizeof sc->buf) - 8,
+		sc->bit_count_low + n);
+	sph_enc32le_aligned(buf + (sizeof sc->buf) - 4,
+		SPH_T32(sc->bit_count_high));
+#endif
+	compress_small(buf, h, h2);
+	for (u = 0; u < 16; u ++)
+		sph_enc32le_aligned(buf + 4 * u, h2[u]);
+	compress_small(buf, final_s, h1);
+	out = dst;
+	for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++)
+		sph_enc32le(out + 4 * u, h1[v]);
+}
+*/
+#if SPH_64
+
+static const __m256i final_b[16] =   // passed as H to the last compress_big() in bmw64_4way_close()
+{
+   { 0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0,   // entry i holds 0xaaaaaaaaaaaaaaa0 + i in all four lanes
+     0xaaaaaaaaaaaaaaa0, 0xaaaaaaaaaaaaaaa0 },
+   { 0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1,
+     0xaaaaaaaaaaaaaaa1, 0xaaaaaaaaaaaaaaa1 },
+   { 0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2,
+     0xaaaaaaaaaaaaaaa2, 0xaaaaaaaaaaaaaaa2 },
+   { 0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3,
+     0xaaaaaaaaaaaaaaa3, 0xaaaaaaaaaaaaaaa3 },
+   { 0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4,
+     0xaaaaaaaaaaaaaaa4, 0xaaaaaaaaaaaaaaa4 },
+   { 0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5,
+     0xaaaaaaaaaaaaaaa5, 0xaaaaaaaaaaaaaaa5 },
+   { 0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6,
+     0xaaaaaaaaaaaaaaa6, 0xaaaaaaaaaaaaaaa6 },
+   { 0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7,
+     0xaaaaaaaaaaaaaaa7, 0xaaaaaaaaaaaaaaa7 },
+   { 0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8,
+     0xaaaaaaaaaaaaaaa8, 0xaaaaaaaaaaaaaaa8 },
+   { 0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9,
+     0xaaaaaaaaaaaaaaa9, 0xaaaaaaaaaaaaaaa9 },
+   { 0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa,
+     0xaaaaaaaaaaaaaaaa, 0xaaaaaaaaaaaaaaaa },
+   { 0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab,
+     0xaaaaaaaaaaaaaaab, 0xaaaaaaaaaaaaaaab },
+   { 0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac,
+     0xaaaaaaaaaaaaaaac, 0xaaaaaaaaaaaaaaac },
+   { 0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad,
+     0xaaaaaaaaaaaaaaad, 0xaaaaaaaaaaaaaaad },
+   { 0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae,
+     0xaaaaaaaaaaaaaaae, 0xaaaaaaaaaaaaaaae },
+   { 0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf,
+     0xaaaaaaaaaaaaaaaf, 0xaaaaaaaaaaaaaaaf }
+};
+
+// Reset the counters and broadcast each of the 16 IV words to all four 64-bit lanes.
+static void bmw64_4way_init( bmw_4way_big_context *sc, const sph_u64 *iv )
+{
+   sc->bit_count = 0;
+   sc->ptr = 0;
+   for ( int w = 0; w < 16; w++ )
+      sc->H[w] = _mm256_set1_epi64x( iv[w] );
+}
+
+static void
+bmw64_4way( bmw_4way_big_context *sc, const void *data, size_t len )   // absorb len bytes per lane
+{
+   __m256i *vdata = (__m256i*)data;   // input viewed as 256-bit vectors (the cast drops const)
+   __m256i *buf;
+   __m256i htmp[16];                  // scratch chaining value, ping-ponged with sc->H
+   __m256i *h1, *h2;
+   size_t ptr;
+   const int buf_size = 128;  // bytes of one lane, compatible with len
+
+   sc->bit_count += (sph_u64)len << 3;   // running message length in bits
+   buf = sc->buf;
+   ptr = sc->ptr;
+   h1 = sc->H;
+   h2 = htmp;
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;             // bytes still free in the block buffer
+      if ( clen > len )
+         clen = len;
+      // Byte counts are turned into vector counts with >>3, so any remainder of
+      // clen modulo 8 is dropped. NOTE(review): callers presumably pass multiples of 8 - confirm.
+      memcpy_256( buf + (ptr>>3), vdata, clen >> 3 );
+      vdata = vdata + (clen>>3);
+      len -= clen;
+      ptr += clen;
+      if ( ptr == buf_size )             // buffer full: compress it
+      {
+         __m256i *ht;
+         compress_big( buf, h1, h2 );
+         ht = h1;                        // swap so h1 always points at the newest value
+         h1 = h2;
+         h2 = ht;
+         ptr = 0;
+      }
+   }
+   sc->ptr = ptr;
+   if ( h1 != sc->H )                    // newest value ended up in htmp: copy it back
+        memcpy_256( sc->H, h1, 16 );
+}
+
+static void
+bmw64_4way_close(bmw_4way_big_context *sc, unsigned ub, unsigned n,
+	void *dst, size_t out_size_w64)
+{  // Pad, append the bit length, run the two final compressions, write the digest.
+   __m256i *buf;
+   __m256i h1[16], h2[16], *h;
+   size_t ptr, u, v;
+   unsigned z;
+   const int buf_size = 128;  // bytes of one lane, compatible with len
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+   z = 0x80 >> n;
+   // Padding byte = top n bits of ub followed by a 1 bit (scalar close formula); ub was ignored before.
+   buf[ ptr>>3 ] = _mm256_set1_epi64x( ( ( ub & -z ) | z ) & 0xFF );
+   ptr += 8;
+   h = sc->H;
+   if (  ptr > (buf_size - 8) )   // no room left for the 64-bit length word
+   {
+      memset_zero_256( buf + (ptr>>3), (buf_size - ptr) >> 3 );
+      compress_big( buf, h, h1 );
+      ptr = 0;
+      h = h1;
+   }
+   memset_zero_256( buf + (ptr>>3), (buf_size - 8 - ptr) >> 3 );
+   buf[ (buf_size - 8) >> 3 ] = _mm256_set1_epi64x( sc->bit_count + n );   // length in bits, last word
+   compress_big( buf, h, h2 );
+   for ( u = 0; u < 16; u ++ )   // feed the result back in as the message block
+      buf[u] = h2[u];
+   compress_big( buf, final_b, h1 );
+   for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++)   // emit the last out_size_w64 words
+      casti_m256i(dst,u) = h1[v];
+}
+
+#endif
+
+void
+bmw256_4way_init(void *cc)   // stub: the body is commented out, so this is a no-op
+{
+//	bmw32_4way_init(cc, IV256);
+}
+
+void
+bmw256_4way(void *cc, const void *data, size_t len)   // stub: data is ignored
+{
+//	bmw32_4way(cc, data, len);
+}
+
+void
+bmw256_4way_close(void *cc, void *dst)   // stub: dst is not written
+{
+//	bmw256_4way_addbits_and_close(cc, 0, 0, dst);
+}
+
+void
+bmw256_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)   // stub: dst is not written
+{
+//	bmw32_4way_close(cc, ub, n, dst, 8);
+}
+
+#if SPH_64
+
+void
+bmw512_4way_init(void *cc)   // cc is handed to bmw64_4way_init as a bmw_4way_big_context
+{
+	bmw64_4way_init(cc, IV512);   // every lane starts from IV512
+}
+
+void
+bmw512_4way(void *cc, const void *data, size_t len)   // len counts the bytes of one lane
+{
+	bmw64_4way(cc, data, len);
+}
+
+void
+bmw512_4way_close(void *cc, void *dst)   // finalize with no extra bits
+{
+	bmw512_4way_addbits_and_close(cc, 0, 0, dst);
+}
+
+void
+bmw512_4way_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	bmw64_4way_close(cc, ub, n, dst, 8);   // 8 output words: the last 8 of the 16 state vectors go to dst
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -0,0 +1,154 @@
+/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * BMW interface. BMW (aka "Blue Midnight Wish") is a family of
+ * functions which differ by their output size; this implementation
+ * defines BMW for output sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_bmw.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef BMW_HASH_H__
+#define BMW_HASH_H__
+
+#include <stddef.h>
+#ifdef __AVX2__
+
+#ifdef __cplusplus
+extern "C"{   /* opened inside the __AVX2__ guard so it pairs with the closing brace below */
+#endif
+
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+/**
+ * Output size (in bits) for BMW-224.
+ */
+#define SPH_SIZE_bmw224   224
+
+/**
+ * Output size (in bits) for BMW-256.
+ */
+#define SPH_SIZE_bmw256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BMW-384.
+ */
+#define SPH_SIZE_bmw384   384
+
+/**
+ * Output size (in bits) for BMW-512.
+ */
+#define SPH_SIZE_bmw512   512
+
+#endif
+
+/**
+ * This structure is a context for BMW-224 and BMW-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BMW computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BMW
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;
+	sph_u32 H[16];   /* NOTE(review): scalar sph_u32 words, unlike the __m256i fields of the big context */
+#if SPH_64
+	sph_u64 bit_count;
+#else
+	sph_u32 bit_count_high, bit_count_low;   /* bit count kept as two 32-bit halves */
+#endif
+#endif
+} bmw_4way_small_context;
+
+typedef bmw_4way_small_context bmw256_4way_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BMW-384 and BMW-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BMW computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BMW
+ * computation can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+   __m256i buf[16];   /* pending block: 16 vectors, each holding one 64-bit word for each of 4 lanes */
+   __m256i H[16];     /* chaining value in the same lane layout */
+
+//	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;          /* bytes of one lane currently held in buf */
+//	sph_u64 H[16];
+	sph_u64 bit_count;   /* bits absorbed so far, counted for one lane */
+#endif
+} bmw_4way_big_context;
+
+typedef bmw_4way_big_context bmw512_4way_context;
+
+#endif
+
+void bmw256_4way_init(void *cc);
+
+void bmw256_4way(void *cc, const void *data, size_t len);
+
+void bmw256_4way_close(void *cc, void *dst);
+
+void bmw256_4way_addbits_and_close(   /* defined in bmw-hash-4way.c */
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#if SPH_64
+
+void bmw512_4way_init(void *cc);
+
+void bmw512_4way(void *cc, const void *data, size_t len);
+
+void bmw512_4way_close(void *cc, void *dst);
+
+void bmw512_4way_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/bmw/bmw256.c
+++ b/algo/bmw/bmw256.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -2,7 +2,6 @@
 // Distributed under the MIT/X11 software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.

-#include "miner.h"
 #include "algo-gate-api.h"

 #if defined(__arm__) || defined(_MSC_VER)
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -109,43 +109,43 @@ static __thread cryptonight_ctx ctx;
 void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 {
 #ifndef NO_AES_NI
-    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+
    uint8_t ExpandedKey[256] __attribute__((aligned(64)));
+    __m128i *longoutput, *expkey, *xmminput;
    size_t i, j;
    
-    memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
+    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+    memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
+    ExpandAESKey256( ExpandedKey );
+    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    
-    __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *)ctx.long_state;
-    expkey     = (__m128i *)ExpandedKey;
-    xmminput   = (__m128i *)ctx.text;
+    longoutput = (__m128i*)ctx.long_state;
+    xmminput   = (__m128i*)ctx.text;
+    expkey     = (__m128i*)ExpandedKey;
    
-    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
-    //    aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);
-    
-    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
+    // prefetch expkey, xmminput and enough longoutput for 4 iterations
    _mm_prefetch( xmminput,     _MM_HINT_T0 );
    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
-    for ( i = 0; i < 64; i += 16 )
-    {
-       _mm_prefetch( longoutput + i,      _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i +  4, _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i +  8, _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
-    }
    _mm_prefetch( expkey,     _MM_HINT_T0 );
    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
    _mm_prefetch( expkey + 8, _MM_HINT_T0 );
-
-    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    for ( i = 0; i < 64; i += 16 )
    {
-        // prefetch 4 loops ahead,
+        __builtin_prefetch( longoutput + i,      1, 0 );
+        __builtin_prefetch( longoutput + i +  4, 1, 0 );
+        __builtin_prefetch( longoutput + i +  8, 1, 0 );
+        __builtin_prefetch( longoutput + i + 12, 1, 0 );
+    }
+
+    // n-4 iterations
+    for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
+                         i += INIT_SIZE_M128I )
+    {
+        // prefetch 4 iterations ahead.
        __builtin_prefetch( longoutput + i + 64, 1, 0 );
        __builtin_prefetch( longoutput + i + 68, 1, 0 );

-	for (j = 0; j < 10; j++ )
+	for ( j = 0; j < 10; j++ )
 	{
 		xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
 		xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
@@ -165,84 +165,99 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
 	_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
    }
+    // last 4 iterations
+    for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    {
+        for ( j = 0; j < 10; j++ )
+        {
+                xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
+                xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
+                xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
+                xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
+                xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
+                xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
+                xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
+                xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
+        }
+        _mm_store_si128( &( longoutput[i  ] ), xmminput[0] );
+        _mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
+        _mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
+        _mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
+        _mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
+        _mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
+        _mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
+        _mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
+    }

-//     cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
-//                                          casti_m128i( ctx.state.k, 2 ) );
-//     cast_m128i( ctx.b ) = _mm_xor_si128( casti_m128i( ctx.state.k, 1 ),
-//                                          casti_m128i( ctx.state.k, 3 ) );
+    ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
+    ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
+    ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
+    ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];

-     ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
-     ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
-     ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
-     ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
-
-//    for (i = 0; i < 2; i++) 
-//    {
-//     ctx.a[i] = ((uint64_t *)ctx.state.k)[i] ^  ((uint64_t *)ctx.state.k)[i+4];
-//     ctx.b[i] = ((uint64_t *)ctx.state.k)[i+2] ^ ((uint64_t *)ctx.state.k)[i+6];
-//    }
-
-    __m128i b_x = _mm_load_si128((__m128i *)ctx.b);
-    uint64_t a[2] __attribute((aligned(16))), b[2] __attribute((aligned(16)));
+    uint64_t a[2] __attribute((aligned(16))),
+             b[2] __attribute((aligned(16))),
+             c[2] __attribute((aligned(16)));
    a[0] = ctx.a[0];
    a[1] = ctx.a[1];
-	
-    for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
+    __m128i b_x = _mm_load_si128( (__m128i*)ctx.b );
+    __m128i a_x = _mm_load_si128( (__m128i*)a );
+    __m128i* lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
+    __m128i c_x = _mm_load_si128( lsa );
+    uint64_t *nextblock;
+    uint64_t hi, lo;
+
+    // n-1 iterations
+    for( i = 0; __builtin_expect( i < 0x7ffff, 1 ); i++ )
    {	  
-        uint64_t c[2];
-        __builtin_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], 0, 1 );
-
-	__m128i c_x = _mm_load_si128( 
-                              (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
-	__m128i a_x = _mm_load_si128((__m128i *)a);
-	c_x = _mm_aesenc_si128(c_x, a_x);
-	_mm_store_si128((__m128i *)c, c_x);
-	
-	b_x = _mm_xor_si128(b_x, c_x);
-	_mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
-
-	uint64_t *nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
-//	uint64_t b[2];
+	c_x = _mm_aesenc_si128( c_x, a_x );
+	_mm_store_si128( (__m128i*)c, c_x );
+        b_x = _mm_xor_si128( b_x, c_x );
+        nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
+	_mm_store_si128( lsa, b_x );
 	b[0] = nextblock[0];
 	b[1] = nextblock[1];

-	{
-	  uint64_t hi, lo;
-	 // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
+        // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
+	__asm__( "mulq %3\n\t"
+	         : "=d" ( hi ),
+	           "=a" ( lo )
+	         : "%a" ( c[0] ),
+	           "rm" ( b[0] )
+		 : "cc" );

-	  __asm__("mulq %3\n\t"
-		  : "=d" (hi),
-		"=a" (lo)
-		  : "%a" (c[0]),
-		"rm" (b[0])
-		  : "cc" );
-	  
-	  a[0] += hi;
-	  a[1] += lo;
-	}
-	uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
-//        __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0];
-
-//        *dst = cast_m128i( a ); 
-	dst[0] = a[0];
-	dst[1] = a[1];
-
-//        cast_m128i( a ) = _mm_xor_si128( cast_m128i( a ), cast_m128i( b ) );
-	a[0] ^= b[0];
-	a[1] ^= b[1];
-	b_x = c_x;
-	__builtin_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], 0, 3 );
+        b_x = c_x;
+        nextblock[0] = a[0] + hi;
+        nextblock[1] = a[1] + lo;
+        a[0] = b[0] ^ nextblock[0];
+        a[1] = b[1] ^ nextblock[1];
+        lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
+        a_x = _mm_load_si128( (__m128i*)a );
+        c_x = _mm_load_si128( lsa );
    }
+    // abbreviated nth iteration
+    c_x = _mm_aesenc_si128( c_x, a_x );
+    _mm_store_si128( (__m128i*)c, c_x );
+    b_x = _mm_xor_si128( b_x, c_x );
+    nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
+    _mm_store_si128( lsa, b_x );
+    b[0] = nextblock[0];
+    b[1] = nextblock[1];
+
+    __asm__( "mulq %3\n\t"
+             : "=d" ( hi ),
+               "=a" ( lo )
+             : "%a" ( c[0] ),
+               "rm" ( b[0] )
+             : "cc" );
+
+    nextblock[0] = a[0] + hi;
+    nextblock[1] = a[1] + lo;

-    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
    ExpandAESKey256( ExpandedKey );
-    
-    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
-    //    aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
+    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    
    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
-
    _mm_prefetch( xmminput,     _MM_HINT_T0 );
    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
    for ( i = 0; i < 64; i += 16 )
@@ -256,9 +271,11 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
    _mm_prefetch( expkey + 8, _MM_HINT_T0 );

-    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    // n-4 iterations
+    for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
+                         i += INIT_SIZE_M128I )
    {
-        // stay 4 loops ahead,
+        // stay 4 iterations ahead.
        _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
        _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );

@@ -283,10 +300,34 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	    xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
        }
    }
-        
+    // last 4 iterations 
+    for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    {
+        xmminput[0] = _mm_xor_si128( longoutput[i  ], xmminput[0] );
+        xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
+        xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
+        xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
+        xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
+        xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
+        xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
+        xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
+
+        for( j = 0; j < 10; j++ )
+        {
+            xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
+            xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
+            xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
+            xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
+            xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
+            xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
+            xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
+            xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
+        }
+    }
+
    memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
    keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
-
    extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
+
 #endif
 }
--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -5,7 +5,6 @@
 // Modified for CPUminer by Lucas Jones

 #include "cpuminer-config.h"
-//#include "miner.h"
 #include "algo-gate-api.h"

 #ifndef NO_AES_NI
--- a/algo/echo/aes_ni/vperm.h
+++ b/algo/echo/aes_ni/vperm.h
@@ -53,11 +53,12 @@ extern const unsigned int _k_aesmix4[];
 	x  = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
 	x  = _mm_xor_si128(x, t1)

+#if 0
 // compiled erroneously with 32-bit msc compiler
-	//t2 = _mm_shuffle_epi8(table[0], x);\
-	//x  = _mm_shuffle_epi8(table[1], t1);\
-	//x  = _mm_xor_si128(x, t2)
-
+	t2 = _mm_shuffle_epi8(table[0], x);\
+	x  = _mm_shuffle_epi8(table[1], t1);\
+	x  = _mm_xor_si128(x, t2)
+#endif

 // input: x
 // output: t2, t3
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -21,7 +21,7 @@

 #include "brg_endian.h"
 #define NEED_UINT_64T
-#include "brg_types.h"
+#include "algo/sha/brg_types.h"

 /* some sizes (number of bytes) */
 #define ROWS (8)
--- a/algo/groestl/aes_ni/hash-groestl256.h
+++ b/algo/groestl/aes_ni/hash-groestl256.h
@@ -35,7 +35,7 @@ typedef crypto_uint64 u64;

 #include "brg_endian.h"
 #define NEED_UINT_64T
-#include "brg_types.h"
+#include "algo/sha/brg_types.h"

 #ifdef IACA_TRACE
  #include IACA_MARKS
--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
@@ -99,22 +98,21 @@ void groestl_set_target( struct work* work, double job_diff )
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }

-bool register_groestl_algo( algo_gate_t* gate )
+bool register_dmd_gr_algo( algo_gate_t* gate )
 {
    init_groestl_ctx();
    gate->optimizations   = SSE2_OPT | AES_OPT;
    gate->scanhash        = (void*)&scanhash_groestl;
    gate->hash            = (void*)&groestlhash;
    gate->set_target      = (void*)&groestl_set_target;
-    gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
    gate->get_max64       = (void*)&get_max64_0x3ffff;
    return true;
 };

-bool register_dmd_gr_algo( algo_gate_t* gate )
+bool register_groestl_algo( algo_gate_t* gate )
 {
-    register_groestl_algo( gate );
-    gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
+    register_dmd_gr_algo( gate );
+    gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
    return true;
 };

--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
@@ -12,11 +11,8 @@
  #include "aes_ni/hash-groestl.h"
 #endif

-#if defined __SHA__
-  #include <openssl/sha.h>
-#else
-  #include "algo/sha/sph_sha2.h"
-#endif
+#include <openssl/sha.h>
+#include "algo/sha/sph_sha2.h"

 typedef struct {
 #ifdef NO_AES_NI
@@ -24,7 +20,7 @@ typedef struct {
 #else
    hashState_groestl       groestl;
 #endif
-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_CTX         sha;
 #else
   sph_sha256_context sha;
@@ -40,7 +36,7 @@ void init_myrgr_ctx()
 #else
     init_groestl (&myrgr_ctx.groestl, 64 );
 #endif
-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_Init( &myrgr_ctx.sha );
 #else
   sph_sha256_init( &myrgr_ctx.sha );
@@ -61,7 +57,7 @@ void myriadhash( void *output, const void *input )
                               (const char*)input, 640 );
 #endif

-#if defined __SHA__
+#ifndef USE_SPH_SHA
     SHA256_Update( &ctx.sha, hash, 64 );
     SHA256_Final( (unsigned char*) hash, &ctx.sha );
 #else
@@ -108,7 +104,7 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,

 bool register_myriad_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | SHA_OPT;
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
    init_myrgr_ctx();
    gate->scanhash = (void*)&scanhash_myriad;
    gate->hash     = (void*)&myriadhash;
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
--- a/algo/heavy/heavy.c
+++ b/algo/heavy/heavy.c
@@ -2,7 +2,6 @@
 #include <openssl/sha.h>
 #include <stdint.h>

-#include "miner.h"
 #include "algo-gate-api.h"
 #include "sph_hefty1.h"
 #include "algo/keccak/sph_keccak.h"
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -1,10 +1,7 @@
 #include <memory.h>
 #include <stdlib.h>

-#include "miner.h"
-//#include "algo-gate-api.h"
 #include "hodl-gate.h"
-//#include "hodl.h"
 #include "hodl-wolf.h"

 #define HODL_NSTARTLOC_INDEX 20
@@ -97,13 +94,7 @@ bool hodl_do_this_thread( int thr_id )
 int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done )
 {
-#ifdef NO_AES_NI
-  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
-  return false;
-//  GetPsuedoRandomData( hodl_scratchbuf, work->data, thr_id );
-//  pthread_barrier_wait( &hodl_barrier );
-//  return scanhash_hodl( thr_id, work, max_nonce, hashes_done );
-#else
+#ifndef NO_AES_NI
  GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
  pthread_barrier_wait( &hodl_barrier );
  return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
@@ -112,6 +103,10 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,

 bool register_hodl_algo( algo_gate_t* gate )
 {
+#ifdef NO_AES_NI
+  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
+  return false;
+#endif
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
  gate->optimizations         = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
--- a/algo/hodl/sha512_avx.c
+++ b/algo/hodl/sha512_avx.c
@@ -4,6 +4,11 @@
 //Dependencies
 #include <string.h>
 #include <stdlib.h>
+
+#ifdef __FreeBSD__
+#include <sys/endian.h>
+#endif 
+
 #include "tmmintrin.h"
 #include "smmintrin.h"

--- a/algo/hodl/sha512_avx2.c
+++ b/algo/hodl/sha512_avx2.c
@@ -3,6 +3,11 @@
 //Dependencies
 #include <string.h>
 #include <stdlib.h>
+
+#ifdef __FreeBSD__
+#include <sys/endian.h>
+#endif 
+
 #include "tmmintrin.h"
 #include "smmintrin.h"
 #include "immintrin.h"
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -0,0 +1,609 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef __AVX2__
+
+#include <stddef.h>
+#include <string.h>
+
+#include "jh-hash-4way.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH   1
+#endif
+
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64   1
+#endif
+
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness; we use some
+ * byte-swapping macros for that.
+ */
+
+#if SPH_LITTLE_ENDIAN
+
+#if SPH_64
+#define C64e(x)     ((SPH_C64(x) >> 56) \
+                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
+                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
+                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
+                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
+                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
+                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
+                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
+#define dec64e_aligned   sph_dec64le_aligned
+#define enc64e           sph_enc64le
+#endif
+
+#else
+
+#if SPH_64
+#define C64e(x)     SPH_C64(x)
+#define dec64e_aligned   sph_dec64be_aligned
+#define enc64e           sph_enc64be
+#endif
+
+#endif
+
+/*
+ * Sb: bitsliced nonlinear step applied in place to x0..x3, with the 64-bit
+ * round constant c copied into all four lanes of a vector.
+ * The macro uses a variable named `tmp` from the enclosing scope as
+ * scratch (DECL_STATE declares it). The argument c is expanded four
+ * times, so it must be free of side effects.
+ */
+#define Sb(x0, x1, x2, x3, c) \
+do { \
+   __m256i cc = _mm256_set_epi64x( c, c, c, c ); \
+    x3 = mm256_not( x3 ); \
+    x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
+    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
+    x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
+    x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
+    x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
+    x2 = _mm256_xor_si256( x2, tmp ); \
+} while (0)
+
+#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
+do { \
+    x4 = _mm256_xor_si256( x4, x1 ); \
+    x5 = _mm256_xor_si256( x5, x2 ); \
+    x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
+    x7 = _mm256_xor_si256( x7, x0 ); \
+    x0 = _mm256_xor_si256( x0, x5 ); \
+    x1 = _mm256_xor_si256( x1, x6 ); \
+    x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
+    x3 = _mm256_xor_si256( x3, x4 ); \
+} while (0)
+
+#if SPH_JH_64
+
+static const sph_u64 C[] = {
+	C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
+	C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
+	C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a),
+	C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231),
+	C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410),
+	C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc),
+	C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0),
+	C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3),
+	C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce),
+	C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23),
+	C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8),
+	C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197),
+	C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95),
+	C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214),
+	C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80),
+	C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4),
+	C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989),
+	C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36),
+	C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7),
+	C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f),
+	C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727),
+	C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b),
+	C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e),
+	C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062),
+	C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984),
+	C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5),
+	C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2),
+	C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f),
+	C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465),
+	C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a),
+	C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1),
+	C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf),
+	C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48),
+	C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0),
+	C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134),
+	C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a),
+	C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff),
+	C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6),
+	C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae),
+	C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567),
+	C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a),
+	C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518),
+	C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446),
+	C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e),
+	C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee),
+	C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001),
+	C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779),
+	C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83),
+	C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a),
+	C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef),
+	C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d),
+	C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65),
+	C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a),
+	C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c),
+	C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d),
+	C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71),
+	C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc),
+	C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0),
+	C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c),
+	C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f),
+	C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751),
+	C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad),
+	C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56),
+	C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6),
+	C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a),
+	C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163),
+	C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826),
+	C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f),
+	C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30),
+	C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a),
+	C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3),
+	C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505),
+	C64e(0xb17681d913326cce), C64e(0x3c175284f805a262),
+	C64e(0xf42bcbb378471547), C64e(0xff46548223936a48),
+	C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e),
+	C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e),
+	C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd),
+	C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7),
+	C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be),
+	C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de),
+	C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9),
+	C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a),
+	C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
+	C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
+};
+
+#define Ceven_hi(r)   (C[((r) << 2) + 0])
+#define Ceven_lo(r)   (C[((r) << 2) + 1])
+#define Codd_hi(r)    (C[((r) << 2) + 2])
+#define Codd_lo(r)    (C[((r) << 2) + 3])
+
+#define S(x0, x1, x2, x3, cb, r)   do { \
+		Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
+		Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
+	} while (0)
+
+#define L(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
+			x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
+		Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
+			x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
+	} while (0)
+
+
+/*
+ * For each 64-bit lane v of x##h and x##l, compute
+ *     ((v >> n) & c) | ((v & c) << n)
+ * With the W0..W5 masks this exchanges neighbouring n-bit groups.
+ * Both halves use the logical-shift intrinsic; the low half previously
+ * relied on the compiler's vector `>>` operator, which only gave the same
+ * result because the mask clears the shifted-in top bits.
+ */
+#define Wz(x, c, n) \
+do { \
+   __m256i t = _mm256_slli_epi64( _mm256_and_si256(x ## h, (c)), (n) ); \
+   x ## h = _mm256_or_si256( _mm256_and_si256( \
+                                _mm256_srli_epi64(x ## h, (n)), (c)), t ); \
+   t = _mm256_slli_epi64( _mm256_and_si256(x ## l, (c)), (n) ); \
+   x ## l = _mm256_or_si256( _mm256_and_si256( \
+                                _mm256_srli_epi64(x ## l, (n)), (c)), t ); \
+} while (0)
+
+
+/*
+#define Wz(x, c, n)   do { \
+		sph_u64 t = (x ## h & (c)) << (n); \
+		x ## h = ((x ## h >> (n)) & (c)) | t; \
+		t = (x ## l & (c)) << (n); \
+		x ## l = ((x ## l >> (n)) & (c)) | t; \
+	} while (0)
+*/
+
+#define W0(x)   Wz(x, _mm256_set_epi64x( 0x5555555555555555, \
+       0x5555555555555555, 0x5555555555555555, 0x5555555555555555 ), 1 )
+#define W1(x)   Wz(x, _mm256_set_epi64x( 0x3333333333333333, \
+       0x3333333333333333, 0x3333333333333333, 0x3333333333333333 ), 2 )
+#define W2(x)   Wz(x, _mm256_set_epi64x( 0x0F0F0F0F0F0F0F0F, \
+       0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F ), 4 )
+#define W3(x)   Wz(x, _mm256_set_epi64x( 0x00FF00FF00FF00FF, \
+       0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF ), 8 ) 
+#define W4(x)   Wz(x, _mm256_set_epi64x( 0x0000FFFF0000FFFF, \
+       0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF, 0x0000FFFF0000FFFF ), 16 )
+#define W5(x)   Wz(x, _mm256_set_epi64x( 0x00000000FFFFFFFF, \
+       0x00000000FFFFFFFF, 0x00000000FFFFFFFF, 0x00000000FFFFFFFF ), 32 )
+#define W6(x) \
+do { \
+   __m256i t = x ## h; \
+   x ## h = x ## l; \
+   x ## l = t; \
+} while (0)
+
+/*
+#define W0(x)   Wz(x, SPH_C64(0x5555555555555555),  1)
+#define W1(x)   Wz(x, SPH_C64(0x3333333333333333),  2)
+#define W2(x)   Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F),  4)
+#define W3(x)   Wz(x, SPH_C64(0x00FF00FF00FF00FF),  8)
+#define W4(x)   Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16)
+#define W5(x)   Wz(x, SPH_C64(0x00000000FFFFFFFF), 32)
+#define W6(x)   do { \
+		sph_u64 t = x ## h; \
+		x ## h = x ## l; \
+		x ## l = t; \
+	} while (0)
+*/
+
+#define DECL_STATE \
+	__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
+	__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
+	__m256i tmp;
+
+#define READ_STATE(state)   do { \
+		h0h = (state)->H[ 0]; \
+		h0l = (state)->H[ 1]; \
+		h1h = (state)->H[ 2]; \
+		h1l = (state)->H[ 3]; \
+		h2h = (state)->H[ 4]; \
+		h2l = (state)->H[ 5]; \
+		h3h = (state)->H[ 6]; \
+		h3l = (state)->H[ 7]; \
+		h4h = (state)->H[ 8]; \
+		h4l = (state)->H[ 9]; \
+		h5h = (state)->H[10]; \
+		h5l = (state)->H[11]; \
+		h6h = (state)->H[12]; \
+		h6l = (state)->H[13]; \
+		h7h = (state)->H[14]; \
+		h7l = (state)->H[15]; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { \
+		(state)->H[ 0] = h0h; \
+		(state)->H[ 1] = h0l; \
+		(state)->H[ 2] = h1h; \
+		(state)->H[ 3] = h1l; \
+		(state)->H[ 4] = h2h; \
+		(state)->H[ 5] = h2l; \
+		(state)->H[ 6] = h3h; \
+		(state)->H[ 7] = h3l; \
+		(state)->H[ 8] = h4h; \
+		(state)->H[ 9] = h4l; \
+		(state)->H[10] = h5h; \
+		(state)->H[11] = h5l; \
+		(state)->H[12] = h6h; \
+		(state)->H[13] = h6l; \
+		(state)->H[14] = h7h; \
+		(state)->H[15] = h7l; \
+	} while (0)
+
+#define INPUT_BUF1 \
+	__m256i m0h = buf[0]; \
+	__m256i m0l = buf[1]; \
+	__m256i m1h = buf[2]; \
+	__m256i m1l = buf[3]; \
+	__m256i m2h = buf[4]; \
+	__m256i m2l = buf[5]; \
+	__m256i m3h = buf[6]; \
+	__m256i m3l = buf[7]; \
+        h0h = _mm256_xor_si256( h0h, m0h ); \
+        h0l = _mm256_xor_si256( h0l, m0l ); \
+        h1h = _mm256_xor_si256( h1h, m1h ); \
+        h1l = _mm256_xor_si256( h1l, m1l ); \
+        h2h = _mm256_xor_si256( h2h, m2h ); \
+        h2l = _mm256_xor_si256( h2l, m2l ); \
+        h3h = _mm256_xor_si256( h3h, m3h ); \
+        h3l = _mm256_xor_si256( h3l, m3l ); \
+
+#define INPUT_BUF2 \
+   h4h = _mm256_xor_si256( h4h, m0h ); \
+   h4l = _mm256_xor_si256( h4l, m0l ); \
+   h5h = _mm256_xor_si256( h5h, m1h ); \
+   h5l = _mm256_xor_si256( h5l, m1l ); \
+   h6h = _mm256_xor_si256( h6h, m2h ); \
+   h6l = _mm256_xor_si256( h6l, m2l ); \
+   h7h = _mm256_xor_si256( h7h, m3h ); \
+   h7l = _mm256_xor_si256( h7l, m3l ); \
+
+static const sph_u64 IV256[] = {
+	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
+	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
+	C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477),
+	C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8),
+	C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262),
+	C64e(0x277695f776248f94), C64e(0x87d5b6574780296c),
+	C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f),
+	C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769)
+};
+
+
+static const sph_u64 IV512[] = {
+	C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543),
+	C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361),
+	C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80),
+	C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7),
+	C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a),
+	C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199),
+	C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
+	C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
+};
+
+#else
+
+
+#endif
+
+#define SL(ro)   SLu(r + ro, ro)
+
+#define SLu(r, ro)   do { \
+		S(h0, h2, h4, h6, Ceven_, r); \
+		S(h1, h3, h5, h7, Codd_, r); \
+		L(h0, h2, h4, h6, h1, h3, h5, h7); \
+		W ## ro(h1); \
+		W ## ro(h3); \
+		W ## ro(h5); \
+		W ## ro(h7); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_JH
+
+#if SPH_JH_64
+
+/*
+ * The "small footprint" 64-bit version just uses a partially unrolled
+ * loop.
+ */
+
+#define E8   do { \
+		unsigned r; \
+		for (r = 0; r < 42; r += 7) { \
+			SL(0); \
+			SL(1); \
+			SL(2); \
+			SL(3); \
+			SL(4); \
+			SL(5); \
+			SL(6); \
+		} \
+	} while (0)
+
+#else
+
+
+#endif
+
+#else
+
+#if SPH_JH_64
+
+/*
+ * On a "true 64-bit" architecture, we can unroll at will.
+ */
+
+#define E8   do { \
+		SLu( 0, 0); \
+		SLu( 1, 1); \
+		SLu( 2, 2); \
+		SLu( 3, 3); \
+		SLu( 4, 4); \
+		SLu( 5, 5); \
+		SLu( 6, 6); \
+		SLu( 7, 0); \
+		SLu( 8, 1); \
+		SLu( 9, 2); \
+		SLu(10, 3); \
+		SLu(11, 4); \
+		SLu(12, 5); \
+		SLu(13, 6); \
+		SLu(14, 0); \
+		SLu(15, 1); \
+		SLu(16, 2); \
+		SLu(17, 3); \
+		SLu(18, 4); \
+		SLu(19, 5); \
+		SLu(20, 6); \
+		SLu(21, 0); \
+		SLu(22, 1); \
+		SLu(23, 2); \
+		SLu(24, 3); \
+		SLu(25, 4); \
+		SLu(26, 5); \
+		SLu(27, 6); \
+		SLu(28, 0); \
+		SLu(29, 1); \
+		SLu(30, 2); \
+		SLu(31, 3); \
+		SLu(32, 4); \
+		SLu(33, 5); \
+		SLu(34, 6); \
+		SLu(35, 0); \
+		SLu(36, 1); \
+		SLu(37, 2); \
+		SLu(38, 3); \
+		SLu(39, 4); \
+		SLu(40, 5); \
+		SLu(41, 6); \
+	} while (0)
+
+#else
+
+
+#endif
+
+#endif
+
+/*
+ * Reset a 4-way JH context: the buffer position and block counter are
+ * cleared and each of the 16 chaining words is loaded with the matching
+ * 64-bit word of iv, replicated across all four lanes.
+ */
+static void
+jh_4way_init( jh_4way_context *sc, const void *iv )
+{
+    const uint64_t *iv_words = (const uint64_t*)iv;
+    int w = 0;
+
+    sc->block_count = 0;
+    sc->ptr = 0;
+    while ( w < 16 )
+    {
+        sc->H[w] = _mm256_set1_epi64x( iv_words[w] );
+        w++;
+    }
+}
+
+/*
+ * Absorb len bytes (counted per lane) of 4-way interleaved input.
+ * Data is staged in sc->buf; each time a full 64-byte block per lane has
+ * been buffered it is mixed into the chaining state and block_count is
+ * incremented. Input that does not complete a block is only buffered.
+ *
+ * Byte offsets and lengths are shifted right by 3 to obtain __m256i
+ * indices/counts, so len and sc->ptr are presumably expected to be
+ * multiples of 8 -- TODO confirm against callers. memcpy_256 is assumed
+ * to take its count in __m256i elements.
+ */
+static void
+jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
+{
+    __m256i *buf;
+    __m256i *vdata = (__m256i*)data;
+   const int buf_size = 64;   // block size in bytes per lane (8 __m256i)
+   size_t ptr;
+   DECL_STATE
+
+   buf = sc->buf;
+   ptr = sc->ptr;
+
+   // Not enough data to complete a block: just buffer it.
+   if ( len < (buf_size - ptr) )
+   {
+       memcpy_256( buf + (ptr>>3), vdata, len>>3 );
+       ptr += len;
+       sc->ptr = ptr;
+       return;
+   }
+
+   READ_STATE(sc);
+   while ( len > 0 )
+   {
+       size_t clen;
+       clen = buf_size - ptr;
+       if ( clen > len )
+          clen = len;
+
+       memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
+       ptr += clen;
+       vdata += (clen>>3);
+       len -= clen;
+       if ( ptr == buf_size )
+       {
+          // INPUT_BUF1 xors the block into h0..h3 and declares m0h..m3l;
+          // INPUT_BUF2 reuses those locals to xor the same block into
+          // h4..h7 after E8, so the three must stay in this order and scope.
+          INPUT_BUF1;
+          E8;
+          INPUT_BUF2;
+          sc->block_count ++;
+          ptr = 0;
+       }
+   }
+   WRITE_STATE(sc);
+   sc->ptr = ptr;
+}
+
+/*
+ * Finalize a 4-way JH computation. The same padding is built for all four
+ * lanes: a word holding 0x80, zero fill, then the 128-bit message bit
+ * length encoded with sph_enc64be (high word first). The padding is fed
+ * through jh_4way_core and chaining words H[8..15] (eight vectors) are
+ * copied to dst.
+ *
+ * NOTE(review): ub, n, out_size_w32 and iv are never read in this
+ * function, so the 256-bit wrapper stores eight vectors as well -- confirm
+ * that callers of jh256_4way_close provide room for that and know where
+ * the digest sits in the output.
+ * NOTE(review): numz>>3 truncates, so the padding arithmetic presumably
+ * relies on sc->ptr being a multiple of 8.
+ */
+static void
+jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
+               size_t out_size_w32, const void *iv )
+{
+   __m256i buf[16*4];
+   __m256i *dst256 = (__m256i*)dst;
+   size_t numz, u;
+   sph_u64 l0, l1, l0e, l1e;
+
+   buf[0] = _mm256_set_epi64x( 0x80, 0x80, 0x80, 0x80 );
+
+   // numz is the number of padding bytes per lane that precede the
+   // 16-byte length field (it includes the leading 0x80 word).
+   if ( sc->ptr == 0 )
+       numz = 48;
+   else
+       numz = 112 - sc->ptr;
+
+   memset_zero_256( buf+1, (numz>>3) - 1 );   
+
+   // Message length in bits: 512 per completed block plus 8 per buffered byte.
+   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
+   l1 = SPH_T64(sc->block_count >> 55);
+   sph_enc64be( &l0e, l0 );
+   sph_enc64be( &l1e, l1 );
+   *(buf + (numz>>3)    ) = _mm256_set_epi64x( l1e, l1e, l1e, l1e );
+   *(buf + (numz>>3) + 1) = _mm256_set_epi64x( l0e, l0e, l0e, l0e ); 
+
+   jh_4way_core( sc, buf, numz + 16 );
+
+   // The result is the second half of the chaining state.
+   for ( u=0; u < 8; u++ )
+       buf[u] = sc->H[u+8];
+
+    memcpy_256( dst256, buf, 8 );
+}
+
+/* Initialize the 4-way JH context cc with the IV256 table. */
+void
+jh256_4way_init(void *cc)
+{
+	jh_4way_init(cc, IV256);
+}
+
+/*
+ * Feed len bytes per lane of 4-way interleaved data into the context cc.
+ * Thin wrapper around jh_4way_core.
+ */
+void
+jh256_4way(void *cc, const void *data, size_t len)
+{
+	jh_4way_core(cc, data, len);
+}
+
+/*
+ * Finalize the computation in cc and store the result at dst.
+ * NOTE(review): jh_4way_close does not read its output-size argument and
+ * always writes eight __m256i vectors to dst -- verify callers expect that.
+ */
+void
+jh256_4way_close(void *cc, void *dst)
+{
+	jh_4way_close(cc, 0, 0, dst, 8, IV256);
+}
+
+/* Initialize the 4-way JH context cc with the IV512 table. */
+void
+jh512_4way_init(void *cc)
+{
+	jh_4way_init(cc, IV512);
+}
+
+/*
+ * Feed len bytes per lane of 4-way interleaved data into the context cc.
+ * Thin wrapper around jh_4way_core.
+ */
+void
+jh512_4way(void *cc, const void *data, size_t len)
+{
+	jh_4way_core(cc, data, len);
+}
+
+/*
+ * Finalize the computation in cc and store eight __m256i vectors
+ * (chaining words H[8..15]) at dst.
+ */
+void
+jh512_4way_close(void *cc, void *dst)
+{
+	jh_4way_close(cc, 0, 0, dst, 16, IV512);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -0,0 +1,100 @@
+/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * 4-way (AVX2) JH interface. JH is a family of functions which differ by
+ * their output size; this header declares the 4-way variants for output
+ * sizes 256 and 512 bits only.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_jh.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef JH_HASH_4WAY_H__
+#define JH_HASH_4WAY_H__
+
+#ifdef __AVX2__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_jh256   256
+
+#define SPH_SIZE_jh512   512
+
+/**
+ * This structure is a context for JH computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a JH computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running JH computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+    // Pending input: one 64-byte block per lane, held as 8 vectors.
+    __m256i buf[8] __attribute__ ((aligned (64)));
+    // Chaining state: 16 words, each vector holding that word for 4 lanes.
+    __m256i H[16];
+    // Number of bytes (per lane) currently held in buf.
+    size_t ptr;
+    // Number of full blocks processed so far.
+    uint64_t block_count;
+    // Layout of the scalar sph_jh context, kept for reference:
+/*
+	unsigned char buf[64]; 
+	size_t ptr;
+	union {
+		sph_u64 wide[16];
+	} H;
+	sph_u64 block_count;
+*/
+} jh_4way_context;
+
+typedef jh_4way_context jh256_4way_context;
+
+typedef jh_4way_context jh512_4way_context;
+
+void jh256_4way_init(void *cc);
+
+void jh256_4way(void *cc, const void *data, size_t len);
+
+void jh256_4way_close(void *cc, void *dst);
+
+void jh512_4way_init(void *cc);
+
+void jh512_4way(void *cc, const void *data, size_t len);
+
+void jh512_4way_close(void *cc, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -0,0 +1,250 @@
+#include "jha-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+//#include "avxdefs.h"
+
+#if defined(JHA_4WAY)
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+//static __thread keccak512_4way_context jha_kec_mid
+//                                   __attribute__ ((aligned (64)));
+
+// 4-way parallel JHA hash.
+// input: four 80-byte block headers interleaved in 4x64-bit lanes.
+// out:   the four 256-bit results, de-interleaved and stored at out,
+//        out+32, out+64 and out+96 (byte offsets).
+//
+// After an initial keccak512, three rounds each run two "pick one of two
+// hash functions" stages. The choice is made per lane from bit 0 of the
+// current hash; both candidates are computed for all lanes and the right
+// one is kept per lane with a vector blend.
+void jha_hash_4way( void *out, const void *input )
+{
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhash0[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhash1[8*4] __attribute__ ((aligned (64)));
+    __m256i mask;
+    __m256i* vh = (__m256i*)vhash;
+    __m256i* vh0 = (__m256i*)vhash0;
+    __m256i* vh1 = (__m256i*)vhash1;
+
+    blake512_4way_context  ctx_blake;
+    hashState_groestl      ctx_groestl;
+    jh512_4way_context     ctx_jh;
+    skein512_4way_context  ctx_skein;
+    keccak512_4way_context ctx_keccak;
+
+    keccak512_4way_init( &ctx_keccak );
+    keccak512_4way( &ctx_keccak, input, 80 );
+    keccak512_4way_close( &ctx_keccak, vhash );
+
+    // Heavy & Light Pair Loop
+    for ( int round = 0; round < 3; round++ )
+    {
+       // Select the next function based on bit 0 of the previous hash.
+       // mask is all-ones in lanes where the bit is set (assuming
+       // mm256_negate_64 is a per-lane two's-complement negation) and
+       // zero elsewhere:  hash = mask ? vhash0 : vhash1
+       mask = mm256_negate_64(
+                     _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
+
+       // groestl (serial, one lane at a time) -> vhash0
+
+       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+       init_groestl( &ctx_groestl, 64 );
+       update_and_final_groestl( &ctx_groestl, (char*)hash0,
+                                 (char*)hash0, 512 );
+       init_groestl( &ctx_groestl, 64 );
+       update_and_final_groestl( &ctx_groestl, (char*)hash1,
+                                 (char*)hash1, 512 );
+       init_groestl( &ctx_groestl, 64 );
+       update_and_final_groestl( &ctx_groestl, (char*)hash2,
+                                 (char*)hash2, 512 );
+       init_groestl( &ctx_groestl, 64 );
+       update_and_final_groestl( &ctx_groestl, (char*)hash3,
+                                 (char*)hash3, 512 );
+
+       mm256_interleave_4x64( vhash0, hash0, hash1, hash2, hash3, 512 );
+
+       // skein -> vhash1
+
+       skein512_4way_init( &ctx_skein );
+       skein512_4way( &ctx_skein, vhash, 64 );
+       skein512_4way_close( &ctx_skein, vhash1 );
+
+       // Merge. _mm256_blendv_epi8( a, b, m ) returns b where m is set,
+       // so vh0 goes second to be chosen where mask is all-ones.
+       for ( int i = 0; i < 8; i++ )
+          vh[i] = _mm256_blendv_epi8( vh1[i], vh0[i], mask );
+
+       // The second selection depends on the hash just produced, so the
+       // mask is recomputed here.
+       // NOTE(review): confirm against the scalar jha_hash that the second
+       // test uses the updated hash rather than the round's initial one.
+       mask = mm256_negate_64(
+                     _mm256_and_si256( vh[0], _mm256_set1_epi64x( 0x1 ) ) );
+
+       // blake -> vhash0, jh -> vhash1
+
+       blake512_4way_init( &ctx_blake );
+       blake512_4way( &ctx_blake, vhash, 64 );
+       blake512_4way_close( &ctx_blake, vhash0 );
+
+       jh512_4way_init( &ctx_jh );
+       jh512_4way( &ctx_jh, vhash, 64 );
+       jh512_4way_close( &ctx_jh, vhash1 );
+
+       // Merge with the same convention as above. This replaces the former
+       // and/or merge, which read mask0 and mask1 without ever setting them.
+       for ( int i = 0; i < 8; i++ )
+          vh[i] = _mm256_blendv_epi8( vh1[i], vh0[i], mask );
+    }
+
+    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
+}
+
+// Scan nonces four at a time, starting at work->data[19], until at least
+// one lane produces a hash that passes fulltest(), n reaches max_nonce, or
+// a restart is flagged for this thread.
+// Returns the number of lanes that found a solution. Per-lane results are
+// reported through work->nfound[] and work->nonces[]; *hashes_done
+// receives the number of nonces covered.
+int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done )
+{
+     uint32_t hash[8*4] __attribute__ ((aligned (64)));
+     uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t Htarg = ptarget[7];
+     uint32_t n = pdata[19];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     // Lane k's nonce slot sits at 32-bit index 73 + 2*k of the
+     // interleaved header (73 = 9*8 + 1).
+     uint32_t *noncep = vdata + 73;
+
+     uint64_t htmax[] = {
+          0,
+          0xF,
+          0xFF,
+          0xFFF,
+          0xFFFF,
+          0x10000000
+     };
+     uint32_t masks[] = {
+          0xFFFFFFFF,
+          0xFFFFFFF0,
+          0xFFFFFF00,
+          0xFFFFF000,
+          0xFFFF0000,
+          0
+     };
+
+   // We need big-endian data. All 20 words are converted so that the
+   // 640-bit interleave below never reads an unset endiandata[19]; the
+   // nonce slots are overwritten for every batch anyway.
+   for ( int i = 0; i < 20; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   for ( int m = 0; m < 6; m++ )
+   {
+      if ( Htarg > htmax[m] )
+         continue;
+
+      // Cheap pre-filter on the top hash word before the full target test.
+      const uint32_t mask = masks[m];
+      do {
+           for ( int lane = 0; lane < 4; lane++ )
+           {
+              found[lane] = false;
+              be32enc( noncep + 2*lane, n + lane );
+           }
+
+           jha_hash_4way( hash, vdata );
+
+           pdata[19] = n;
+
+           for ( int lane = 0; lane < 4; lane++ )
+           {
+              uint32_t *lane_hash = hash + 8*lane;
+
+              if ( !( lane_hash[7] & mask )
+                   && fulltest( lane_hash, ptarget ) )
+              {
+                 found[lane] = true;
+                 num_found++;
+                 nonces[lane] = n + lane;
+                 work_set_target_ratio( work, lane_hash );
+              }
+           }
+           n += 4;
+      } while ( ( num_found == 0 ) && ( n < max_nonce )
+                  && !work_restart[thr_id].restart );
+
+      break;
+   }
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+#endif
--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -0,0 +1,19 @@
+#include "jha-gate.h"
+
+
+// Install the JHA entry points; the 4-way pair is used when JHA_4WAY is defined.
+bool register_jha_algo( algo_gate_t* gate )
+{
+#ifdef JHA_4WAY
+  four_way_not_tested();
+  gate->hash          = (void*)&jha_hash_4way;
+  gate->scanhash      = (void*)&scanhash_jha_4way;
+#else
+  gate->hash          = (void*)&jha_hash;
+  gate->scanhash      = (void*)&scanhash_jha;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT;
+  gate->set_target    = (void*)&scrypt_set_target;
+  return true;
+}
+
--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -0,0 +1,25 @@
+#ifndef JHA_GATE_H__
+#define JHA_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+
+#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI)
+  #define JHA_4WAY
+#endif
+
+#if defined JHA_4WAY
+void jha_hash_4way( void *state, const void *input );
+
+int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done );
+#endif
+
+void jha_hash( void *state, const void *input );
+
+int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+#endif
+
--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -0,0 +1,155 @@
+#include "jha-gate.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "algo/blake/sph_blake.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+
+#ifdef NO_AES_NI
+  #include "algo/groestl/sph_groestl.h"
+#else
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+#endif
+
+static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64)));
+// Restart this thread's jha_kec_mid and absorb the first 64 bytes of input into it.
+void jha_kec_midstate( const void* input )
+{
+    sph_keccak512_init( &jha_kec_mid );
+    sph_keccak512( &jha_kec_mid, input, 64 );
+}
+
+// JHA: Keccak-512 resumed from jha_kec_mid (see jha_kec_midstate) plus input bytes
+// 64..79, then 3 rounds of Groestl|Skein and Blake|JH. Writes 32 bytes to output.
+void jha_hash(void *output, const void *input)
+{
+	uint8_t _ALIGN(128) hash[64];
+	const uint8_t *in = (const uint8_t*)input; // void* arithmetic is a GNU extension
+#ifdef NO_AES_NI
+	sph_groestl512_context ctx_groestl;
+#else
+	hashState_groestl      ctx_groestl;
+#endif
+	sph_blake512_context ctx_blake;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+	// clone the saved midstate, then absorb the 16 bytes at offset 64
+	memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
+	sph_keccak512( &ctx_keccak, in + 64, 16 );
+	sph_keccak512_close( &ctx_keccak, hash );
+	// Heavy & Light Pair Loop
+	for ( int round = 0; round < 3; round++ )
+	{
+		if ( hash[0] & 0x01 )
+		{
+#ifdef NO_AES_NI
+			sph_groestl512_init( &ctx_groestl );
+			sph_groestl512( &ctx_groestl, hash, 64 );
+			sph_groestl512_close( &ctx_groestl, hash );
+#else
+			init_groestl( &ctx_groestl, 64 );
+			update_and_final_groestl( &ctx_groestl, (char*)hash,
+			                          (char*)hash, 512 );
+#endif
+		}
+		else
+		{
+			sph_skein512_init( &ctx_skein );
+			sph_skein512( &ctx_skein, hash, 64 );
+			sph_skein512_close( &ctx_skein, hash );
+		}
+		// bit 0 is tested again: hash now holds the output of the step above
+		if ( hash[0] & 0x01 )
+		{
+			sph_blake512_init( &ctx_blake );
+			sph_blake512( &ctx_blake, hash, 64 );
+			sph_blake512_close( &ctx_blake, hash );
+		}
+		else
+		{
+			sph_jh512_init( &ctx_jh );
+			sph_jh512( &ctx_jh, hash, 64 );
+			sph_jh512_close( &ctx_jh, hash );
+		}
+	}
+	memcpy( output, hash, 32 );
+}
+
+// Scalar JHA scanner. Tries nonces from pdata[19] upward until one passes
+// fulltest() (returns 1), or until n reaches max_nonce or a restart is flagged
+// for thr_id (returns 0).
+// *hashes_done is set to n - first_nonce + 1 and pdata[19] holds the last
+// value of n on return.
+int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+{
+	// Cheap pre-filter picked from the top target word: the larger Htarg is,
+	// the fewer high bits of hash32[7] must be zero before fulltest() runs.
+	static const uint64_t htmax[6] =
+		{ 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
+	static const uint32_t masks[6] =
+		{ 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, 0xFFFFF000, 0xFFFF0000, 0 };
+
+	uint32_t _ALIGN(128) hash32[8];
+	uint32_t _ALIGN(128) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+	// one below the start: the loop pre-increments before hashing
+	uint32_t n = first_nonce - 1;
+	int level = 0;
+
+	// we need bigendian data... (word 19, the nonce, is encoded per iteration)
+	for ( int w = 0; w < 19; w++ )
+		be32enc( &endiandata[w], pdata[w] );
+
+	// the first 64 bytes do not change during the scan, absorb them once
+	jha_kec_midstate( endiandata );
+
+#ifdef DEBUG_ALGO
+	printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
+
+	// see blake.c if else to understand the loop on htmax => mask
+	while ( level < 6 && Htarg > htmax[level] )
+		level++;
+
+	// NOTE(review): when Htarg is above the last bound no level matches and
+	// no nonce is tried at all; n then stays at first_nonce - 1.
+	if ( level < 6 )
+	{
+		const uint32_t mask = masks[level];
+
+		do {
+			pdata[19] = ++n;
+			be32enc( &endiandata[19], n );
+			jha_hash( hash32, endiandata );
+#ifdef DEBUG_ALGO
+			if ( !(n % 0x1000) && !thr_id ) printf(".");
+#endif
+			// pre-filter miss: skip to the loop condition
+			if ( hash32[7] & mask )
+				continue;
+#ifdef DEBUG_ALGO
+			printf("[%d]",thr_id);
+#endif
+			if ( fulltest( hash32, ptarget ) )
+			{
+				work_set_target_ratio( work, hash32 );
+				*hashes_done = n - first_nonce + 1;
+				return 1;
+			}
+		} while ( n < max_nonce && !work_restart[thr_id].restart );
+	}
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
+
--- a/algo/jh/sph_jh.c
+++ b/algo/jh/sph_jh.c
@@ -914,6 +914,7 @@ jh_core(sph_jh_context *sc, const void *data, size_t len)

 	buf = sc->buf;
 	ptr = sc->ptr;
+
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
--- a/algo/jh/sse2/jh_sse2_opt64.h
+++ b/algo/jh/sse2/jh_sse2_opt64.h
@@ -22,15 +22,12 @@
 */


-
 #include <emmintrin.h>
 #include <stdint.h>
 #include <string.h>
+#include "algo/sha/sha3-defs.h"

 typedef __m128i  word128;   /*word128 defines a 128-bit SSE2 word*/
-
-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
 typedef enum {jhSUCCESS = 0, jhFAIL = 1, jhBAD_HASHLEN = 2} jhReturn;

 /*define data alignment for different C compilers*/
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -0,0 +1,95 @@
+#include "keccak-gate.h"
+
+#ifdef KECCAK_4WAY
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "sph_keccak.h"
+#include "keccak-hash-4way.h"
+
+// 4-way Keccak-256 of 80 byte inputs; lane k's 32 byte digest lands at state + 32*k.
+void keccakhash_4way(void *state, const void *input)
+{
+    uint64_t vhash[4*4] __attribute__ ((aligned (64)));
+    uint64_t *out = (uint64_t*)state;  // typed: void* arithmetic is a GNU extension
+    keccak256_4way_context ctx;
+    keccak256_4way_init( &ctx );
+    keccak256_4way( &ctx, input, 80 );
+    keccak256_4way_close( &ctx, vhash );
+    mm256_deinterleave_4x64( out, out+4, out+8, out+12, vhash, 256 );
+}
+
+// 4-way Keccak scanner: each pass hashes nonces n .. n+3, starting at pdata[19].
+// A lane whose hash passes fulltest() gets work->nfound[lane] set and its nonce
+// stored in work->nonces[lane]. Returns the number of hits of the last pass;
+// *hashes_done receives n - first_nonce + 1.
+int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done)
+{
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   // Accessed through a uint64_t* below, hence the explicit alignment. It is
+   // zero filled because all 20 words are interleaved while only 19 are
+   // encoded here (the nonce word is patched per lane inside the loop).
+   uint32_t endiandata[20] __attribute__ ((aligned (32))) = { 0 };
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   // nonce slot of lanes 0..3 in the interleaved data, as 32 bit indexes
+   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+   uint32_t *noncep1 = vdata + 75;
+   uint32_t *noncep2 = vdata + 77;
+   uint32_t *noncep3 = vdata + 79;
+
+   for ( int i = 0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+
+   // replicate the same 640 bit header into all four lanes
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n   );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+
+      keccakhash_4way( hash, vdata );
+
+      // Store the current base nonce on every pass; it used to be written
+      // only when lane 0 produced a hit, so pdata[19] did not otherwise move.
+      pdata[19] = n;
+
+      for ( int lane = 0; lane < 4; lane++ )
+      {
+         uint32_t *lane_hash = hash + 8*lane;
+
+         // cheap test on the top word first, full compare only if it passes
+         if ( ( ( lane_hash[7] & 0xFFFFFF00 ) == 0 )
+              && fulltest( lane_hash, ptarget ) )
+         {
+            found[lane] = true;
+            nonces[lane] = n + lane;
+            num_found++;
+         }
+      }
+      n += 4;
+
+   // NOTE(review): max_nonce-4 wraps around if max_nonce is ever below 4.
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   // NOTE(review): n has already advanced past the last nonce hashed, so
+   // this value is one more than the number of nonces actually hashed.
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -0,0 +1,46 @@
+#include "keccak-gate.h"
+
+void keccak_set_target( struct work* work, double job_diff )
+{  // the work target is set from job_diff divided by 128 * opt_diff_factor
+  work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
+}
+
+int64_t keccak_get_max64(void) { return 0x7ffffLL; }  // installed as gate->get_max64; (void) makes this a real prototype
+
+bool register_keccak_algo( algo_gate_t* gate )
+{  // keccak gate: 4-way hash/scanhash when KECCAK_4WAY is defined, scalar otherwise
+#ifdef KECCAK_4WAY
+  gate->hash            = (void*)&keccakhash_4way;
+  gate->scanhash        = (void*)&scanhash_keccak_4way;
+#else
+  gate->hash            = (void*)&keccakhash;
+  gate->scanhash        = (void*)&scanhash_keccak;
+#endif
+  gate->get_max64       = (void*)&keccak_get_max64;
+  gate->set_target      = (void*)&keccak_set_target;
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  gate->optimizations   = FOUR_WAY_OPT;
+  return true;
+}
+
+void keccakc_set_target( struct work* work, double job_diff )
+{  // same as keccak_set_target but with a divisor of 256 * opt_diff_factor
+  work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool register_keccakc_algo( algo_gate_t* gate )
+{  // keccakc gate: differs from keccak in its merkle root and set_target entries
+#ifdef KECCAK_4WAY
+  gate->hash            = (void*)&keccakhash_4way;
+  gate->scanhash        = (void*)&scanhash_keccak_4way;
+#else
+  gate->hash            = (void*)&keccakhash;
+  gate->scanhash        = (void*)&scanhash_keccak;
+#endif
+  gate->get_max64       = (void*)&keccak_get_max64;
+  gate->set_target      = (void*)&keccakc_set_target;
+  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
+  gate->optimizations   = FOUR_WAY_OPT;
+  return true;
+}
+
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -0,0 +1,23 @@
+#ifndef KECCAK_GATE_H__
+#define KECCAK_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(FOUR_WAY) && defined(__AVX2__)
+  #define KECCAK_4WAY
+#endif
+
+#if defined(KECCAK_4WAY)
+
+void keccakhash_4way( void *state, const void *input );
+int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#endif
+
+void keccakhash( void *state, const void *input );
+int scanhash_keccak( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+#endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -0,0 +1,503 @@
+#include <stddef.h>
+#include "keccak-hash-4way.h"
+
+#if defined(__AVX2__)
+
+static const sph_u64 RC[] = {   /* 24 round constants, XORed into a00 by IOTA, one per round */
+        SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
+        SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
+        SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
+        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
+        SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
+        SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
+        SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
+        SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
+        SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
+        SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
+        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
+        SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
+};
+
+#define a00   (kc->w[ 0])
+#define a10   (kc->w[ 1])
+#define a20   (kc->w[ 2])
+#define a30   (kc->w[ 3])
+#define a40   (kc->w[ 4])
+#define a01   (kc->w[ 5])
+#define a11   (kc->w[ 6])
+#define a21   (kc->w[ 7])
+#define a31   (kc->w[ 8])
+#define a41   (kc->w[ 9])
+#define a02   (kc->w[10])
+#define a12   (kc->w[11])
+#define a22   (kc->w[12])
+#define a32   (kc->w[13])
+#define a42   (kc->w[14])
+#define a03   (kc->w[15])
+#define a13   (kc->w[16])
+#define a23   (kc->w[17])
+#define a33   (kc->w[18])
+#define a43   (kc->w[19])
+#define a04   (kc->w[20])
+#define a14   (kc->w[21])
+#define a24   (kc->w[22])
+#define a34   (kc->w[23])
+#define a44   (kc->w[24])
+
+#define DECL_STATE
+#define READ_STATE(sc)
+#define WRITE_STATE(sc)
+
+#define INPUT_BUF(size)   do { \
+    size_t j; \
+    for (j = 0; j < (size>>3); j++ ) \
+        kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
+} while (0)
+
+#define DECL64(x)        __m256i x
+#define MOV64(d, s)      (d = s)
+#define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
+#define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
+#define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
+#define NOT64(d, s)      (d = _mm256_xor_si256(s,mm256_neg1))
+#define ROL64(d, v, n)   (d = mm256_rotl_64(v, n))
+#define XOR64_IOTA       XOR64
+
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
+                DECL64(tt0); \
+                DECL64(tt1); \
+                DECL64(tt2); \
+                DECL64(tt3); \
+                XOR64(tt0, d0, d1); \
+                XOR64(tt1, d2, d3); \
+                XOR64(tt0, tt0, d4); \
+                XOR64(tt0, tt0, tt1); \
+                ROL64(tt0, tt0, 1); \
+                XOR64(tt2, c0, c1); \
+                XOR64(tt3, c2, c3); \
+                XOR64(tt0, tt0, c4); \
+                XOR64(tt2, tt2, tt3); \
+                XOR64(t, tt0, tt2); \
+        } while (0)
+
+#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(t0); \
+                DECL64(t1); \
+                DECL64(t2); \
+                DECL64(t3); \
+                DECL64(t4); \
+                TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
+                TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
+                TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
+                TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
+                TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
+                XOR64(b00, b00, t0); \
+                XOR64(b01, b01, t0); \
+                XOR64(b02, b02, t0); \
+                XOR64(b03, b03, t0); \
+                XOR64(b04, b04, t0); \
+                XOR64(b10, b10, t1); \
+                XOR64(b11, b11, t1); \
+                XOR64(b12, b12, t1); \
+                XOR64(b13, b13, t1); \
+                XOR64(b14, b14, t1); \
+                XOR64(b20, b20, t2); \
+                XOR64(b21, b21, t2); \
+                XOR64(b22, b22, t2); \
+                XOR64(b23, b23, t2); \
+                XOR64(b24, b24, t2); \
+                XOR64(b30, b30, t3); \
+                XOR64(b31, b31, t3); \
+                XOR64(b32, b32, t3); \
+                XOR64(b33, b33, t3); \
+                XOR64(b34, b34, t3); \
+                XOR64(b40, b40, t4); \
+                XOR64(b41, b41, t4); \
+                XOR64(b42, b42, t4); \
+                XOR64(b43, b43, t4); \
+                XOR64(b44, b44, t4); \
+        } while (0)
+
+#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                /* ROL64(b00, b00,  0); */ \
+                ROL64(b01, b01, 36); \
+                ROL64(b02, b02,  3); \
+                ROL64(b03, b03, 41); \
+                ROL64(b04, b04, 18); \
+                ROL64(b10, b10,  1); \
+                ROL64(b11, b11, 44); \
+                ROL64(b12, b12, 10); \
+                ROL64(b13, b13, 45); \
+                ROL64(b14, b14,  2); \
+                ROL64(b20, b20, 62); \
+                ROL64(b21, b21,  6); \
+                ROL64(b22, b22, 43); \
+                ROL64(b23, b23, 15); \
+                ROL64(b24, b24, 61); \
+                ROL64(b30, b30, 28); \
+                ROL64(b31, b31, 55); \
+                ROL64(b32, b32, 25); \
+                ROL64(b33, b33, 21); \
+                ROL64(b34, b34, 56); \
+                ROL64(b40, b40, 27); \
+                ROL64(b41, b41, 20); \
+                ROL64(b42, b42, 39); \
+                ROL64(b43, b43,  8); \
+                ROL64(b44, b44, 14); \
+        } while (0)
+
+/*
+ * The KHI macro integrates the "lane complement" optimization. On input,
+ * some words are complemented:
+ *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
+ * On output, the following words are complemented:
+ *    a04 a10 a20 a22 a23 a31
+ *
+ * The (implicit) permutation and the theta expansion will bring back
+ * the input mask for the next round.
+ */
+
+#define KHI_XO(d, a, b, c)   do { \
+                DECL64(kt); \
+                OR64(kt, b, c); \
+                XOR64(d, a, kt); \
+        } while (0)
+
+#define KHI_XA(d, a, b, c)   do { \
+                DECL64(kt); \
+                AND64(kt, b, c); \
+                XOR64(d, a, kt); \
+        } while (0)
+
+#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(c0); \
+                DECL64(c1); \
+                DECL64(c2); \
+                DECL64(c3); \
+                DECL64(c4); \
+                DECL64(bnn); \
+                NOT64(bnn, b20); \
+                KHI_XO(c0, b00, b10, b20); \
+                KHI_XO(c1, b10, bnn, b30); \
+                KHI_XA(c2, b20, b30, b40); \
+                KHI_XO(c3, b30, b40, b00); \
+                KHI_XA(c4, b40, b00, b10); \
+                MOV64(b00, c0); \
+                MOV64(b10, c1); \
+                MOV64(b20, c2); \
+                MOV64(b30, c3); \
+                MOV64(b40, c4); \
+                NOT64(bnn, b41); \
+                KHI_XO(c0, b01, b11, b21); \
+                KHI_XA(c1, b11, b21, b31); \
+                KHI_XO(c2, b21, b31, bnn); \
+                KHI_XO(c3, b31, b41, b01); \
+                KHI_XA(c4, b41, b01, b11); \
+                MOV64(b01, c0); \
+                MOV64(b11, c1); \
+                MOV64(b21, c2); \
+                MOV64(b31, c3); \
+                MOV64(b41, c4); \
+                NOT64(bnn, b32); \
+                KHI_XO(c0, b02, b12, b22); \
+                KHI_XA(c1, b12, b22, b32); \
+                KHI_XA(c2, b22, bnn, b42); \
+                KHI_XO(c3, bnn, b42, b02); \
+                KHI_XA(c4, b42, b02, b12); \
+                MOV64(b02, c0); \
+                MOV64(b12, c1); \
+                MOV64(b22, c2); \
+                MOV64(b32, c3); \
+                MOV64(b42, c4); \
+                NOT64(bnn, b33); \
+                KHI_XA(c0, b03, b13, b23); \
+                KHI_XO(c1, b13, b23, b33); \
+                KHI_XO(c2, b23, bnn, b43); \
+                KHI_XA(c3, bnn, b43, b03); \
+                KHI_XO(c4, b43, b03, b13); \
+                MOV64(b03, c0); \
+                MOV64(b13, c1); \
+                MOV64(b23, c2); \
+                MOV64(b33, c3); \
+                MOV64(b43, c4); \
+                NOT64(bnn, b14); \
+                KHI_XA(c0, b04, bnn, b24); \
+                KHI_XO(c1, bnn, b24, b34); \
+                KHI_XA(c2, b24, b34, b44); \
+                KHI_XO(c3, b34, b44, b04); \
+                KHI_XA(c4, b44, b04, b14); \
+                MOV64(b04, c0); \
+                MOV64(b14, c1); \
+                MOV64(b24, c2); \
+                MOV64(b34, c3); \
+                MOV64(b44, c4); \
+        } while (0)
+
+#define IOTA(r)   XOR64_IOTA(a00, a00, r)
+
+#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
+              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
+#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
+              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
+#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
+              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
+#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
+              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
+#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
+              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
+#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
+              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
+#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
+              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
+#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
+              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
+#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
+              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
+#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
+              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
+#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
+              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
+#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
+              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
+#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
+              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
+#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
+              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
+#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
+              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
+#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
+              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
+#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
+              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
+#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
+              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
+#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
+              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
+#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
+              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
+#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
+              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
+#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
+              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
+#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
+              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
+#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
+              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
+
+#define P8_TO_P0   do { \
+                DECL64(t); \
+                MOV64(t, a01); \
+                MOV64(a01, a11); \
+                MOV64(a11, a43); \
+                MOV64(a43, t); \
+                MOV64(t, a02); \
+                MOV64(a02, a22); \
+                MOV64(a22, a31); \
+                MOV64(a31, t); \
+                MOV64(t, a03); \
+                MOV64(a03, a33); \
+                MOV64(a33, a24); \
+                MOV64(a24, t); \
+                MOV64(t, a04); \
+                MOV64(a04, a44); \
+                MOV64(a44, a12); \
+                MOV64(a12, t); \
+                MOV64(t, a10); \
+                MOV64(a10, a32); \
+                MOV64(a32, a13); \
+                MOV64(a13, t); \
+                MOV64(t, a14); \
+                MOV64(a14, a21); \
+                MOV64(a21, a20); \
+                MOV64(a20, t); \
+                MOV64(t, a23); \
+                MOV64(a23, a42); \
+                MOV64(a42, a40); \
+                MOV64(a40, t); \
+                MOV64(t, a30); \
+                MOV64(a30, a41); \
+                MOV64(a41, a34); \
+                MOV64(a34, t); \
+        } while (0)
+
+#define LPAR   (
+#define RPAR   )
+
+#define KF_ELT(r, s, k)   do { \
+                THETA LPAR P ## r RPAR; \
+                RHO LPAR P ## r RPAR; \
+                KHI LPAR P ## s RPAR; \
+                IOTA(k); \
+        } while (0)
+
+#define DO(x)   x
+
+#define KECCAK_F_1600   DO(KECCAK_F_1600_)
+
+/*
+ * The complete permutation: 24 rounds run as 3 passes of 8 KF_ELT steps.
+ * Step i applies THETA and RHO under lane naming P<i>, KHI under P<i+1>, and
+ * XORs round constant RC[rnd + i], broadcast to all four 64 bit elements,
+ * into a00. P8_TO_P0 then shuffles the lanes so that the next pass can start
+ * from the P0 naming again.
+ * Expects a keccak64_ctx_m256i pointer named kc in the expanding scope.
+ */
+#define KECCAK_F_1600_   do { \
+    int rnd; \
+    for ( rnd = 0; rnd < 24; rnd += 8 ) \
+    { \
+       KF_ELT( 0,  1, _mm256_set1_epi64x( RC[rnd + 0] ) ); \
+       KF_ELT( 1,  2, _mm256_set1_epi64x( RC[rnd + 1] ) ); \
+       KF_ELT( 2,  3, _mm256_set1_epi64x( RC[rnd + 2] ) ); \
+       KF_ELT( 3,  4, _mm256_set1_epi64x( RC[rnd + 3] ) ); \
+       KF_ELT( 4,  5, _mm256_set1_epi64x( RC[rnd + 4] ) ); \
+       KF_ELT( 5,  6, _mm256_set1_epi64x( RC[rnd + 5] ) ); \
+       KF_ELT( 6,  7, _mm256_set1_epi64x( RC[rnd + 6] ) ); \
+       KF_ELT( 7,  8, _mm256_set1_epi64x( RC[rnd + 7] ) ); \
+       P8_TO_P0; \
+    } \
+} while (0)
+
+
+// Reset a 4-way context: clear the 25 state words, preset the six lanes used by
+// the "lane complement" to all ones, and set lim = 200 - out_size/4.
+static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
+{
+   // indexes into kc->w of the lanes that start out complemented
+   static const int complemented[6] = { 1, 2, 8, 12, 17, 20 };
+   int k;
+
+   for ( k = 0; k < 25; k++ )
+      kc->w[k] = _mm256_setzero_si256();
+   for ( k = 0; k < 6; k++ )
+      kc->w[ complemented[k] ] = mm256_neg1;
+
+   kc->lim = 200 - (out_size >> 2);
+   kc->ptr = 0;
+}
+
+static void
+keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
+               size_t lim )
+{   // Absorb: buffer input and, whenever lim bytes are pending, XOR them into kc->w and run KECCAK_F_1600.
+    __m256i *buf;
+    __m256i *vdata = (__m256i*)data;
+    size_t ptr;
+    DECL_STATE
+    // len, lim and ptr are byte counts; >>3 turns them into __m256i indexes
+    buf = kc->buf;
+    ptr = kc->ptr;
+    // short input that does not complete a block: just buffer it and return
+    if ( len < (lim - ptr) )
+    {
+        memcpy_256( buf + (ptr>>3), vdata, len>>3 );
+        kc->ptr = ptr + len;
+        return;
+    }
+    // otherwise consume the input block by block
+    READ_STATE( kc );
+    while ( len > 0 )
+    {
+        size_t clen;
+        // clen: what still fits in the current block, capped by what is left
+        clen = (lim - ptr);
+        if ( clen > len )
+             clen = len;
+        memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
+        ptr += clen;
+        vdata = vdata + (clen>>3);
+        len -= clen;
+        if ( ptr == lim )
+        {
+            INPUT_BUF( lim );
+            KECCAK_F_1600;
+            ptr = 0;
+        }
+    }
+    WRITE_STATE( kc );
+    kc->ptr = ptr;
+}
+
+// Finish a 4-way hash: pad the pending block (1 in its first free word, bit 63 set
+// in its last), absorb it, undo the lane complement, copy byte_len bytes per lane.
+static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
+            size_t lim )
+{
+    // Fixed scratch: a VLA member inside a union is a GNU-only extension. 25
+    // words hold a whole state; the callers below pass lim = 136 or 72 only.
+    __m256i tmp[25];
+    const uint64_t first = 0x100 >> 8;
+    const uint64_t last = 0x8000000000000000;
+    const size_t m256_len = byte_len >> 3;
+    size_t j;
+    if ( kc->ptr == (lim - 8) )
+    {
+        // a single word is left: both padding marks share it
+        const uint64_t t = first | last;
+        tmp[0] = _mm256_set_epi64x( t, t, t, t );
+        j = 8;
+    }
+    else
+    {
+        j = lim - kc->ptr;
+        tmp[0] = _mm256_set_epi64x( first, first, first, first );
+        memset_zero_256( tmp + 1, (j>>3) - 2 );
+        tmp[ (j>>3) - 1 ] = _mm256_set_epi64x( last, last, last, last );
+    }
+    keccak64_core( kc, tmp, j, lim );
+    /* Finalize the "lane complement" */
+    NOT64( kc->w[ 1], kc->w[ 1] );
+    NOT64( kc->w[ 2], kc->w[ 2] );
+    NOT64( kc->w[ 8], kc->w[ 8] );
+    NOT64( kc->w[12], kc->w[12] );
+    NOT64( kc->w[17], kc->w[17] );
+    NOT64( kc->w[20], kc->w[20] );
+    for ( j = 0; j < m256_len; j++ )
+         tmp[j] = kc->w[j];
+    memcpy_256( dst, tmp, m256_len );
+}
+
+void keccak256_4way_init( void *kc )
+{  // Keccak-256 setup: out_size 256 makes keccak64_init store lim = 136
+   keccak64_init( kc, 256 );
+}
+// Keccak-256 update: forwards to keccak64_core with a 136 byte block size
+void
+keccak256_4way(void *cc, const void *data, size_t len)
+{
+    keccak64_core(cc, data, len, 136);
+}
+// Keccak-256 finish: keccak64_close with byte_len 32 and block size 136
+void
+keccak256_4way_close(void *cc, void *dst)
+{
+    keccak64_close(cc, dst, 32, 136);
+}
+// Keccak-512 setup: out_size 512 makes keccak64_init store lim = 72
+void keccak512_4way_init( void *kc )
+{
+   keccak64_init( kc, 512 );
+}
+// Keccak-512 update: forwards to keccak64_core with a 72 byte block size
+void
+keccak512_4way(void *cc, const void *data, size_t len)
+{
+        keccak64_core(cc, data, len, 72);
+}
+// Keccak-512 finish: keccak64_close with byte_len 64 and block size 72
+void
+keccak512_4way_close(void *cc, void *dst)
+{
+        keccak64_close(cc, dst, 64, 72);
+}
+
+#endif
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -0,0 +1,94 @@
+/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * 4-way (AVX2) Keccak interface, derived from sph_keccak.h. It hashes
+ * four interleaved inputs at once; only the 256 and 512 bit output
+ * lengths are provided here.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_keccak.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef KECCAK_HASH_4WAY_H__
+#define KECCAK_HASH_4WAY_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef  __AVX2__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_keccak256   256
+
+/**
+ * Output size (in bits) for Keccak-512.
+ */
+#define SPH_SIZE_keccak512   512
+
+/**
+ * This structure is a context for Keccak computations: it contains the
+ * intermediate values and some data from the last entered block. Once a
+ * Keccak computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running Keccak computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+
+typedef struct {
+        __m256i buf[144*8];    /* first field, for alignment */
+        __m256i w[25];         /* the 25 state lanes, each holding four 64 bit elements */
+        size_t ptr, lim;       /* ptr: bytes pending in buf; lim: 200 - out_size/4, set by keccak64_init */
+// NOTE(review): keccak64_core indexes buf with ptr>>3 where ptr < lim, so 144*8 entries look oversized - confirm
+} keccak64_ctx_m256i;
+
+typedef keccak64_ctx_m256i keccak256_4way_context;
+typedef keccak64_ctx_m256i keccak512_4way_context;
+
+void keccak256_4way_init(void *cc);
+void keccak256_4way(void *cc, const void *data, size_t len);
+void keccak256_4way_close(void *cc, void *dst);
+
+
+void keccak512_4way_init(void *cc);
+void keccak512_4way(void *cc, const void *data, size_t len);
+void keccak512_4way_close(void *cc, void *dst);
+void keccak512_4way_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/keccak/keccak.c
+++ b/algo/keccak/keccak.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
@@ -51,17 +50,3 @@ int scanhash_keccak(int thr_id, struct work *work,
 	return 0;
 }

-void keccak_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
-}
-
-bool register_keccak_algo( algo_gate_t* gate )
-{
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
-  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&keccak_set_target;
-  return true;
-};
-
--- a/algo/keccak/sph_keccak.c
+++ b/algo/keccak/sph_keccak.c
@@ -955,6 +955,7 @@ static const struct {

 #endif

+
 #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
 		DECL64(tt0); \
 		DECL64(tt1); \
@@ -1643,8 +1644,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
 		for (j = 0; j < d; j += 8) \
 			sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \
 		memcpy(dst, u.tmp, d); \
-		keccak_init(kc, (unsigned)d << 3); \
-	} \
+}

 #else

--- a/algo/lbry.c
+++ b/algo/lbry.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"
 #include <stdlib.h>
 #include <stdint.h>
@@ -6,9 +5,7 @@
 #include <stdio.h>
 #include "ripemd/sph_ripemd.h"
 #include "sha/sph_sha2.h"
-#if defined __SHA__
- #include <openssl/sha.h>
-#endif
+#include <openssl/sha.h>

 #define LBRY_NTIME_INDEX 25
 #define LBRY_NBITS_INDEX 26
@@ -16,45 +13,22 @@
 #define LBRY_WORK_DATA_SIZE 192
 #define LBRY_WORK_CMP_SIZE 76  // same as default

-/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
-typedef struct {
-#if defined __SHA__
-   SHA256_CTX             sha256;
-#else
-   sph_sha256_context     sha256;
-#endif
-   sph_sha512_context     sha512;
-   sph_ripemd160_context  ripemd;
-} lbryhash_context_holder;
-
-/* no need to copy, because close reinit the context */
-static  lbryhash_context_holder ctx __attribute__ ((aligned (64)));
-
-void init_lbry_contexts(void *dummy)
-{
-#if defined __SHA__
-   SHA256_Init( &ctx.sha256 );
-#else
-   sph_sha256_init( &ctx.sha256 );
-#endif
-   sph_sha512_init( &ctx.sha512 );
-   sph_ripemd160_init( &ctx.ripemd );
-}

 void lbry_hash(void* output, const void* input)
 {
-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_CTX              ctx_sha256 __attribute__ ((aligned (64)));
+   SHA512_CTX              ctx_sha512 __attribute__ ((aligned (64)));
 #else
   sph_sha256_context      ctx_sha256 __attribute__ ((aligned (64)));
-#endif
   sph_sha512_context      ctx_sha512 __attribute__ ((aligned (64)));
+#endif
   sph_ripemd160_context   ctx_ripemd __attribute__ ((aligned (64)));
   uint32_t _ALIGN(64) hashA[16];
   uint32_t _ALIGN(64) hashB[16];
   uint32_t _ALIGN(64) hashC[16];

-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, input, 112 );
   SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
@@ -62,6 +36,10 @@ void lbry_hash(void* output, const void* input)
   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, hashA, 32 );
   SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
+
+   SHA512_Init( &ctx_sha512 );
+   SHA512_Update( &ctx_sha512, hashA, 32 );
+   SHA512_Final( (unsigned char*) hashA, &ctx_sha512 );
 #else
   sph_sha256_init( &ctx_sha256 );
   sph_sha256 ( &ctx_sha256, input, 112 );
@@ -70,11 +48,11 @@ void lbry_hash(void* output, const void* input)
   sph_sha256_init( &ctx_sha256 );
   sph_sha256 ( &ctx_sha256, hashA, 32 );
   sph_sha256_close( &ctx_sha256, hashA );
-#endif

   sph_sha512_init( &ctx_sha512 );
   sph_sha512 ( &ctx_sha512, hashA, 32 );
-   sph_sha512_close( &ctx_sha512, hashA );  
+   sph_sha512_close( &ctx_sha512, hashA );
+#endif

   sph_ripemd160_init( &ctx_ripemd );
   sph_ripemd160 ( &ctx_ripemd, hashA, 32 );
@@ -84,7 +62,7 @@ void lbry_hash(void* output, const void* input)
   sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
   sph_ripemd160_close( &ctx_ripemd, hashC );

-#if defined __SHA__
+#ifndef USE_SPH_SHA
   SHA256_Init( &ctx_sha256 );
   SHA256_Update( &ctx_sha256, hashB, 20 );
   SHA256_Update( &ctx_sha256, hashC, 20 );
@@ -243,7 +221,7 @@ int64_t lbry_get_max64() { return 0x1ffffLL; }

 bool register_lbry_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
  gate->scanhash              = (void*)&scanhash_lbry;
  gate->hash                  = (void*)&lbry_hash;
  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
--- a/algo/luffa/luffa.c
+++ b/algo/luffa/luffa.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
--- a/algo/luffa/sse2/luffa_for_sse2.c
+++ b/algo/luffa/sse2/luffa_for_sse2.c
@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
+                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    if ( state->rembytes  )
    {
      // remaining data bytes
-      casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) );
+      casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) );
      // padding of partial block
      casti_m128i( state->buffer, 1 ) =
            _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ),
+                      mm_byteswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    {
      // padding of partial block
      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
-                      mm_byteswap_epi32( cast_m128i( data ) ) );
+                      mm_byteswap_32( cast_m128i( data ) ) );
    }
    else
    {
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 0 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );

    rnd512( state, zero, zero );

@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 1 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) );
 }

 #else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 0 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 1 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );

    rnd512( state, zero, zero );

@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) );
 }
 #endif

--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -377,7 +377,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
   uint64_t *wholeMatrix = _mm_malloc( i, 64 );
   if (wholeMatrix == NULL)
      return -1;
-
+/*
 #if defined (__AVX2__)
   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
 #elif defined(__AVX__)
@@ -385,7 +385,7 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
 #else
   memset(wholeMatrix, 0, i);
 #endif
-
+*/
   uint64_t *ptrWord = wholeMatrix;

   //=== Getting the password + salt + basil padded with 10*1 ==========//
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -21,8 +21,9 @@
 #define LYRA2_H_

 #include <stdint.h>
+#include "algo/sha/sha3-defs.h"

-typedef unsigned char byte;
+//typedef unsigned char byte;

 //Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
 #define BLOCK_LEN_BLAKE2_SAFE_INT64 8                                   //512 bits (=64 bytes, =8 uint64_t)
--- a/algo/lyra2/lyra2h.c
+++ b/algo/lyra2/lyra2h.c
@@ -0,0 +1,93 @@
+#include <memory.h>
+#include <mm_malloc.h>
+#include "algo-gate-api.h"
+#include "lyra2.h"
+#include "algo/blake/sph_blake.h"
+
+__thread uint64_t* lyra2h_matrix;
+
+bool lyra2h_thread_init()
+{
+   const int i = 16 * 16 * 96;
+   lyra2h_matrix = _mm_malloc( i, 64 );
+   return lyra2h_matrix;
+}
+
+static __thread sph_blake256_context lyra2h_blake_mid;
+
+void lyra2h_midstate( const void* input )
+{
+       sph_blake256_init( &lyra2h_blake_mid );
+       sph_blake256( &lyra2h_blake_mid, input, 64 );
+}
+
+// Finish blake256 from the midstate saved by lyra2h_midstate() (first 64 input
+// bytes) with the next 16 input bytes, run LYRA2Z on the digest in place and
+// copy the first 32 bytes of the result to state.
+void lyra2h_hash( void *state, const void *input )
+{
+        uint32_t _ALIGN(64) hash[16];
+        sph_blake256_context ctx_blake __attribute__ ((aligned (64)));
+
+        memcpy( &ctx_blake, &lyra2h_blake_mid, sizeof lyra2h_blake_mid );
+        sph_blake256( &ctx_blake, (const uint8_t*)input + 64, 16 );
+        sph_blake256_close( &ctx_blake, hash );
+        LYRA2Z( lyra2h_matrix, hash, 32, hash, 32, hash, 32, 16, 16, 16 );
+        memcpy( state, hash, 32 );
+}
+
+int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done )
+{
+	uint32_t _ALIGN(64) hash[8];
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t Htarg = ptarget[7];
+	const uint32_t first_nonce = pdata[19];
+	uint32_t nonce = first_nonce;
+
+	if (opt_benchmark)
+		ptarget[7] = 0x0000ff;
+
+	for (int i=0; i < 19; i++) {
+		be32enc(&endiandata[i], pdata[i]);
+	}
+
+        lyra2h_midstate( endiandata );
+
+	do {
+		be32enc(&endiandata[19], nonce);
+                lyra2h_hash( hash, endiandata );
+
+		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
+			work_set_target_ratio(work, hash);
+			pdata[19] = nonce;
+			*hashes_done = pdata[19] - first_nonce;
+			return 1;
+		}
+		nonce++;
+
+	} while (nonce < max_nonce && !work_restart[thr_id].restart);
+
+	pdata[19] = nonce;
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}
+
+void lyra2h_set_target( struct work* work, double job_diff )
+{
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool register_lyra2h_algo( algo_gate_t* gate )
+{
+  gate->optimizations = AVX_OPT | AVX2_OPT;
+  gate->miner_thread_init = (void*)&lyra2h_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2h;
+  gate->hash       = (void*)&lyra2h_hash;
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  gate->set_target = (void*)&lyra2h_set_target;
+  return true;
+};
+
--- a/algo/lyra2/lyra2re.c
+++ b/algo/lyra2/lyra2re.c
@@ -1,6 +1,5 @@
 #include <memory.h>

-#include "miner.h"
 #include "algo/blake/sph_blake.h"
 #include "algo/groestl/sph_groestl.h"
 #include "algo/skein/sph_skein.h"
@@ -129,34 +128,10 @@ void lyra2re_set_target ( struct work* work, double job_diff )
   work_set_target(work, job_diff / (128.0 * opt_diff_factor) );
 }

-/*
-bool lyra2re_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
-   lyra2re_wholeMatrix = _mm_malloc( i, 64 );
-
-   if ( lyra2re_wholeMatrix == NULL )
-     return false;
-
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)lyra2re_wholeMatrix, i/32 );
-#elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)lyra2re_wholeMatrix, i/16 );
-#else
-   memset( lyra2re_wholeMatrix, 0, i );
-#endif
-   return true;
-}
-*/
-
 bool register_lyra2re_algo( algo_gate_t* gate )
 {
  init_lyra2re_ctx();
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-//  gate->miner_thread_init = (void*)&lyra2re_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2re;
  gate->hash       = (void*)&lyra2re_hash;
  gate->get_max64  = (void*)&lyra2re_get_max64;
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -1,6 +1,5 @@
 #include <memory.h>

-#include "miner.h"
 #include "algo-gate-api.h"

 #include "algo/blake/sph_blake.h"
@@ -133,23 +132,13 @@ bool lyra2rev2_thread_init()
   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
   l2v2_wholeMatrix = _mm_malloc( i, 64 );

-   if ( l2v2_wholeMatrix == NULL )
-     return false;
-
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)l2v2_wholeMatrix, i/32 );
-#elif defined (__AVX__)
-   memset_zero_m128i( (__m128i*)l2v2_wholeMatrix, i/16 );
-#else
-   memset( l2v2_wholeMatrix, 0, i );
-#endif
-   return true;
+   return l2v2_wholeMatrix;
 }

 bool register_lyra2rev2_algo( algo_gate_t* gate )
 {
  init_lyra2rev2_ctx();
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
  gate->scanhash          = (void*)&scanhash_lyra2rev2;
  gate->hash              = (void*)&lyra2rev2_hash;
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -0,0 +1,128 @@
+#include "lyra2z-gate.h"
+
+#ifdef LYRA2Z_4WAY
+
+#include <memory.h>
+#include <mm_malloc.h>
+#include "lyra2.h"
+#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake-hash-4way.h"
+
+__thread uint64_t* lyra2z_4way_matrix;
+
+bool lyra2z_4way_thread_init()
+{
+ return ( lyra2z_4way_matrix = _mm_malloc( LYRA2Z_MATRIX_SIZE, 64 ) );
+}
+
+static __thread blake256_4way_context l2z_4way_blake_mid;
+
+void lyra2z_4way_midstate( const void* input )
+{
+       blake256_4way_init( &l2z_4way_blake_mid );
+       blake256_4way( &l2z_4way_blake_mid, input, 64 );
+}
+
+// Hash four 4x32 interleaved headers, resuming blake from the saved midstate
+// of their first 64 bytes; state gets the four 32 byte hashes in lane order.
+void lyra2z_4way_hash( void *state, const void *input )
+{
+     uint32_t hash0[8] __attribute__ ((aligned (64)));
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
+
+     memcpy( &ctx_blake, &l2z_4way_blake_mid, sizeof l2z_4way_blake_mid );
+     blake256_4way( &ctx_blake, (const uint8_t*)input + (64*4), 16 );
+     blake256_4way_close( &ctx_blake, vhash );
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+
+     LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 8, 8, 8 );
+     LYRA2Z( lyra2z_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 8, 8, 8 );
+     memcpy( state, hash0, 32 );
+     memcpy( (uint8_t*)state + 32, hash1, 32 );
+     memcpy( (uint8_t*)state + 64, hash2, 32 );
+     memcpy( (uint8_t*)state + 96, hash3, 32 );
+}
+
+// Scan nonces four per pass, one in each lane of the 4x32 interleaved header.
+// Returns the number of lanes that met the target on the final pass; hits are
+// flagged in work->nfound[] with their nonces in work->nonces[], and
+// *hashes_done is set to n - first_nonce + 1 for the last n reached.
+int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76; // 19*4: word 19 of lanes 0..3
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+
+   // Encode all 20 words: the interleave below is given 640 (20 x 32), so it
+   // also reads edata[19]. Its lanes are overwritten with nonces every pass.
+   for ( int i=0; i < 19; i++ )
+      be32enc( &edata[i], pdata[i] );
+   be32enc( &edata[19], n );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   lyra2z_4way_midstate( vdata );
+
+   do {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );
+      lyra2z_4way_hash( hash, vdata );
+
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      {
+          found[0] = true;
+          num_found++;
+          nonces[0] = pdata[19] = n;
+          work_set_target_ratio( work, hash );
+      }
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      {
+          found[1] = true;
+          num_found++;
+          nonces[1] = n+1;
+          work_set_target_ratio( work, hash+8 );
+      }
+      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+      {
+          found[2] = true;
+          num_found++;
+          nonces[2] = n+2;
+          work_set_target_ratio( work, hash+16 );
+      }
+      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+      {
+          found[3] = true;
+          num_found++;
+          nonces[3] = n+3;
+          work_set_target_ratio( work, hash+24 );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+   if ( num_found == 0 ) pdata[19] = n;  // on a miss, store next nonce to try
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -0,0 +1,25 @@
+#include "lyra2z-gate.h"
+#include "lyra2.h"
+
+void lyra2z_set_target( struct work* work, double job_diff )
+{
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
+}
+
+bool register_lyra2z_algo( algo_gate_t* gate )
+{
+#ifdef LYRA2Z_4WAY
+  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
+  gate->hash       = (void*)&lyra2z_4way_hash;
+#else
+  gate->miner_thread_init = (void*)&lyra2z_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2z;
+  gate->hash       = (void*)&lyra2z_hash;
+#endif
+  gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  gate->set_target = (void*)&lyra2z_set_target;
+  return true;
+};
+
--- a/algo/lyra2/lyra2z-gate.h
+++ b/algo/lyra2/lyra2z-gate.h
@@ -0,0 +1,33 @@
+#ifndef LYRA2Z_GATE_H__
+#define LYRA2Z_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(HASH_4WAY)
+  #define LYRA2Z_4WAY
+#endif
+
+// Lyra2z matrix allocation size: BLOCK_LEN_INT64 * 8 * 8 * 8, kept in parens.
+#define LYRA2Z_MATRIX_SIZE  ( BLOCK_LEN_INT64 * 8 * 8 * 8 )
+
+#if defined(LYRA2Z_4WAY)
+
+void lyra2z_4way_hash( void *state, const void *input );
+
+int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+bool lyra2z_4way_thread_init();
+
+#endif
+
+void lyra2z_hash( void *state, const void *input );
+
+int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+bool lyra2z_thread_init();
+
+#endif
+
--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -1,41 +1,49 @@
 #include <memory.h>
 #include <mm_malloc.h>
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "lyra2z-gate.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
 #include "avxdefs.h"

-__thread uint64_t* zcoin_wholeMatrix;
+__thread uint64_t* lyra2z_matrix;

-static __thread sph_blake256_context zcoin_blake_mid;
-
-
-void zcoin_midstate( const void* input )
+bool lyra2z_thread_init()
 {
-       sph_blake256_init( &zcoin_blake_mid );
-       sph_blake256( &zcoin_blake_mid, input, 64 );
+//   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+//   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
+   const int i = BLOCK_LEN_INT64 * 8 * 8 * 8;
+   lyra2z_matrix = _mm_malloc( i, 64 );
+   return lyra2z_matrix;
+}
+
+static __thread sph_blake256_context lyra2z_blake_mid;
+
+void lyra2z_midstate( const void* input )
+{
+       sph_blake256_init( &lyra2z_blake_mid );
+       sph_blake256( &lyra2z_blake_mid, input, 64 );
 }

 // block 2050 new algo, blake plus new lyra parms. new input
 // is power of 2 so normal lyra can be used
 //void zcoin_hash(void *state, const void *input, uint32_t height)
-void zcoin_hash(void *state, const void *input )
+void lyra2z_hash( void *state, const void *input )
 {
        uint32_t _ALIGN(64) hash[16];

        sph_blake256_context ctx_blake __attribute__ ((aligned (64)));

-        memcpy( &ctx_blake, &zcoin_blake_mid, sizeof zcoin_blake_mid );
+        memcpy( &ctx_blake, &lyra2z_blake_mid, sizeof lyra2z_blake_mid );
        sph_blake256( &ctx_blake, input + 64, 16 );
        sph_blake256_close( &ctx_blake, hash );

-        LYRA2Z( zcoin_wholeMatrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);
+        LYRA2Z( lyra2z_matrix, hash, 32, hash, 32, hash, 32, 8, 8, 8);

    memcpy(state, hash, 32);
 }

-int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
+int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done )
 {
 	uint32_t _ALIGN(64) hash[8];
@@ -53,11 +61,11 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}

-        zcoin_midstate( endiandata );
+        lyra2z_midstate( endiandata );

 	do {
 		be32enc(&endiandata[19], nonce);
-                zcoin_hash( hash, endiandata );
+                lyra2z_hash( hash, endiandata );

 		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			work_set_target_ratio(work, hash);
@@ -74,50 +82,41 @@ int scanhash_zcoin( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

+/*
 //int64_t get_max64_0xffffLL() { return 0xffffLL; };

-void zcoin_set_target( struct work* work, double job_diff )
+void lyra2z_set_target( struct work* work, double job_diff )
 {
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }
-/*
+
 bool zcoin_get_work_height( struct work* work, struct stratum_ctx* sctx )
 {
   work->height = sctx->bloc_height;
   return false;
 }
-*/

-bool zcoin_thread_init()
+
+bool lyra2z_thread_init()
 {
   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 8; // nCols
   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

   int i = (int64_t)ROW_LEN_BYTES * 8; // nRows;
-   zcoin_wholeMatrix = _mm_malloc( i, 64 );
+   lyra2z_wholeMatrix = _mm_malloc( i, 64 );

-   if ( zcoin_wholeMatrix == NULL )
-     return false;
-
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)zcoin_wholeMatrix, i/32 );
-#elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)zcoin_wholeMatrix, i/16 );
-#else
-   memset( zcoin_wholeMatrix, 0, i );
-#endif
-   return true;
+   return lyra2z_wholeMatrix;
 }

-bool register_zcoin_algo( algo_gate_t* gate )
+bool register_lyra2z_algo( algo_gate_t* gate )
 {
  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
-  gate->miner_thread_init = (void*)&zcoin_thread_init;
-  gate->scanhash   = (void*)&scanhash_zcoin;
-  gate->hash       = (void*)&zcoin_hash;
+  gate->miner_thread_init = (void*)&lyra2z_thread_init;
+  gate->scanhash   = (void*)&scanhash_lyra2z;
+  gate->hash       = (void*)&lyra2z_hash;
  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&zcoin_set_target;
+  gate->set_target = (void*)&lyra2z_set_target;
 //  gate->prevent_dupes = (void*)&zcoin_get_work_height;
  return true;
 };
-
+*/
--- a/algo/lyra2/lyra2z330.c
+++ b/algo/lyra2/lyra2z330.c
@@ -1,5 +1,4 @@
 #include <memory.h>
-#include "miner.h"
 #include "algo-gate-api.h"
 #include "lyra2.h"
 #include "avxdefs.h"
@@ -65,22 +64,12 @@ bool lyra2z330_thread_init()
   int i = (int64_t)ROW_LEN_BYTES * 330; // nRows;
   lyra2z330_wholeMatrix = _mm_malloc( i, 64 );

-   if ( lyra2z330_wholeMatrix == NULL )
-     return false;
-
-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)lyra2z330_wholeMatrix, i/32 );
-#elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)lyra2z330_wholeMatrix, i/16 );
-#else
-   memset( lyra2z330_wholeMatrix, 0, i );
-#endif
-   return true;
+   return lyra2z330_wholeMatrix;
 }

 bool register_lyra2z330_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
  gate->miner_thread_init = (void*)&lyra2z330_thread_init;
  gate->scanhash   = (void*)&scanhash_lyra2z330;
  gate->hash       = (void*)&lyra2z330_hash;
--- a/algo/lyra2/sponge.c
+++ b/algo/lyra2/sponge.c
@@ -130,12 +130,12 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
    //Squeezes full blocks
    for ( i = 0; i < fullBlocks; i++ )
    {
-       memcpy_m256i( out, state, BLOCK_LEN_M256I );
+       memcpy_256( out, state, BLOCK_LEN_M256I );
       LYRA_ROUND_AVX2( state[0], state[1], state[2], state[3] );
       out += BLOCK_LEN_M256I;
    }
    //Squeezes remaining bytes
-    memcpy_m256i( out, state, ( len_m256i % BLOCK_LEN_M256I ) );
+    memcpy_256( out, state, ( len_m256i % BLOCK_LEN_M256I ) );

 #elif defined (__AVX__)

@@ -148,13 +148,13 @@ inline void squeeze( uint64_t *State, byte *Out, unsigned int len )
    //Squeezes full blocks
    for ( i = 0; i < fullBlocks; i++ )
    {
-       memcpy_m128i( out, state, BLOCK_LEN_M128I );
+       memcpy_128( out, state, BLOCK_LEN_M128I );
       LYRA_ROUND_AVX( state[0], state[1], state[2], state[3],
                       state[4], state[5], state[6], state[7] );
       out += BLOCK_LEN_M128I;
    }
    //Squeezes remaining bytes
-    memcpy_m128i( out, state, ( len_m128i % BLOCK_LEN_M128I ) );
+    memcpy_128( out, state, ( len_m128i % BLOCK_LEN_M128I ) );

 #else

--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -65,13 +65,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotl256_1x64( s1); \
-   s2 = mm256_swap128( s2 ); \
-   s3 = mm256_rotr256_1x64( s3 ); \
+   s1 = mm256_rotr256_1x64( s1); \
+   s2 = mm256_swap_128( s2 ); \
+   s3 = mm256_rotl256_1x64( s3 ); \
   G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rotr256_1x64( s1 ); \
-   s2 = mm256_swap128( s2 ); \
-   s3 = mm256_rotl256_1x64( s3 );
+   s1 = mm256_rotl256_1x64( s1 ); \
+   s2 = mm256_swap_128( s2 ); \
+   s3 = mm256_rotr256_1x64( s3 );

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -105,14 +105,14 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_rotl256_1x64( s2, s3 ); \
-   mm128_swap128( s4, s5 ); \
-   mm128_rotr256_1x64( s6, s7 ); \
+   mm_rotl256_1x64( s2, s3 ); \
+   mm_swap_128( s4, s5 ); \
+   mm_rotr256_1x64( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm128_rotr256_1x64( s2, s3 ); \
-   mm128_swap128( s4, s5 ); \
-   mm128_rotl256_1x64( s6, s7 );
+   mm_rotr256_1x64( s2, s3 ); \
+   mm_swap_128( s4, s5 ); \
+   mm_rotl256_1x64( s6, s7 );

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
--- a/algo/m7m.c
+++ b/algo/m7m.c
@@ -1,5 +1,4 @@
 #include "cpuminer-config.h"
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <gmp.h>
@@ -14,9 +13,7 @@
 #include "algo/tiger/sph_tiger.h"
 #include "algo/whirlpool/sph_whirlpool.h"
 #include "algo/ripemd/sph_ripemd.h"
-#if defined __SHA__
- #include <openssl/sha.h>
-#endif
+#include <openssl/sha.h>


 #define EPSa DBL_EPSILON
@@ -120,12 +117,13 @@ uint32_t sw2_(int nnounce)
 }

 typedef struct {
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_CTX               sha256;
+    SHA512_CTX               sha512;
 #else
    sph_sha256_context       sha256;
-#endif
    sph_sha512_context       sha512;
+#endif
    sph_keccak512_context    keccak;
    sph_whirlpool_context    whirlpool;
    sph_haval256_5_context   haval;
@@ -137,12 +135,13 @@ m7m_ctx_holder m7m_ctx;

 void init_m7m_ctx()
 {
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_Init( &m7m_ctx.sha256 );
+    SHA512_Init( &m7m_ctx.sha512 );
 #else
    sph_sha256_init( &m7m_ctx.sha256 );
-#endif
    sph_sha512_init( &m7m_ctx.sha512 );
+#endif
    sph_keccak512_init( &m7m_ctx.keccak );
    sph_whirlpool_init( &m7m_ctx.whirlpool );
    sph_haval256_5_init( &m7m_ctx.haval );
@@ -177,7 +176,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

    m7m_ctx_holder ctx1, ctx2 __attribute__ ((aligned (64)));
    memcpy( &ctx1, &m7m_ctx, sizeof(m7m_ctx) );
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_CTX         ctxf_sha256;
 #else
    sph_sha256_context ctxf_sha256;
@@ -185,18 +184,20 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

    memcpy(data, pdata, 80);

-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_Update(  &ctx1.sha256,    data, M7_MIDSTATE_LEN );
+    SHA512_Update(  &ctx1.sha512,    data, M7_MIDSTATE_LEN );
 #else
    sph_sha256(     &ctx1.sha256,    data, M7_MIDSTATE_LEN );
-#endif
    sph_sha512(     &ctx1.sha512,    data, M7_MIDSTATE_LEN );
+#endif
    sph_keccak512(  &ctx1.keccak,    data, M7_MIDSTATE_LEN );
    sph_whirlpool(  &ctx1.whirlpool, data, M7_MIDSTATE_LEN );
    sph_haval256_5( &ctx1.haval,     data, M7_MIDSTATE_LEN );
    sph_tiger(      &ctx1.tiger,     data, M7_MIDSTATE_LEN );
    sph_ripemd160(  &ctx1.ripemd,    data, M7_MIDSTATE_LEN );

+// the following calculations can be performed once and the results shared
    mpz_t magipi, magisw, product, bns0, bns1;
    mpf_t magifpi, magifpi0, mpt1, mpt2, mptmp, mpten;
    
@@ -221,16 +222,22 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

        memcpy( &ctx2, &ctx1, sizeof(m7m_ctx) );

-#if defined __SHA__
+// with 4 way can a single midstate be shared among lanes?
+// do a single round of midstate and interleave for the final
+
+#ifndef USE_SPH_SHA
        SHA256_Update(  &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
        SHA256_Final( (unsigned char*) (bhash[0]), &ctx2.sha256 );
+
+        SHA512_Update(  &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
+        SHA512_Final( (unsigned char*) (bhash[1]), &ctx2.sha512 );
 #else
        sph_sha256( &ctx2.sha256, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_sha256_close( &ctx2.sha256, (void*)(bhash[0]) );
-#endif
+
        sph_sha512( &ctx2.sha512, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_sha512_close( &ctx2.sha512, (void*)(bhash[1]) );
-
+#endif
        sph_keccak512( &ctx2.keccak, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_keccak512_close( &ctx2.keccak, (void*)(bhash[2]) );

@@ -246,6 +253,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_ripemd160( &ctx2.ripemd, data_p64, 80 - M7_MIDSTATE_LEN );
        sph_ripemd160_close( &ctx2.ripemd, (void*)(bhash[6]) );

+// 4 way serial
 	mpz_import(bns0, a, -1, p, -1, 0, bhash[0]);
        mpz_set(bns1, bns0);
 	mpz_set(product, bns0);
@@ -261,7 +269,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        bytes = mpz_sizeinbase(product, 256);
        mpz_export((void *)bdata, NULL, -1, 1, 0, 0, product);

-#if defined __SHA__
+#ifndef USE_SPH_SHA
        SHA256_Init( &ctxf_sha256 );
        SHA256_Update(  &ctxf_sha256, bdata, bytes );
        SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
@@ -271,6 +279,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
        sph_sha256_close( &ctxf_sha256, (void*)(hash) );
 #endif

+// do once and share
        digits=(int)((sqrt((double)(n/2))*(1.+EPS))/9000+75);
        mp_bitcnt_t prec = (long int)(digits*BITS_PER_DIGIT+16);
 	mpf_set_prec_raw(magifpi, prec);
@@ -293,7 +302,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
 	    mpz_set_f(magipi, magifpi);
            mpz_add(magipi,magipi,magisw);
            mpz_add(product,product,magipi);
-			
+// share magipi, product and do serial			
 	    mpz_import(bns0, b, -1, p, -1, 0, (void*)(hash));
            mpz_add(bns1, bns1, bns0);
            mpz_mul(product,product,bns1);
@@ -303,7 +312,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
            mpzscale=bytes;
            mpz_export(bdata, NULL, -1, 1, 0, 0, product);

-#if defined __SHA__
+#ifndef USE_SPH_SHA
            SHA256_Init( &ctxf_sha256 );
            SHA256_Update(  &ctxf_sha256, bdata, bytes );
            SHA256_Final( (unsigned char*) hash, &ctxf_sha256 );
@@ -314,6 +323,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,
 #endif
 	}

+// this is the scanhash part
 	const unsigned char *hash_ = (const unsigned char *)hash;
 	const unsigned char *target_ = (const unsigned char *)ptarget;
 	for ( i = 31; i >= 0; i-- )
@@ -343,6 +353,7 @@ int scanhash_m7m_hash( int thr_id, struct work* work,

     pdata[19] = n;

+// do this in hashm7m
 out:
     mpf_set_prec_raw(magifpi, prec0);
     mpf_set_prec_raw(magifpi0, prec0);
@@ -361,21 +372,17 @@ out:
    return rc;
 }

-void m7m_reverse_endian( struct work *work )
-{
-   swab32_array( work->data, work->data, 20 );
-}
-
 bool register_m7m_algo( algo_gate_t *gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | SHA_OPT;
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
  init_m7m_ctx();
  gate->scanhash              = (void*)scanhash_m7m_hash;
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->set_target            = (void*)&scrypt_set_target;
  gate->get_max64             = (void*)&get_max64_0x1ffff;
-  gate->set_work_data_endian  = (void*)&m7m_reverse_endian;
-  gate->work_data_size        = 80;
+  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
  return true;
 }

--- a/algo/neoscrypt/neoscrypt.c
+++ b/algo/neoscrypt/neoscrypt.c
@@ -31,7 +31,6 @@
 #include <string.h>
 #include <unistd.h>

-#include "miner.h"
 #include "algo-gate-api.h"

 #define USE_CUSTOM_BLAKE2S
@@ -1089,7 +1088,9 @@ bool register_neoscrypt_algo( algo_gate_t* gate )
  gate->set_target            = (void*)&scrypt_set_target;
  gate->wait_for_diff         = (void*)&neoscrypt_wait_for_diff;
  gate->build_stratum_request = (void*)&std_be_build_stratum_request;
-  gate->set_work_data_endian  = (void*)&swab_work_data;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
+  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
  gate->work_data_size        = 80;
  return true;
 };
--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -0,0 +1,180 @@
+#include "nist5-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(NIST5_4WAY)
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+// no improvement with midstate
+//static __thread blake512_4way_context ctx_mid;
+
+// Hash four interleaved 80-byte inputs through the NIST5 chain:
+// blake512 -> groestl -> jh512 -> keccak512 -> skein512.
+//
+// input: four 80-byte lanes interleaved 4x64 (see the mm256_interleave_4x64
+//        call in scanhash_nist5_4way).
+// out:   receives the first 256 bits of each lane's final hash, lane i at
+//        byte offset 32*i (128 bytes in total).
+void nist5hash_4way( void *out, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake512_4way_context  ctx_blake;
+     hashState_groestl      ctx_groestl;
+     jh512_4way_context     ctx_jh;
+     skein512_4way_context  ctx_skein;
+     keccak512_4way_context ctx_keccak;
+
+//     memcpy( &ctx_blake, &ctx_mid, sizeof(ctx_mid) );
+//     blake512_4way( &ctx_blake, input + (64<<2), 16 );
+
+     blake512_4way_init( &ctx_blake );
+     blake512_4way( &ctx_blake, input, 80 );
+     blake512_4way_close( &ctx_blake, vhash );
+
+     // groestl runs on one lane at a time, so split the lanes apart here
+     // and re-interleave them once all four are done.
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash0,
+                               (const char*)hash0, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash1,
+                               (const char*)hash1, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash2,
+                               (const char*)hash2, 512 );
+     init_groestl( &ctx_groestl, 64 );
+     update_and_final_groestl( &ctx_groestl, (char*)hash3,
+                               (const char*)hash3, 512 );
+
+     mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 );
+
+     jh512_4way_init( &ctx_jh );
+     jh512_4way( &ctx_jh, vhash, 64 );
+     jh512_4way_close( &ctx_jh, vhash );
+
+     keccak512_4way_init( &ctx_keccak );
+     keccak512_4way( &ctx_keccak, vhash, 64 );
+     keccak512_4way_close( &ctx_keccak, vhash );
+
+     skein512_4way_init( &ctx_skein );
+     skein512_4way( &ctx_skein, vhash, 64 );
+     skein512_4way_close( &ctx_skein, vhash );
+
+     // NOTE: out+N on a void* relies on the GNU C extension that treats
+     // void pointer arithmetic as byte-sized.
+     mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
+}
+
+// Scan nonces four at a time, starting from work->data[19].
+//
+// Each pass hashes nonces n..n+3 in parallel. Scanning stops after the first
+// pass in which at least one lane meets work->target, when n reaches
+// max_nonce, or when work_restart[thr_id].restart is set.
+//
+// For every winning lane, work->nfound[lane] is set and the nonce is stored
+// in work->nonces[lane]. *hashes_done receives the number of nonces hashed.
+// Returns the number of winning lanes in the final pass.
+int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done)
+{
+     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+     uint32_t endiandata[20] __attribute__((aligned(64)));
+     uint32_t *pdata = work->data;
+     uint32_t *ptarget = work->target;
+     uint32_t n = pdata[19];
+     const uint32_t first_nonce = pdata[19];
+     const uint32_t Htarg = ptarget[7];
+     uint32_t *nonces = work->nonces;
+     bool *found = work->nfound;
+     int num_found = 0;
+     // The nonce (32-bit word 19) is the upper half of 64-bit word 9; with
+     // four lanes interleaved it lands at 32-bit index 9*8 + 1 + 2*lane.
+     uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+     uint32_t *noncep1 = vdata + 75;
+     uint32_t *noncep2 = vdata + 77;
+     uint32_t *noncep3 = vdata + 79;
+
+     // Pre-filter selection: the first htmax[m] >= Htarg picks masks[m], a
+     // cheap test on hash word 7 applied before the full target comparison.
+     uint64_t htmax[] = {          0,
+                                 0xF,
+                                0xFF,
+                               0xFFF,
+                              0xFFFF,
+                          0x10000000 };
+
+     uint32_t masks[] = { 0xFFFFFFFF,
+                          0xFFFFFFF0,
+                          0xFFFFFF00,
+                          0xFFFFF000,
+                          0xFFFF0000,
+                                   0 };
+
+     // we need bigendian data...
+     swab32_array( endiandata, pdata, 20 );
+
+     uint64_t *edata = (uint64_t*)endiandata;
+     mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+     // precalc midstate
+//     blake512_4way_init( &ctx_mid );
+//     blake512_4way( &ctx_mid, vdata, 64 );
+
+     for ( int m=0; m < 6; m++ )
+     {
+        if (Htarg <= htmax[m])
+        {
+           uint32_t mask = masks[m];
+
+           do {
+              found[0] = found[1] = found[2] = found[3] = false;
+              be32enc( noncep0, n   );
+              be32enc( noncep1, n+1 );
+              be32enc( noncep2, n+2 );
+              be32enc( noncep3, n+3 );
+
+              nist5hash_4way( hash, vdata );
+
+              pdata[19] = n;
+
+              for ( int lane = 0; lane < 4; lane++ )
+              {
+                 uint32_t *lane_hash = hash + 8*lane;
+
+                 if ( ( !(lane_hash[7] & mask) )
+                      && fulltest( lane_hash, ptarget ) )
+                 {
+                    found[lane] = true;
+                    num_found++;
+                    nonces[lane] = n + lane;
+                    work_set_target_ratio( work, lane_hash );
+                 }
+              }
+              n += 4;
+           } while ( ( num_found == 0 ) && ( n < max_nonce )
+                     && !work_restart[thr_id].restart );
+           break;
+        }
+     }
+
+     // n started at first_nonce and advanced by 4 per pass of 4 hashes, so
+     // the difference is exactly the number of nonces hashed.
+     *hashes_done = n - first_nonce;
+     return num_found;
+}
+
+#endif
--- a/algo/nist5/nist5-gate.c
+++ b/algo/nist5/nist5-gate.c
@@ -0,0 +1,20 @@
+#include "nist5-gate.h"
+
+// Register the NIST5 algorithm with the algo gate.
+// Installs the 4-way scanhash/hash pair when NIST5_4WAY is defined;
+// otherwise calls init_nist5_ctx() and installs the scalar pair.
+// Always returns true.
+bool register_nist5_algo( algo_gate_t* gate )
+{
+    gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT;
+#if defined (NIST5_4WAY)
+    gate->scanhash = (void*)&scanhash_nist5_4way;
+    gate->hash     = (void*)&nist5hash_4way;
+#else
+    init_nist5_ctx();
+    gate->scanhash = (void*)&scanhash_nist5;
+    gate->hash     = (void*)&nist5hash;
+#endif
+    return true;
+}
+
--- a/algo/nist5/nist5-gate.h
+++ b/algo/nist5/nist5-gate.h
@@ -0,0 +1,32 @@
+#ifndef __NIST5_GATE_H__
+#define __NIST5_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// The 4-way implementation is selected only when HASH_4WAY is defined and
+// the compiler defines __AES__.
+#if defined(HASH_4WAY) && defined(__AES__)
+  #define NIST5_4WAY
+#endif
+
+#if defined(NIST5_4WAY)
+
+void nist5hash_4way( void *state, const void *input );
+
+int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#else
+
+void nist5hash( void *state, const void *input );
+
+int scanhash_nist5( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+// Called by register_nist5_algo() in the non-4-way build. The definition is
+// expected in nist5.c -- NOTE(review): confirm it is non-static, returns void.
+void init_nist5_ctx( void );
+#endif
+
+#endif
--- a/algo/nist5/nist5.c
+++ b/algo/nist5/nist5.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "nist5-gate.h"

 #include <stdlib.h>
 #include <stdint.h>
@@ -148,7 +147,7 @@ int scanhash_nist5(int thr_id, struct work *work,
 	pdata[19] = n;
 	return 0;
 }
-
+/*
 bool register_nist5_algo( algo_gate_t* gate )
 {
    gate->optimizations = SSE2_OPT | AES_OPT;
@@ -157,4 +156,4 @@ bool register_nist5_algo( algo_gate_t* gate )
    gate->hash     = (void*)&nist5hash;
    return true;
 };
-
+*/
--- a/algo/nist5/zr5.c
+++ b/algo/nist5/zr5.c
@@ -27,7 +27,6 @@
 */

 #include "cpuminer-config.h"
-#include "miner.h"
 #include "algo-gate-api.h"
 #include <string.h>
 #include <stdint.h>
@@ -229,7 +228,9 @@ bool register_zr5_algo( algo_gate_t* gate )
    gate->get_max64             = (void*)&zr5_get_max64;
    gate->display_extra_data    = (void*)&zr5_display_pok;
    gate->build_stratum_request = (void*)&std_be_build_stratum_request;
-    gate->set_work_data_endian  = (void*)&swab_work_data;
+    gate->work_decode           = (void*)&std_be_work_decode;
+    gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
+    gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
    gate->work_data_size        = 80;
    gate->work_cmp_size         = 72;
    return true;
--- a/algo/pluck.c
+++ b/algo/pluck.c
@@ -25,7 +25,6 @@
 */

 #include "cpuminer-config.h"
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
--- a/algo/quark/quark.c
+++ b/algo/quark/quark.c
@@ -1,5 +1,4 @@
 #include "cpuminer-config.h"
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -1,32 +1,20 @@
-#include "miner.h"
 #include "algo-gate-api.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-
 #include "algo/luffa/sse2/luffa_for_sse2.h" 
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
-#include "algo/simd/sse2/nist.h"
-#include "algo/shavite/sph_shavite.h"
-
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
+#else
+#include "algo/echo/sph_echo.h"
 #endif

 typedef struct
 {
        hashState_luffa         luffa;
        cubehashParam           cubehash;
-        sph_shavite512_context  shavite;
-        hashState_sd            simd;
 #ifdef NO_AES_NI
        sph_echo512_context echo;
 #else
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -1,24 +1,16 @@
-#include "miner.h"
 #include "algo-gate-api.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
-#include "algo/luffa/sph_luffa.h"
-#include "algo/cubehash/sph_cubehash.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-#include "algo/echo/sph_echo.h"
-
 #include "algo/luffa/sse2/luffa_for_sse2.h" 
 #include "algo/cubehash/sse2/cubehash_sse2.h" 
 #include "algo/simd/sse2/nist.h"
 #include "algo/shavite/sph_shavite.h"
-
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
+#else
+#include "algo/echo/sph_echo.h"
 #endif

 typedef struct
--- a/algo/s3.c
+++ b/algo/s3.c
@@ -1,116 +0,0 @@
-#include "miner.h"
-#include "algo-gate-api.h"
-
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-
-#include "algo/skein/sph_skein.h"
-#include "algo/shavite/sph_shavite.h"
-#include "algo/simd/sph_simd.h"
-
-void s3hash(void *output, const void *input)
-{
-
- 	sph_shavite512_context ctx_shavite;
-	sph_simd512_context ctx_simd;
-	sph_skein512_context ctx_skein;
-
-	unsigned char _ALIGN(128) hash[64];
-
-	sph_shavite512_init(&ctx_shavite);
-	sph_shavite512(&ctx_shavite, input, 80);
-	sph_shavite512_close(&ctx_shavite, (void*)hash);
-
-	sph_simd512_init(&ctx_simd);
-	sph_simd512(&ctx_simd, (const void*)hash, 64);
-	sph_simd512_close(&ctx_simd, (void*)hash);
-
-	sph_skein512_init(&ctx_skein);
-	sph_skein512(&ctx_skein, (const void*)hash, 64);
-	sph_skein512_close(&ctx_skein, (void*)hash);
-
-	memcpy(output, hash, 32);
-
-}
-
-int scanhash_s3(int thr_id, struct work *work, uint32_t max_nonce,
-                      uint64_t *hashes_done)
-{
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-	uint32_t n = pdata[19] - 1;
-	const uint32_t first_nonce = pdata[19];
-	const uint32_t Htarg = ptarget[7];
-
-	uint32_t _ALIGN(32) hash64[8];
-	uint32_t endiandata[32];
-
-
-	uint64_t htmax[] = {
-		0,
-		0xF,
-		0xFF,
-		0xFFF,
-		0xFFFF,
-		0x10000000
-	};
-	uint32_t masks[] = {
-		0xFFFFFFFF,
-		0xFFFFFFF0,
-		0xFFFFFF00,
-		0xFFFFF000,
-		0xFFFF0000,
-		0
-	};
-
-	// we need bigendian data...
-	for (int kk=0; kk < 32; kk++) {
-		be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
-	};
-#ifdef DEBUG_ALGO
-	printf("[%d] Htarg=%X\n", thr_id, Htarg);
-#endif
-	for (int m=0; m < 6; m++) {
-		if (Htarg <= htmax[m]) {
-			uint32_t mask = masks[m];
-			do {
-				pdata[19] = ++n;
-				be32enc(&endiandata[19], n);
-				s3hash(hash64, endiandata);
-#ifndef DEBUG_ALGO
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
-					*hashes_done = n - first_nonce + 1;
-					return true;
-				}
-#else
-				if (!(n % 0x1000) && !thr_id) printf(".");
-				if (!(hash64[7] & mask)) {
-					printf("[%d]",thr_id);
-					if (fulltest(hash64, ptarget)) {
-						*hashes_done = n - first_nonce + 1;
-						return true;
-					}
-				}
-#endif
-			} while (n < max_nonce && !work_restart[thr_id].restart);
-			// see blake.c if else to understand the loop on htmax => mask
-			break;
-		}
-	}
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[19] = n;
-	return 0;
-}
-
-bool register_s3_algo( algo_gate_t* gate )
-{
-    algo_not_tested();
-    gate->scanhash = (void*)&scanhash_s3;
-    gate->hash     = (void*)&s3hash;
-//  gate->get_max64 = &s3_get_max64;
-    return true;
-};
-
--- a/algo/scrypt.c
+++ b/algo/scrypt.c
@@ -27,7 +27,6 @@
 * online backup system.
 */

-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
@@ -780,7 +779,7 @@ bool register_scrypt_algo( algo_gate_t* gate )
 {
  gate->miner_thread_init =(void*)&scrypt_miner_thread_init;
  gate->scanhash         = (void*)&scanhash_scrypt;
-  gate->hash             = (void*)&scrypt_1024_1_1_256_24way;
+//  gate->hash             = (void*)&scrypt_1024_1_1_256_24way;
  gate->set_target       = (void*)&scrypt_set_target;
  gate->get_max64        = (void*)&scrypt_get_max64;

--- a/algo/scryptjane/scrypt-jane.c
+++ b/algo/scryptjane/scrypt-jane.c
@@ -1,5 +1,3 @@
-#include "miner.h"
-
 #include <stdlib.h>
 #include <string.h>
 #include "inttypes.h"
--- a/algo/groestl/aes_ni/brg_types.h
+++ b/algo/groestl/aes_ni/brg_types.h
--- a/algo/sha/sha2.c
+++ b/algo/sha/sha2.c
@@ -8,7 +8,6 @@
 * any later version.  See COPYING for more details.
 */

-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
--- a/algo/sha/sha256t.c
+++ b/algo/sha/sha256t.c
@@ -1,16 +1,13 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include <stdio.h>
-
 #include "sph_sha2.h"
+#include <openssl/sha.h>

-#if defined __SHA__
- #include <openssl/sha.h>
-
+#ifndef USE_SPH_SHA
 static SHA256_CTX sha256t_ctx __attribute__ ((aligned (64)));
 static __thread SHA256_CTX sha256t_mid  __attribute__ ((aligned (64)));
 #else
@@ -21,7 +18,7 @@
 void sha256t_midstate( const void* input )
 {
    memcpy( &sha256t_mid, &sha256t_ctx, sizeof sha256t_mid );
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_Update( &sha256t_mid, input, 64 );
 #else
    sph_sha256( &sha256t_mid, input, 64 );
@@ -34,20 +31,20 @@ void sha256t_hash(void* output, const void* input,  uint32_t len)
        const int midlen = 64;            // bytes
        const int tail   = 80 - midlen;   // 16

-#if defined __SHA__
+#ifndef USE_SPH_SHA 
        SHA256_CTX ctx_sha256 __attribute__ ((aligned (64)));
        memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );

        SHA256_Update( &ctx_sha256, input + midlen, tail );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );

        memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
        SHA256_Update( &ctx_sha256, hashA, 32 );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );

        memcpy( &ctx_sha256, &sha256t_ctx, sizeof sha256t_ctx );
        SHA256_Update( &ctx_sha256, hashA, 32 );
-        SHA256_Final( hashA, &ctx_sha256 );
+        SHA256_Final( (unsigned char*)hashA, &ctx_sha256 );
 #else
        sph_sha256_context ctx_sha256 __attribute__ ((aligned (64)));
        memcpy( &ctx_sha256, &sha256t_mid, sizeof sha256t_mid );
@@ -150,12 +147,12 @@ void sha256t_set_target( struct work* work, double job_diff )

 bool register_sha256t_algo( algo_gate_t* gate )
 {
-#if defined __SHA__
+#ifndef USE_SPH_SHA
    SHA256_Init( &sha256t_ctx );
 #else
    sph_sha256_init( &sha256t_ctx );
 #endif
-    gate->optimizations = SSE2_OPT | SHA_OPT;
+    gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t;
    gate->hash       = (void*)&sha256t_hash;
    gate->set_target = (void*)&sha256t_set_target;
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -0,0 +1,618 @@
+/* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
+/*
+ * Shabal implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#ifdef __AVX2__
+
+#include "shabal-hash-4way.h"
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * Part of this code was automatically generated (the part between
+ * the "BEGIN" and "END" markers).
+ */
+
+#define sM    16
+
+#define C32   SPH_C32
+#define T32   SPH_T32
+
+#define O1   13
+#define O2    9
+#define O3    6
+
+/*
+ * We copy the state into local variables, so that the compiler knows
+ * that it can optimize them at will.
+ */
+
+/* BEGIN -- automatically generated code. */
+
+#define DECL_STATE   \
+	__m128i A00, A01, A02, A03, A04, A05, A06, A07, \
+	        A08, A09, A0A, A0B; \
+	__m128i B0, B1, B2, B3, B4, B5, B6, B7, \
+	        B8, B9, BA, BB, BC, BD, BE, BF; \
+	__m128i C0, C1, C2, C3, C4, C5, C6, C7, \
+	        C8, C9, CA, CB, CC, CD, CE, CF; \
+	__m128i M0, M1, M2, M3, M4, M5, M6, M7, \
+	        M8, M9, MA, MB, MC, MD, ME, MF; \
+	sph_u32 Wlow, Whigh;
+
+#define READ_STATE(state)   do { \
+		A00 = (state)->A[0]; \
+		A01 = (state)->A[1]; \
+		A02 = (state)->A[2]; \
+		A03 = (state)->A[3]; \
+		A04 = (state)->A[4]; \
+		A05 = (state)->A[5]; \
+		A06 = (state)->A[6]; \
+		A07 = (state)->A[7]; \
+		A08 = (state)->A[8]; \
+		A09 = (state)->A[9]; \
+		A0A = (state)->A[10]; \
+		A0B = (state)->A[11]; \
+		B0 = (state)->B[0]; \
+		B1 = (state)->B[1]; \
+		B2 = (state)->B[2]; \
+		B3 = (state)->B[3]; \
+		B4 = (state)->B[4]; \
+		B5 = (state)->B[5]; \
+		B6 = (state)->B[6]; \
+		B7 = (state)->B[7]; \
+		B8 = (state)->B[8]; \
+		B9 = (state)->B[9]; \
+		BA = (state)->B[10]; \
+		BB = (state)->B[11]; \
+		BC = (state)->B[12]; \
+		BD = (state)->B[13]; \
+		BE = (state)->B[14]; \
+		BF = (state)->B[15]; \
+		C0 = (state)->C[0]; \
+		C1 = (state)->C[1]; \
+		C2 = (state)->C[2]; \
+		C3 = (state)->C[3]; \
+		C4 = (state)->C[4]; \
+		C5 = (state)->C[5]; \
+		C6 = (state)->C[6]; \
+		C7 = (state)->C[7]; \
+		C8 = (state)->C[8]; \
+		C9 = (state)->C[9]; \
+		CA = (state)->C[10]; \
+		CB = (state)->C[11]; \
+		CC = (state)->C[12]; \
+		CD = (state)->C[13]; \
+		CE = (state)->C[14]; \
+		CF = (state)->C[15]; \
+		Wlow = (state)->Wlow; \
+		Whigh = (state)->Whigh; \
+	} while (0)
+
+#define WRITE_STATE(state)   do { \
+		(state)->A[0] = A00; \
+		(state)->A[1] = A01; \
+		(state)->A[2] = A02; \
+		(state)->A[3] = A03; \
+		(state)->A[4] = A04; \
+		(state)->A[5] = A05; \
+		(state)->A[6] = A06; \
+		(state)->A[7] = A07; \
+		(state)->A[8] = A08; \
+		(state)->A[9] = A09; \
+		(state)->A[10] = A0A; \
+		(state)->A[11] = A0B; \
+		(state)->B[0] = B0; \
+		(state)->B[1] = B1; \
+		(state)->B[2] = B2; \
+		(state)->B[3] = B3; \
+		(state)->B[4] = B4; \
+		(state)->B[5] = B5; \
+		(state)->B[6] = B6; \
+		(state)->B[7] = B7; \
+		(state)->B[8] = B8; \
+		(state)->B[9] = B9; \
+		(state)->B[10] = BA; \
+		(state)->B[11] = BB; \
+		(state)->B[12] = BC; \
+		(state)->B[13] = BD; \
+		(state)->B[14] = BE; \
+		(state)->B[15] = BF; \
+		(state)->C[0] = C0; \
+		(state)->C[1] = C1; \
+		(state)->C[2] = C2; \
+		(state)->C[3] = C3; \
+		(state)->C[4] = C4; \
+		(state)->C[5] = C5; \
+		(state)->C[6] = C6; \
+		(state)->C[7] = C7; \
+		(state)->C[8] = C8; \
+		(state)->C[9] = C9; \
+		(state)->C[10] = CA; \
+		(state)->C[11] = CB; \
+		(state)->C[12] = CC; \
+		(state)->C[13] = CD; \
+		(state)->C[14] = CE; \
+		(state)->C[15] = CF; \
+		(state)->Wlow = Wlow; \
+		(state)->Whigh = Whigh; \
+	} while (0)
+
+#define DECODE_BLOCK \
+do { \
+   M0 = buf[ 0]; \
+   M1 = buf[ 1]; \
+   M2 = buf[ 2]; \
+   M3 = buf[ 3]; \
+   M4 = buf[ 4]; \
+   M5 = buf[ 5]; \
+   M6 = buf[ 6]; \
+   M7 = buf[ 7]; \
+   M8 = buf[ 8]; \
+   M9 = buf[ 9]; \
+   MA = buf[10]; \
+   MB = buf[11]; \
+   MC = buf[12]; \
+   MD = buf[13]; \
+   ME = buf[14]; \
+   MF = buf[15]; \
+} while (0)
+
+#define INPUT_BLOCK_ADD \
+do { \
+    B0 = _mm_add_epi32( B0, M0 );\
+    B1 = _mm_add_epi32( B1, M1 );\
+    B2 = _mm_add_epi32( B2, M2 );\
+    B3 = _mm_add_epi32( B3, M3 );\
+    B4 = _mm_add_epi32( B4, M4 );\
+    B5 = _mm_add_epi32( B5, M5 );\
+    B6 = _mm_add_epi32( B6, M6 );\
+    B7 = _mm_add_epi32( B7, M7 );\
+    B8 = _mm_add_epi32( B8, M8 );\
+    B9 = _mm_add_epi32( B9, M9 );\
+    BA = _mm_add_epi32( BA, MA );\
+    BB = _mm_add_epi32( BB, MB );\
+    BC = _mm_add_epi32( BC, MC );\
+    BD = _mm_add_epi32( BD, MD );\
+    BE = _mm_add_epi32( BE, ME );\
+    BF = _mm_add_epi32( BF, MF );\
+} while (0)
+
+#define INPUT_BLOCK_SUB \
+do { \
+    C0 = _mm_sub_epi32( C0, M0 ); \
+    C1 = _mm_sub_epi32( C1, M1 ); \
+    C2 = _mm_sub_epi32( C2, M2 ); \
+    C3 = _mm_sub_epi32( C3, M3 ); \
+    C4 = _mm_sub_epi32( C4, M4 ); \
+    C5 = _mm_sub_epi32( C5, M5 ); \
+    C6 = _mm_sub_epi32( C6, M6 ); \
+    C7 = _mm_sub_epi32( C7, M7 ); \
+    C8 = _mm_sub_epi32( C8, M8 ); \
+    C9 = _mm_sub_epi32( C9, M9 ); \
+    CA = _mm_sub_epi32( CA, MA ); \
+    CB = _mm_sub_epi32( CB, MB ); \
+    CC = _mm_sub_epi32( CC, MC ); \
+    CD = _mm_sub_epi32( CD, MD ); \
+    CE = _mm_sub_epi32( CE, ME ); \
+    CF = _mm_sub_epi32( CF, MF ); \
+} while (0)
+
+#define XOR_W \
+do { \
+   A00 = _mm_xor_si128( A00, _mm_set1_epi32( Wlow ) ); \
+   A01 = _mm_xor_si128( A01, _mm_set1_epi32( Whigh ) ); \
+} while (0)
+/*
+#define SWAP(v1, v2)   do { \
+		sph_u32 tmp = (v1); \
+		(v1) = (v2); \
+		(v2) = tmp; \
+	} while (0)
+*/
+#define SWAP_BC \
+do { \
+    mm_swap_128( B0, C0 ); \
+    mm_swap_128( B1, C1 ); \
+    mm_swap_128( B2, C2 ); \
+    mm_swap_128( B3, C3 ); \
+    mm_swap_128( B4, C4 ); \
+    mm_swap_128( B5, C5 ); \
+    mm_swap_128( B6, C6 ); \
+    mm_swap_128( B7, C7 ); \
+    mm_swap_128( B8, C8 ); \
+    mm_swap_128( B9, C9 ); \
+    mm_swap_128( BA, CA ); \
+    mm_swap_128( BB, CB ); \
+    mm_swap_128( BC, CC ); \
+    mm_swap_128( BD, CD ); \
+    mm_swap_128( BE, CE ); \
+    mm_swap_128( BF, CF ); \
+} while (0)
+
+#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
+do { \
+   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
+            _mm_andnot_si128( xb3, xb2 ), \
+            _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
+               _mm_mullo_epi32(  mm_rotl_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
+                   ) ), _mm_set1_epi32(3UL) ) ) ) ); \
+   xb0 = mm_not( _mm_xor_si128( xa0, mm_rotl_32( xb0, 1 ) ) ); \
+} while (0)
+
+#define PERM_STEP_0   do { \
+		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define PERM_STEP_1   do { \
+		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define PERM_STEP_2   do { \
+		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+	} while (0)
+
+#define APPLY_P \
+do { /* 1: mm_rotr_32( .., 15 ) on each of the sixteen B words */ \
+    B0 = mm_rotr_32( B0, 15 ); \
+    B1 = mm_rotr_32( B1, 15 ); \
+    B2 = mm_rotr_32( B2, 15 ); \
+    B3 = mm_rotr_32( B3, 15 ); \
+    B4 = mm_rotr_32( B4, 15 ); \
+    B5 = mm_rotr_32( B5, 15 ); \
+    B6 = mm_rotr_32( B6, 15 ); \
+    B7 = mm_rotr_32( B7, 15 ); \
+    B8 = mm_rotr_32( B8, 15 ); \
+    B9 = mm_rotr_32( B9, 15 ); \
+    BA = mm_rotr_32( BA, 15 ); \
+    BB = mm_rotr_32( BB, 15 ); \
+    BC = mm_rotr_32( BC, 15 ); \
+    BD = mm_rotr_32( BD, 15 ); \
+    BE = mm_rotr_32( BE, 15 ); \
+    BF = mm_rotr_32( BF, 15 ); \
+    PERM_STEP_0; /* 2: the three permutation passes */ \
+    PERM_STEP_1; \
+    PERM_STEP_2; \
+    A0B = _mm_add_epi32( A0B, C6 ); /* 3: 36 adds of C into A; the A index */ \
+    A0A = _mm_add_epi32( A0A, C5 ); /* cycles 0B..00 three times while the */ \
+    A09 = _mm_add_epi32( A09, C4 ); /* C index counts down from 6 mod 16   */ \
+    A08 = _mm_add_epi32( A08, C3 ); \
+    A07 = _mm_add_epi32( A07, C2 ); \
+    A06 = _mm_add_epi32( A06, C1 ); \
+    A05 = _mm_add_epi32( A05, C0 ); \
+    A04 = _mm_add_epi32( A04, CF ); \
+    A03 = _mm_add_epi32( A03, CE ); \
+    A02 = _mm_add_epi32( A02, CD ); \
+    A01 = _mm_add_epi32( A01, CC ); \
+    A00 = _mm_add_epi32( A00, CB ); \
+    A0B = _mm_add_epi32( A0B, CA ); /* second pass over A */ \
+    A0A = _mm_add_epi32( A0A, C9 ); \
+    A09 = _mm_add_epi32( A09, C8 ); \
+    A08 = _mm_add_epi32( A08, C7 ); \
+    A07 = _mm_add_epi32( A07, C6 ); \
+    A06 = _mm_add_epi32( A06, C5 ); \
+    A05 = _mm_add_epi32( A05, C4 ); \
+    A04 = _mm_add_epi32( A04, C3 ); \
+    A03 = _mm_add_epi32( A03, C2 ); \
+    A02 = _mm_add_epi32( A02, C1 ); \
+    A01 = _mm_add_epi32( A01, C0 ); \
+    A00 = _mm_add_epi32( A00, CF ); \
+    A0B = _mm_add_epi32( A0B, CE ); /* third pass over A */ \
+    A0A = _mm_add_epi32( A0A, CD ); \
+    A09 = _mm_add_epi32( A09, CC ); \
+    A08 = _mm_add_epi32( A08, CB ); \
+    A07 = _mm_add_epi32( A07, CA ); \
+    A06 = _mm_add_epi32( A06, C9 ); \
+    A05 = _mm_add_epi32( A05, C8 ); \
+    A04 = _mm_add_epi32( A04, C7 ); \
+    A03 = _mm_add_epi32( A03, C6 ); \
+    A02 = _mm_add_epi32( A02, C5 ); \
+    A01 = _mm_add_epi32( A01, C4 ); \
+    A00 = _mm_add_epi32( A00, C3 ); \
+} while (0)
+
+#define INCR_W   do { /* bump the counter held in Wlow/Whigh */ \
+		Wlow = T32(Wlow + 1); \
+		if (Wlow == 0) Whigh = T32(Whigh + 1); /* carry once Wlow is 0 */ \
+	} while (0)
+
+static const sph_u32 A_init_256[] = {   /* initial A words, used when size != 512 */
+	C32(0x52F84552), C32(0xE54B7999), C32(0x2D8EE3EC), C32(0xB9645191),
+	C32(0xE0078B86), C32(0xBB7C44C9), C32(0xD2B5C1CA), C32(0xB0D2EB8C),
+	C32(0x14CE5A45), C32(0x22AF50DC), C32(0xEFFDBC6B), C32(0xEB21B74A)
+};
+
+static const sph_u32 B_init_256[] = {   /* initial B words, used when size != 512 */
+	C32(0xB555C6EE), C32(0x3E710596), C32(0xA72A652F), C32(0x9301515F),
+	C32(0xDA28C1FA), C32(0x696FD868), C32(0x9CB6BF72), C32(0x0AFE4002),
+	C32(0xA6E03615), C32(0x5138C1D4), C32(0xBE216306), C32(0xB38B8890),
+	C32(0x3EA8B96B), C32(0x3299ACE4), C32(0x30924DD4), C32(0x55CB34A5)
+};
+
+static const sph_u32 C_init_256[] = {   /* initial C words, used when size != 512 */
+	C32(0xB405F031), C32(0xC4233EBA), C32(0xB3733979), C32(0xC0DD9D55),
+	C32(0xC51C28AE), C32(0xA327B8E1), C32(0x56C56167), C32(0xED614433),
+	C32(0x88B59D60), C32(0x60E2CEBA), C32(0x758B4B8B), C32(0x83E82A7F),
+	C32(0xBC968828), C32(0xE6E00BF7), C32(0xBA839E55), C32(0x9B491C60)
+};
+
+static const sph_u32 A_init_512[] = {   /* initial A words, used when size == 512 */
+	C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
+	C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
+	C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
+};
+
+static const sph_u32 B_init_512[] = {   /* initial B words, used when size == 512 */
+	C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
+	C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
+	C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
+	C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
+};
+
+static const sph_u32 C_init_512[] = {   /* initial C words, used when size == 512 */
+	C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
+	C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
+	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
+	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
+};
+
+// Fill a 4-way Shabal context with the initial A/B/C state for the given
+// digest size: 512 selects the *_init_512 tables, any other value selects
+// the *_init_256 tables. Each IV word is replicated with _mm_set1_epi32.
+static void
+shabal_4way_init( void *cc, unsigned size )
+{
+   shabal_4way_context *ctx = (shabal_4way_context*)cc;
+   const int wide = ( size == 512 );
+   const sph_u32 *iv_a = wide ? A_init_512 : A_init_256;
+   const sph_u32 *iv_b = wide ? B_init_512 : B_init_256;
+   const sph_u32 *iv_c = wide ? C_init_512 : C_init_256;
+   int k;
+
+   // A holds 12 vectors.
+   for ( k = 0; k < 12; k++ )
+      ctx->A[k] = _mm_set1_epi32( iv_a[k] );
+
+   // B and C hold 16 vectors each.
+   for ( k = 0; k < 16; k++ )
+   {
+      ctx->B[k] = _mm_set1_epi32( iv_b[k] );
+      ctx->C[k] = _mm_set1_epi32( iv_c[k] );
+   }
+
+   // The block counter W starts at 1 (Wlow = 1, Whigh = 0) and the input
+   // buffer starts empty (ptr = 0).
+   ctx->Wlow  = 1;
+   ctx->Whigh = 0;
+   ctx->ptr   = 0;
+}
+
+static void   /* absorb len bytes (counted per lane) of vector input */
+shabal_4way_core( void *cc, const unsigned char *data, size_t len )
+{
+   shabal_4way_context *sc = (shabal_4way_context*)cc;
+    __m128i *buf;
+    __m128i *vdata = (__m128i*)data;   // input is read one __m128i at a time
+   const int buf_size = 64;   // block size in the units of ptr and len
+   size_t ptr;
+   DECL_STATE
+   // ptr and len advance by 4 per __m128i element (see the >>2 indexing).
+   buf = sc->buf;
+   ptr = sc->ptr;
+   // Too little input to complete a block: stash it in buf and return.
+   if ( len < (buf_size - ptr ) )
+   {
+      memcpy_128( buf + (ptr>>2), vdata, len>>2 );
+      ptr += len;
+      sc->ptr = ptr;
+      return;
+   }
+   READ_STATE(sc);
+   // NOTE(review): assumes len is a multiple of 4 (>>2 copies) - confirm.
+   while ( len > 0 )
+   {
+      size_t clen;
+      clen = buf_size - ptr;   // room left in the current block
+      if ( clen > len )
+         clen = len;
+      memcpy_128( buf + (ptr>>2), vdata, clen>>2 );
+
+      ptr += clen;
+      vdata += clen>>2;
+      len -= clen;
+      if ( ptr == buf_size )   // block complete: run one compression step
+      {
+         DECODE_BLOCK;
+         INPUT_BLOCK_ADD;
+         XOR_W;
+         APPLY_P;
+         INPUT_BLOCK_SUB;
+         SWAP_BC;
+         INCR_W;   // advance the block counter
+         ptr = 0;
+      }
+   }
+   WRITE_STATE(sc);
+   sc->ptr = ptr;
+}
+
+static void   /* pad the last block, finalize, and store the digest at dst */
+shabal_4way_close( void *cc, unsigned ub, unsigned n, void *dst,
+                   unsigned size_words )
+{
+   shabal_4way_context *sc = (shabal_4way_context*)cc;
+    __m128i *buf;
+   const int buf_size = 64;
+   size_t ptr;
+   int i;
+   unsigned z, zz;
+   DECL_STATE
+   // Padding byte: bits 7..8-n of ub, then a single 1 bit, then zeros.
+   buf = sc->buf;
+   ptr = sc->ptr;
+   z = 0x80 >> n;
+   zz = ((ub & -z) | z) & 0xFF;
+   buf[ptr>>2] = _mm_set1_epi32( zz );   // whole word replaced, all lanes alike
+   memset_zero_128( buf + (ptr>>2) + 1, ( (buf_size - ptr) >> 2 ) - 1 );
+   READ_STATE(sc);
+   DECODE_BLOCK;
+   INPUT_BLOCK_ADD;
+   XOR_W;
+   APPLY_P;
+   // Three more SWAP_BC / XOR_W / APPLY_P passes; W is not incremented.
+   for ( i = 0; i < 3; i ++ )
+   {
+      SWAP_BC;
+      XOR_W;
+      APPLY_P;
+   }
+   // Output comes from B; the state is not written back to sc.
+   __m128i *d = (__m128i*)dst;
+   if ( size_words == 16 )   // 512
+   {
+      d[ 0] = B0; d[ 1] = B1; d[ 2] = B2; d[ 3] = B3;
+      d[ 4] = B4; d[ 5] = B5; d[ 6] = B6; d[ 7] = B7;
+      d[ 8] = B8; d[ 9] = B9; d[10] = BA; d[11] = BB;
+      d[12] = BC; d[13] = BD; d[14] = BE; d[15] = BF;
+   }
+   else    // 256
+   {
+      d[ 0] = B8; d[ 1] = B9; d[ 2] = BA; d[ 3] = BB;   // upper half of B only
+      d[ 4] = BC; d[ 5] = BD; d[ 6] = BE; d[ 7] = BF;
+   }
+}
+
+// Initialize cc as a 4-way Shabal context with the 256-bit initial values.
+void shabal256_4way_init( void *cc )
+{
+   shabal_4way_init( cc, 256 );
+}
+
+// Feed len bytes (per lane) of data to the 4-way Shabal-256 state in cc.
+void shabal256_4way( void *cc, const void *data, size_t len )
+{
+   shabal_4way_core( cc, (const unsigned char*)data, len );
+}
+
+// Finish 4-way Shabal-256 with no extra bits; stores 8 vectors at dst.
+void shabal256_4way_close( void *cc, void *dst )
+{
+   shabal_4way_close( cc, 0, 0, dst, 8 );
+}
+
+// Append bits 7..8-n of ub, then finish Shabal-256 (8 output vectors).
+void shabal256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                       void *dst )
+{
+   shabal_4way_close( cc, ub, n, dst, 8 );
+}
+
+// Initialize cc as a 4-way Shabal context with the 512-bit initial values.
+void shabal512_4way_init( void *cc )
+{
+   shabal_4way_init( cc, 512 );
+}
+
+// Feed len bytes (per lane) of data to the 4-way Shabal-512 state in cc.
+void shabal512_4way( void *cc, const void *data, size_t len )
+{
+   shabal_4way_core( cc, (const unsigned char*)data, len );
+}
+
+// Finish 4-way Shabal-512 with no extra bits; stores 16 vectors at dst.
+void shabal512_4way_close( void *cc, void *dst )
+{
+   shabal_4way_close( cc, 0, 0, dst, 16 );
+}
+
+// Append bits 7..8-n of ub, then finish Shabal-512 (16 output vectors).
+void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst )
+{
+   shabal_4way_close( cc, ub, n, dst, 16 );
+}
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/shabal/shabal-hash-4way.h
+++ b/algo/shabal/shabal-hash-4way.h
@@ -0,0 +1,82 @@
+/* $Id: sph_shabal.h 175 2010-05-07 16:03:20Z tp $ */
+/**
+ * 4-way Shabal interface. Shabal is a family of functions which differ by
+ * their output size; this 4-way implementation declares Shabal only for
+ * output sizes 256 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     shabal-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SHABAL_HASH_4WAY_H__
+#define SHABAL_HASH_4WAY_H__ 1
+
+#ifdef __AVX2__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define SPH_SIZE_shabal256   256
+
+#define SPH_SIZE_shabal512   512
+
+typedef struct {   // state shared by the 256- and 512-bit 4-way variants
+	__m128i buf[16] __attribute__ ((aligned (64)));   // pending input block
+	__m128i A[12], B[16], C[16];   // state words, one vector per word
+	sph_u32 Whigh, Wlow;   // block counter, split into two 32-bit halves
+        size_t ptr;   // fill level of buf; advances by 4 per buf element
+} shabal_4way_context;
+
+typedef shabal_4way_context shabal256_4way_context;
+typedef shabal_4way_context shabal512_4way_context;
+
+void shabal256_4way_init( void *cc );   // load the 256-bit initial state
+void shabal256_4way( void *cc, const void *data, size_t len );   // absorb data
+void shabal256_4way_close( void *cc, void *dst );   // finish; 8 vectors to dst
+void shabal256_4way_addbits_and_close(	void *cc, unsigned ub, unsigned n,
+                                       void *dst );   // close with n extra bits
+
+void shabal512_4way_init( void *cc );   // load the 512-bit initial state
+void shabal512_4way( void *cc, const void *data, size_t len );   // absorb data
+void shabal512_4way_close( void *cc, void *dst );   // finish; 16 vectors to dst
+void shabal512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                       void *dst );   // close with n extra bits
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
+
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -0,0 +1,670 @@
+/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * SHAvite-3 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+
+#ifdef __AES__
+
+#include "sph_shavite.h"
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE
+#define SPH_SMALL_FOOTPRINT_SHAVITE   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+#define C32   SPH_C32
+
+/*
+ * As of round 2 of the SHA-3 competition, the published reference
+ * implementation and test vectors are wrong, because they use
+ * big-endian AES tables while the internal decoding uses little-endian.
+ * The code below follows the specification. To turn it into a code
+ * which follows the reference implementation (the one called "BugFix"
+ * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out
+ * the code below (from the '#define AES_BIG_ENDIAN...' to the definition
+ * of the AES_ROUND_NOKEY macro) and replace it with the version which
+ * is commented out afterwards.
+ */
+
+#define AES_BIG_ENDIAN   0
+#include "algo/sha/aes_helper.c"
+
+static const sph_u32 IV512[] = {   /* initial chaining value copied into sc->h */
+	C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC),
+	C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC),
+	C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47),
+	C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A)
+};
+
+#define AES_ROUND_NOKEY(x0, x1, x2, x3)   do { /* in-place round on 4 words */ \
+		sph_u32 t0 = (x0); /* copy inputs so outputs may alias them */ \
+		sph_u32 t1 = (x1); \
+		sph_u32 t2 = (x2); \
+		sph_u32 t3 = (x3); \
+		AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \
+	} while (0)
+
+  
+#define KEY_EXPAND_ELT(k0, k1, k2, k3)   do { /* round, then rotate words */ \
+		sph_u32 kt; \
+		AES_ROUND_NOKEY(k1, k2, k3, k0); \
+		kt = (k0); /* (k0,k1,k2,k3) <- (k1,k2,k3,k0) */ \
+		(k0) = (k1); \
+		(k1) = (k2); \
+		(k2) = (k3); \
+		(k3) = kt; \
+	} while (0)
+
+
+#if SPH_SMALL_FOOTPRINT_SHAVITE
+
+/*
+ * This function assumes that "msg" is aligned for 32-bit access.
+ */
+static void   /* compress one 128-byte block into sc->h (scalar variant) */
+c512(sph_shavite_big_context *sc, const void *msg)
+{
+	sph_u32 p0, p1, p2, p3, p4, p5, p6, p7;
+	sph_u32 p8, p9, pA, pB, pC, pD, pE, pF;
+	sph_u32 rk[448];   /* whole key schedule: 14 rounds x 32 words */
+	size_t u;
+	int r, s;
+	/* rk[0..31] is the message block read as little-endian 32-bit words. */
+#if SPH_LITTLE_ENDIAN
+	memcpy(rk, msg, 128);
+#else
+	for (u = 0; u < 32; u += 4) {
+		rk[u + 0] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  0);
+		rk[u + 1] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  4);
+		rk[u + 2] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) +  8);
+		rk[u + 3] = sph_dec32le_aligned(
+			(const unsigned char *)msg + (u << 2) + 12);
+	}
+#endif
+	u = 32;   /* expand rk[32..447], alternating two kinds of step */
+	for (;;) {
+		for (s = 0; s < 4; s ++) {   /* 8 AES-based groups of 4 words */
+			sph_u32 x0, x1, x2, x3;
+
+			x0 = rk[u - 31];   /* words u-32..u-29, rotated by one */
+			x1 = rk[u - 30];
+			x2 = rk[u - 29];
+			x3 = rk[u - 32];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 32) {   /* counter injected at offsets 32 and 440 */
+				rk[ 32] ^= sc->count0;
+				rk[ 33] ^= sc->count1;
+				rk[ 34] ^= sc->count2;
+				rk[ 35] ^= SPH_T32(~sc->count3);
+			} else if (u == 440) {
+				rk[440] ^= sc->count1;
+				rk[441] ^= sc->count0;
+				rk[442] ^= sc->count3;
+				rk[443] ^= SPH_T32(~sc->count2);
+			}
+			u += 4;
+
+			x0 = rk[u - 31];   /* second group of this iteration */
+			x1 = rk[u - 30];
+			x2 = rk[u - 29];
+			x3 = rk[u - 32];
+			AES_ROUND_NOKEY(x0, x1, x2, x3);
+			rk[u + 0] = x0 ^ rk[u - 4];
+			rk[u + 1] = x1 ^ rk[u - 3];
+			rk[u + 2] = x2 ^ rk[u - 2];
+			rk[u + 3] = x3 ^ rk[u - 1];
+			if (u == 164) {   /* counter injected at offsets 164 and 316 */
+				rk[164] ^= sc->count3;
+				rk[165] ^= sc->count2;
+				rk[166] ^= sc->count1;
+				rk[167] ^= SPH_T32(~sc->count0);
+			} else if (u == 316) {
+				rk[316] ^= sc->count2;
+				rk[317] ^= sc->count3;
+				rk[318] ^= sc->count0;
+				rk[319] ^= SPH_T32(~sc->count1);
+			}
+			u += 4;
+		}
+		if (u == 448)   /* schedule complete */
+			break;
+		for (s = 0; s < 8; s ++) {   /* 8 XOR-only groups of 4 words */
+			rk[u + 0] = rk[u - 32] ^ rk[u - 7];
+			rk[u + 1] = rk[u - 31] ^ rk[u - 6];
+			rk[u + 2] = rk[u - 30] ^ rk[u - 5];
+			rk[u + 3] = rk[u - 29] ^ rk[u - 4];
+			u += 4;
+		}
+	}
+	/* Work on a copy of the chaining value; sc->h is updated at the end. */
+	p0 = sc->h[0x0];
+	p1 = sc->h[0x1];
+	p2 = sc->h[0x2];
+	p3 = sc->h[0x3];
+	p4 = sc->h[0x4];
+	p5 = sc->h[0x5];
+	p6 = sc->h[0x6];
+	p7 = sc->h[0x7];
+	p8 = sc->h[0x8];
+	p9 = sc->h[0x9];
+	pA = sc->h[0xA];
+	pB = sc->h[0xB];
+	pC = sc->h[0xC];
+	pD = sc->h[0xD];
+	pE = sc->h[0xE];
+	pF = sc->h[0xF];
+	u = 0;   /* rk is now consumed sequentially, 32 words per round */
+	for (r = 0; r < 14; r ++) {
+#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3)   do { \
+		sph_u32 x0, x1, x2, x3; \
+		x0 = r0 ^ rk[u ++]; \
+		x1 = r1 ^ rk[u ++]; \
+		x2 = r2 ^ rk[u ++]; \
+		x3 = r3 ^ rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		x0 ^= rk[u ++]; \
+		x1 ^= rk[u ++]; \
+		x2 ^= rk[u ++]; \
+		x3 ^= rk[u ++]; \
+		AES_ROUND_NOKEY(x0, x1, x2, x3); \
+		l0 ^= x0; \
+		l1 ^= x1; \
+		l2 ^= x2; \
+		l3 ^= x3; \
+	} while (0)
+
+#define WROT(a, b, c, d)   do { \
+		sph_u32 t = d; \
+		d = c; \
+		c = b; \
+		b = a; \
+		a = t; \
+	} while (0)
+
+		C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7);   /* 4 keyed AES rounds on p4..p7 into p0..p3 */
+		C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF);   /* same, on pC..pF into p8..pB */
+		/* Rotate the four 128-bit quarters by one position. */
+		WROT(p0, p4, p8, pC);
+		WROT(p1, p5, p9, pD);
+		WROT(p2, p6, pA, pE);
+		WROT(p3, p7, pB, pF);
+
+#undef C512_ELT
+#undef WROT
+	}
+	sc->h[0x0] ^= p0;   /* feed-forward into the chaining value */
+	sc->h[0x1] ^= p1;
+	sc->h[0x2] ^= p2;
+	sc->h[0x3] ^= p3;
+	sc->h[0x4] ^= p4;
+	sc->h[0x5] ^= p5;
+	sc->h[0x6] ^= p6;
+	sc->h[0x7] ^= p7;
+	sc->h[0x8] ^= p8;
+	sc->h[0x9] ^= p9;
+	sc->h[0xA] ^= pA;
+	sc->h[0xB] ^= pB;
+	sc->h[0xC] ^= pC;
+	sc->h[0xD] ^= pD;
+	sc->h[0xE] ^= pE;
+	sc->h[0xF] ^= pF;
+}
+
+#else
+
+static void   // AES-NI compression of one 128-byte block into sc->h
+c512( sph_shavite_big_context *sc, const void *msg )
+{
+   __m128i p0, p1, p2, p3, x;
+   __m128i k00, k01, k02, k03, k10, k11, k12, k13;   // current round keys
+   __m128i *m = (__m128i*)msg;     // msg and sc->h are dereferenced as
+   __m128i *h = (__m128i*)sc->h;   // __m128i, so both need 16-byte alignment
+   int r;
+
+   p0 = h[0];
+   p1 = h[1];
+   p2 = h[2];
+   p3 = h[3];
+
+   // round 0: the eight message vectors m[0..7] are used directly as keys
+   k00 = m[0];
+   x = _mm_xor_si128( p1, k00 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k01 = m[1];
+   x = _mm_xor_si128( x, k01 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k02 = m[2];
+   x = _mm_xor_si128( x, k02 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k03 = m[3];
+   x = _mm_xor_si128( x, k03 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   p0 = _mm_xor_si128( p0, x );
+
+   k10 = m[4];
+   x = _mm_xor_si128( p3, k10 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k11 = m[5];
+   x = _mm_xor_si128( x, k11 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k12 = m[6];
+   x = _mm_xor_si128( x, k12 );
+   x = _mm_aesenc_si128( x, mm_zero );
+
+   k13 = m[7];
+   x = _mm_xor_si128( x, k13 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   p2 = _mm_xor_si128( p2, x );
+
+   for ( r = 0; r < 3; r ++ )   // rounds 1..12, four per iteration
+   {
+      // round 1, 5, 9: each key = mm_rotr_1x32( aesenc( key ) ) ^ previous key
+      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+      k00 = _mm_xor_si128( k00, k13 );
+
+      if ( r == 0 )   // counter words folded into k00 in round 1 only
+         k00 = _mm_xor_si128( k00, _mm_set_epi32(
+                  ~sc->count3, sc->count2, sc->count1, sc->count0 ) );
+
+      x = _mm_xor_si128( p0, k00 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+      k01 = _mm_xor_si128( k01, k00 );
+
+      if ( r == 1 )   // counter words folded into k01 in round 5 only
+         k01 = _mm_xor_si128( k01, _mm_set_epi32(
+                  ~sc->count0, sc->count1, sc->count2, sc->count3 ) );
+
+      x = _mm_xor_si128( x, k01 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+      k02 = _mm_xor_si128( k02, k01 );
+
+      x = _mm_xor_si128( x, k02 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+      k03 = _mm_xor_si128( k03, k02 );
+
+      x = _mm_xor_si128( x, k03 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p3 = _mm_xor_si128( p3, x );
+      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+      k10 = _mm_xor_si128( k10, k03 );
+
+      x = _mm_xor_si128( p2, k10 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+      k11 = _mm_xor_si128( k11, k10 );
+
+      x = _mm_xor_si128( x, k11 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+      k12 = _mm_xor_si128( k12, k11 );
+
+      x = _mm_xor_si128( x, k12 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+      k13 = _mm_xor_si128( k13, k12 );
+
+      if ( r == 2 )   // counter words folded into k13 in round 9 only
+         k13 = _mm_xor_si128( k13, _mm_set_epi32(
+                  ~sc->count1, sc->count0, sc->count3, sc->count2 ) );
+
+      x = _mm_xor_si128( x, k13 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p1 = _mm_xor_si128( p1, x );
+
+      // round 2, 6, 10: each key ^= mm_rotr256hi_1x32 of the two keys before it
+
+      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
+      x = _mm_xor_si128( p3, k00 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
+      x = _mm_xor_si128( x, k01 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
+      x = _mm_xor_si128( x, k02 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
+      x = _mm_xor_si128( x, k03 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      p2 = _mm_xor_si128( p2, x );
+      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
+      x = _mm_xor_si128( p1, k10 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
+      x = _mm_xor_si128( x, k11 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
+      x = _mm_xor_si128( x, k12 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
+      x = _mm_xor_si128( x, k13 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p0 = _mm_xor_si128( p0, x );
+
+      // round 3, 7, 11: same key update as round 1, without counter words
+
+      k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+      k00 = _mm_xor_si128( k00, k13 );
+
+      x = _mm_xor_si128( p2, k00 );
+      x = _mm_aesenc_si128( x, mm_zero );
+
+      k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+      k01 = _mm_xor_si128( k01, k00 );
+
+      x = _mm_xor_si128( x, k01 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+      k02 = _mm_xor_si128( k02, k01 );
+
+      x = _mm_xor_si128( x, k02 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+      k03 = _mm_xor_si128( k03, k02 );
+
+      x = _mm_xor_si128( x, k03 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p1 = _mm_xor_si128( p1, x );
+      k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+      k10 = _mm_xor_si128( k10, k03 );
+
+      x = _mm_xor_si128( p0, k10 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+      k11 = _mm_xor_si128( k11, k10 );
+
+      x = _mm_xor_si128( x, k11 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+      k12 = _mm_xor_si128( k12, k11 );
+
+      x = _mm_xor_si128( x, k12 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+      k13 = _mm_xor_si128( k13, k12 );
+
+      x = _mm_xor_si128( x, k13 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p3 = _mm_xor_si128( p3, x );
+
+      // round 4, 8, 12: same key update as round 2
+
+      k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) );
+
+      x = _mm_xor_si128( p1, k00 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) );
+
+      x = _mm_xor_si128( x, k01 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) );
+
+      x = _mm_xor_si128( x, k02 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) );
+
+      x = _mm_xor_si128( x, k03 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p0 = _mm_xor_si128( p0, x );
+      k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) );
+
+      x = _mm_xor_si128( p3, k10 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) );
+
+      x = _mm_xor_si128( x, k11 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) );
+
+      x = _mm_xor_si128( x, k12 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) );
+
+      x = _mm_xor_si128( x, k13 );
+      x = _mm_aesenc_si128( x, mm_zero );
+      p2 = _mm_xor_si128( p2, x );
+   }
+
+   // round 13: same key update as round 1; counter words go into k12 here
+
+   k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) );
+   k00 = _mm_xor_si128( k00, k13 );
+
+   x = _mm_xor_si128( p0, k00 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) );
+   k01 = _mm_xor_si128( k01, k00 );
+
+   x = _mm_xor_si128( x, k01 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) );
+   k02 = _mm_xor_si128( k02, k01 );
+
+   x = _mm_xor_si128( x, k02 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) );
+   k03 = _mm_xor_si128( k03, k02 );
+
+   x = _mm_xor_si128( x, k03 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   p3 = _mm_xor_si128( p3, x );
+   k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) );
+   k10 = _mm_xor_si128( k10, k03 );
+
+   x = _mm_xor_si128( p2, k10 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) );
+   k11 = _mm_xor_si128( k11, k10 );
+
+   x = _mm_xor_si128( x, k11 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) );
+   k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
+               ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );
+
+   x = _mm_xor_si128( x, k12 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) );
+   k13 = _mm_xor_si128( k13, k12 );
+
+   x = _mm_xor_si128( x, k13 );
+   x = _mm_aesenc_si128( x, mm_zero );
+   p1 = _mm_xor_si128( p1, x );
+   // Feed-forward; note the index mapping h[0..3] <- p2, p3, p0, p1.
+   h[0] = _mm_xor_si128( h[0], p2 );
+   h[1] = _mm_xor_si128( h[1], p3 );
+   h[2] = _mm_xor_si128( h[2], p0 );
+   h[3] = _mm_xor_si128( h[3], p1 );
+}
+
+#endif
+
+// Reset a SHAvite big context: clear the four counter words and the buffer
+// fill pointer, then copy the initial chaining value from iv into sc->h
+// (sizeof sc->h bytes are read from iv).
+static void
+shavite_big_aesni_init( sph_shavite_big_context *sc, const sph_u32 *iv )
+{
+	sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+	sc->ptr = 0;
+	memcpy( sc->h, iv, sizeof sc->h );
+}
+
+// Absorb len bytes of data. Input is staged in sc->buf; every time the
+// buffer fills, the four-word counter is advanced by 1024 (with carry
+// rippling from count0 up to count3) and the block is passed to c512().
+static void shavite_big_aesni_core( sph_shavite_big_context *sc,
+                                    const void *data, size_t len )
+{
+	const unsigned char *src = (const unsigned char *)data;
+	unsigned char *const block = sc->buf;
+	size_t fill = sc->ptr;
+	while ( len != 0 )
+	{
+		size_t room = (sizeof sc->buf) - fill;
+		size_t take = ( len < room ) ? len : room;
+		memcpy( block + fill, src, take );
+		src  += take;
+		fill += take;
+		len  -= take;
+		if ( fill != sizeof sc->buf )
+			continue;
+		// Full block: bump the counter first, then compress.
+		sc->count0 = SPH_T32( sc->count0 + 1024 );
+		if ( sc->count0 == 0 )
+		{
+			sc->count1 = SPH_T32( sc->count1 + 1 );
+			if ( sc->count1 == 0 )
+			{
+				sc->count2 = SPH_T32( sc->count2 + 1 );
+				if ( sc->count2 == 0 )
+					sc->count3 = SPH_T32( sc->count3 + 1 );
+			}
+		}
+		c512( sc, block );
+		fill = 0;
+	}
+	sc->ptr = fill;
+}
+
+static void   /* pad, run the final c512 call(s), write the digest to dst */
+shavite_big_aesni_close( sph_shavite_big_context *sc, unsigned ub, unsigned n,
+                         void *dst, size_t out_size_w32 )
+{
+	unsigned char *buf;
+	size_t ptr, u;
+	unsigned z;
+	sph_u32 count0, count1, count2, count3;
+	/* Locals keep the final counter; sc->count* may be zeroed below. */
+	buf = sc->buf;
+	ptr = sc->ptr;
+	count0 = (sc->count0 += SPH_T32(ptr << 3) + n);   /* add buffered bits */
+	count1 = sc->count1;
+	count2 = sc->count2;
+	count3 = sc->count3;
+	z = 0x80 >> n;
+	z = ((ub & -z) | z) & 0xFF;   /* bits 7..8-n of ub, then a 1 bit */
+	if (ptr == 0 && n == 0) {   /* no pending data: block is padding only */
+		buf[0] = 0x80;
+		memset(buf + 1, 0, 109);
+		sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+	} else if (ptr < 110) {   /* padding byte and trailer fit in this block */
+		buf[ptr ++] = z;
+		memset(buf + ptr, 0, 110 - ptr);
+	} else {   /* no room for the trailer: compress, then an extra block */
+		buf[ptr ++] = z;
+		memset(buf + ptr, 0, 128 - ptr);
+		c512(sc, buf);
+		memset(buf, 0, 110);
+		sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0;
+	}
+	sph_enc32le(buf + 110, count0);   /* trailer: counter at bytes 110..125 */
+	sph_enc32le(buf + 114, count1);
+	sph_enc32le(buf + 118, count2);
+	sph_enc32le(buf + 122, count3);
+	buf[126] = (unsigned char) (out_size_w32 << 5);   /* low byte of 32*words */
+	buf[127] = (unsigned char) (out_size_w32 >> 3);   /* high byte of 32*words */
+	c512(sc, buf);
+	for (u = 0; u < out_size_w32; u ++)   /* emit the first out_size_w32 words of h */
+		sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]);
+}
+
+// Start a SHAvite-512 computation in cc from the IV512 chaining value.
+void sph_shavite512_aesni_init( void *cc )
+{
+	shavite_big_aesni_init( (sph_shavite_big_context *)cc, IV512 );
+}
+
+// Feed len bytes of data to the SHAvite-512 computation in cc.
+void sph_shavite512_aesni( void *cc, const void *data, size_t len )
+{
+	shavite_big_aesni_core( (sph_shavite_big_context *)cc, data, len );
+}
+
+// Finish SHAvite-512 with no extra bits; writes 16 32-bit words to dst.
+void sph_shavite512_aesni_close( void *cc, void *dst )
+{
+	shavite_big_aesni_close( (sph_shavite_big_context *)cc, 0, 0, dst, 16 );
+}
+
+// Append bits 7..8-n of ub, then finish SHAvite-512 (16 output words).
+void sph_shavite512_aesni_addbits_and_close( void *cc, unsigned ub,
+                                             unsigned n, void *dst )
+{
+	shavite_big_aesni_close( (sph_shavite_big_context *)cc, ub, n, dst, 16 );
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/shavite/sph_shavite.c
+++ b/algo/shavite/sph_shavite.c
@@ -1731,21 +1731,21 @@ sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)

 /* see sph_shavite.h */
 void
-sph_shavite512_init(void *cc)
+sph_shavite512_sw_init(void *cc)
 {
 	shavite_big_init(cc, IV512);
 }

 /* see sph_shavite.h */
 void
-sph_shavite512(void *cc, const void *data, size_t len)
+sph_shavite512_sw(void *cc, const void *data, size_t len)
 {
 	shavite_big_core(cc, data, len);
 }

 /* see sph_shavite.h */
 void
-sph_shavite512_close(void *cc, void *dst)
+sph_shavite512_sw_close(void *cc, void *dst)
 {
 	shavite_big_close(cc, 0, 0, dst, 16);
 //	shavite_big_init(cc, IV512);
@@ -1753,7 +1753,7 @@ sph_shavite512_close(void *cc, void *dst)

 /* see sph_shavite.h */
 void
-sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+sph_shavite512_sw_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
 	shavite_big_close(cc, ub, n, dst, 16);
 //	shavite_big_init(cc, IV512);
--- a/algo/shavite/sph_shavite.h
+++ b/algo/shavite/sph_shavite.h
@@ -77,9 +77,9 @@ extern "C"{
 */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
-	unsigned char buf[64];    /* first field, for alignment */
+	unsigned char buf[64] __attribute__ ((aligned (64)));   /* input block, 64-byte aligned */
+        sph_u32 h[8] __attribute__ ((aligned (32)));   /* moved ahead of ptr, 32-byte aligned */
 	size_t ptr;
-	sph_u32 h[8];
 	sph_u32 count0, count1;
 #endif
 } sph_shavite_small_context;
@@ -108,9 +108,9 @@ typedef sph_shavite_small_context sph_shavite256_context;
 */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
-	unsigned char buf[128];    /* first field, for alignment */
+	unsigned char buf[128] __attribute__ ((aligned (64)));   /* input block; the AES-NI c512 reads it as __m128i */
+        sph_u32 h[16] __attribute__ ((aligned (32)));   /* chaining value; the AES-NI c512 accesses it as __m128i */
 	size_t ptr;
-	sph_u32 h[16];
 	sph_u32 count0, count1, count2, count3;
 #endif
 } sph_shavite_big_context;
@@ -262,51 +262,37 @@ void sph_shavite384_close(void *cc, void *dst);
 void sph_shavite384_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

-/**
- * Initialize a SHAvite-512 context. This process performs no memory allocation.
- *
- * @param cc   the SHAvite-512 context (pointer to a
- *             <code>sph_shavite512_context</code>)
- */
-void sph_shavite512_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the SHAvite-512 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_shavite512(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current SHAvite-512 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (64 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the SHAvite-512 context
- * @param dst   the destination buffer
- */
-void sph_shavite512_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (64 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the SHAvite-512 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_shavite512_addbits_and_close(
+// The sw variants are always declared; the aesni variants only when __AES__
+// is defined. The generic sph_shavite512* names are macros aliasing one set.
+void sph_shavite512_sw_init(void *cc);
+void sph_shavite512_sw(void *cc, const void *data, size_t len);
+void sph_shavite512_sw_close(void *cc, void *dst);
+void sph_shavite512_sw_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);
-	
+
+#ifdef __AES__
+void sph_shavite512_aesni_init(void *cc);
+void sph_shavite512_aesni(void *cc, const void *data, size_t len);
+void sph_shavite512_aesni_close(void *cc, void *dst);
+void sph_shavite512_aesni_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#define sph_shavite512_init  sph_shavite512_aesni_init
+#define sph_shavite512       sph_shavite512_aesni
+#define sph_shavite512_close sph_shavite512_aesni_close
+#define sph_shavite512_addbits_and_close \
+                             sph_shavite512_aesni_addbits_and_close
+
+#else
+
+#define sph_shavite512_init  sph_shavite512_sw_init
+#define sph_shavite512       sph_shavite512_sw
+#define sph_shavite512_close sph_shavite512_sw_close
+#define sph_shavite512_addbits_and_close \
+                             sph_shavite512_sw_addbits_and_close
+
+#endif
+
 #ifdef __cplusplus
 }
 #endif	
--- a/algo/simd/sse2/nist.h
+++ b/algo/simd/sse2/nist.h
@@ -8,28 +8,12 @@
 #define DATA_ALIGN(x) __declspec(align(16)) x
 #endif

-#include "compat.h"
+#include "simd-compat.h"
 #include "algo/sha/sha3-defs.h"
 /*
 * NIST API Specific types.
 */

-//typedef unsigned char BitSequence;
-
-//#ifdef HAS_64
- // typedef u64 DataLength;
-//#else
- // typedef unsigned long DataLength;
-//#endif
-
-// can't find u32 or fft-t
-#include <stdint.h>
-typedef uint32_t u32;
-typedef int fft_t;
-
-
-//typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn;
-
 typedef struct {
  unsigned int hashbitlen;
  unsigned int blocksize;
--- a/algo/simd/sse2/simd-compat.h
+++ b/algo/simd/sse2/simd-compat.h
@@ -1,5 +1,5 @@
-#ifndef __COMPAT_H__
-#define __COMPAT_H__
+#ifndef __SIMD_COMPAT_H__
+#define __SIMD_COMPAT_H__

 #include <limits.h>

@@ -24,14 +24,7 @@
 */

 #include <stdint.h>
-
-#ifdef UINT32_MAX
-typedef uint32_t u32;
-#else
-typedef uint_fast32_t u32;
-#endif
-
-typedef unsigned long long u64;
+#include "algo/sha/brg_types.h"

 #define C32(x)    ((u32)(x))

--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -0,0 +1,119 @@
+#include "skein-gate.h"
+#include <string.h>
+#include <stdint.h>
+#include <openssl/sha.h>
+#include "skein-hash-4way.h"
+
+#if defined (__AVX2__)
+
+// Hash four interleaved 80-byte inputs: Skein-512 on all four lanes at
+// once, then an independent SHA-256 over each lane's 64-byte Skein digest.
+//
+// state: output buffer, receives 4 x 32 bytes (lane 0 first, then 1, 2, 3).
+// input: the four inputs in 4x64 interleaved form, as consumed by
+//        skein512_4way().
+void skeinhash_4way( void *state, const void *input )
+{
+     // One 64-byte row per lane; each row keeps the 64-byte alignment.
+     uint64_t hash[4][8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     // Byte pointer so lane offsets are plain ISO C pointer arithmetic
+     // (arithmetic on void* is a GNU extension).
+     unsigned char *out = (unsigned char*)state;
+     skein512_4way_context ctx_skein;
+     SHA256_CTX            ctx_sha256;
+
+     skein512_4way_init( &ctx_skein );
+     skein512_4way( &ctx_skein, input, 80 );
+     skein512_4way_close( &ctx_skein, vhash );
+
+     // Split the interleaved 512-bit digests into one buffer per lane.
+     mm256_deinterleave_4x64( hash[0], hash[1], hash[2], hash[3], vhash, 512 );
+
+     for ( int lane = 0; lane < 4; lane++ )
+     {
+         SHA256_Init( &ctx_sha256 );
+         SHA256_Update( &ctx_sha256, (unsigned char*)hash[lane], 64 );
+         // The 32-byte SHA-256 digest goes straight to this lane's slot.
+         SHA256_Final( out + 32*lane, &ctx_sha256 );
+     }
+}
+
+// Scan nonces four at a time, starting at work->data[19], until at least
+// one lane produces a hash accepted by fulltest(), n reaches max_nonce, or
+// a restart is requested for this thread.
+//
+// thr_id:      index into work_restart[] for this thread.
+// work:        data/target are read; nonces[] and nfound[] receive the
+//              per-lane results; data[19] is updated only when lane 0 hits.
+// max_nonce:   scanning stops once n is no longer below this value.
+// hashes_done: receives the number of nonces hashed by this call.
+//
+// Returns the number of lanes (0..4) that found a valid hash in the last
+// iteration.
+int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done )
+{
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t hash[8*4] __attribute__ ((aligned (64)));
+    uint32_t edata[20] __attribute__ ((aligned (64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t n = first_nonce;
+    // hash is returned deinterleaved: 8 u32 words per lane, lane after lane
+    uint32_t *nonces = work->nonces;
+    bool *found = work->nfound;
+    int num_found = 0;
+
+    // data is 80 bytes: 20 u32 or 10 u64.
+    swab32_array( edata, pdata, 20 );
+
+    // All four lanes start from the same header; only the nonce differs.
+    mm256_interleave_4x64( vdata, edata, edata, edata, edata, 640 );
+
+    // The nonce word of lane L sits at vdata[73 + 2*L]  (73 = 9*8 + 1).
+    uint32_t *noncep = vdata + 73;
+
+    do
+    {
+        for ( int lane = 0; lane < 4; lane++ )
+        {
+            found[lane] = false;
+            be32enc( noncep + 2*lane, n + lane );
+        }
+
+        skeinhash_4way( hash, vdata );
+
+        for ( int lane = 0; lane < 4; lane++ )
+        {
+            uint32_t *lane_hash = hash + 8*lane;
+
+            // Cheap pre-filter on the top word; fulltest() makes the final
+            // decision. Use <= so a hash whose top word equals the target's
+            // is still handed to fulltest() instead of being dropped.
+            if ( lane_hash[7] <= Htarg && fulltest( lane_hash, ptarget ) )
+            {
+                found[lane] = true;
+                num_found++;
+                nonces[lane] = n + lane;
+                // always put nonce0 in work data for compatibility with
+                // non vectored algos.
+                if ( lane == 0 )
+                    pdata[19] = n;
+            }
+        }
+        n += 4;
+    } while ( (num_found == 0) && (n < max_nonce)
+               && !work_restart[thr_id].restart );
+
+    // n advanced by exactly 4 for every 4 hashes computed, so the
+    // difference is already the exact count.
+    *hashes_done = n - first_nonce;
+    return num_found;
+}
+
+#endif
--- a/algo/skein/skein-gate.c
+++ b/algo/skein/skein-gate.c
@@ -0,0 +1,20 @@
+#include "skein-gate.h"
+#include "sph_skein.h"
+#include "skein-hash-4way.h"
+
+// Value handed out through the gate's get_max64 hook for skein.
+// NOTE(review): presumably an upper bound on nonces scanned per call;
+// confirm against the algo-gate API.
+int64_t skein_get_max64(void) { return 0x7ffffLL; }
+
+// Populate the algo gate for skein.
+// When SKEIN_4WAY is defined the 4-way scanhash/hash pair is installed,
+// otherwise the scalar pair. Always returns true.
+bool register_skein_algo( algo_gate_t* gate )
+{
+    gate->optimizations = FOUR_WAY_OPT | SHA_OPT;
+#if defined (SKEIN_4WAY)
+    gate->scanhash  = (void*)&scanhash_skein_4way;
+    gate->hash      = (void*)&skeinhash_4way;
+#else
+    gate->scanhash  = (void*)&scanhash_skein;
+    gate->hash      = (void*)&skeinhash;
+#endif
+    gate->get_max64 = (void*)&skein_get_max64;
+    return true;
+}
+
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	bee78eac76	v3.7.9	2018-01-08 22:04:43 -05:00
Jay D Dee	2d2e54f001	v3.7.8	2017-12-30 19:19:46 -05:00
Jay D Dee	79164c24b5	v3.7.7	2017-12-17 12:00:42 -05:00
Jay D Dee	7a1389998b	v3.7.6	2017-12-14 18:28:51 -05:00
Jay D Dee	af1c940919	v3.7.5	2017-12-08 15:39:28 -05:00
Jay D Dee	4b57ac0eb9	v3.7.4	2017-11-28 16:32:04 -05:00
Jay D Dee	6d1361c87f	v3.7.3	2017-11-20 21:19:15 -05:00
Jay D Dee	ab39e88318	v3.7.2	2017-11-01 11:03:23 -04:00
Jay D Dee	8ff52e7ad6	v3.7.1	2017-10-31 00:25:24 -04:00
Jay D Dee	aaa48599ad	v3.7.0	2017-10-17 11:38:59 -04:00
Jay D Dee	c76574b2cd	v3.6.11	2017-10-12 15:14:37 -04:00
Jay D Dee	989fb42d20	v3.6.10	2017-10-12 11:49:40 -04:00
Jay D Dee	710c852f05	v3.6.9	2017-10-09 21:45:27 -04:00
Jay D Dee	39f089d3dc	v3.6.8	2017-07-31 20:02:45 -04:00
Jay D Dee	ec4f6028a2	v3.6.7	2017-07-24 21:38:32 -04:00
Jay D Dee	f8907677f6	v3.6.6	2017-07-01 14:37:11 -04:00
Jay D Dee	7544cb956c	v3.6.5	2017-05-19 16:38:26 -04:00
Jay D Dee	e7dbd27636	v3.6.4	2017-05-02 10:28:19 -04:00