v3.8.3

v3.8.2.1
v3.8.2
2025-09-17 23:44:27 +00:00 · 2018-02-23 12:39:15 -05:00 · 2018-02-17 13:52:24 -05:00 · 2018-02-15 14:48:50 -05:00 · 2018-02-09 23:30:14 -05:00 · 2018-02-07 16:38:45 -05:00
298 changed files with 43578 additions and 6480 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,7 +11,6 @@ autom4te.cache
 Makefile
 Makefile.in
 INSTALL
-configure
 configure.lineno
 depcomp
 missing
--- a/12
+++ b/12
@@ -16,4 +16,16 @@ LucasJones

 tpruvot@github

+elmad
+
+djm34
+
+palmd
+
+ig0tik3d
+
+Wolf0
+
+Optiminer
+
 Jay D Dee
--- a/34
+++ b/34
@@ -5,19 +5,31 @@
 # ex: docker run -it --rm cpuminer-opt:latest -a cryptonight -o cryptonight.eu.nicehash.com:3355 -u 1MiningDW2GKzf4VQfmp4q2XoUvR6iy6PD.worker1 -p x -t 3
 #

-FROM ubuntu:16.04
-RUN BUILD_DEPS="build-essential \
-    libssl-dev \
-	  libgmp-dev \
-	  libcurl4-openssl-dev \
-	  libjansson-dev \
-	  automake" && \
+# Build
+FROM ubuntu:16.04 as builder

-	  apt-get update && \
-	  apt-get install -y ${BUILD_DEPS}
+RUN apt-get update \
+  && apt-get install -y \
+    build-essential \
+    libssl-dev \
+    libgmp-dev \
+    libcurl4-openssl-dev \
+    libjansson-dev \
+    automake \
+  && rm -rf /var/lib/apt/lists/*

 COPY . /app/
-RUN	cd /app/ && ./build.sh
+RUN cd /app/ && ./build.sh

-ENTRYPOINT ["/app/cpuminer"]
+# App
+FROM ubuntu:16.04
+
+RUN apt-get update \
+  && apt-get install -y \
+    libcurl3 \
+    libjansson4 \
+  && rm -rf /var/lib/apt/lists/*
+
+COPY --from=builder /app/cpuminer .
+ENTRYPOINT ["./cpuminer"]
 CMD ["-h"]
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,30 +22,6 @@ cpuminer_SOURCES = \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
-  algo/groestl/sph_groestl.c \
-  algo/skein/sph_skein.c \
-  algo/bmw/sph_bmw.c \
-  algo/shavite/sph_shavite.c \
-  algo/shavite/shavite.c \
-  algo/echo/sph_echo.c \
-  algo/blake/sph_blake.c \
-  algo/blake/sph_blake2b.c \
-  algo/heavy/sph_hefty1.c \
-  algo/blake/mod_blakecoin.c \
-  algo/luffa/sph_luffa.c \
-  algo/cubehash/sph_cubehash.c \
-  algo/simd/sph_simd.c \
-  algo/hamsi/sph_hamsi.c \
-  algo/fugue/sph_fugue.c \
-  algo/gost/sph_gost.c \
-  algo/jh/sph_jh.c \
-  algo/keccak/sph_keccak.c \
-  algo/keccak/keccak.c\
-  algo/sha/sph_sha2.c \
-  algo/sha/sph_sha2big.c \
-  algo/shabal/sph_shabal.c \
-  algo/whirlpool/sph_whirlpool.c\
-  crypto/blake2s.c \
  crypto/oaes_lib.c \
  crypto/c_keccak.c \
  crypto/c_groestl.c \
@@ -61,82 +37,212 @@ cpuminer_SOURCES = \
  algo/argon2/ar2/cores.c \
  algo/argon2/ar2/ar2-scrypt-jane.c \
  algo/argon2/ar2/blake2b.c \
-  algo/axiom.c \
+  algo/blake/sph_blake.c \
+  algo/blake/blake-hash-4way.c \
+  algo/blake/blake-gate.c \
  algo/blake/blake.c \
+  algo/blake/blake-4way.c \
+  algo/blake/sph_blake2b.c \
  algo/blake/blake2b.c \
+  algo/blake/sph-blake2s.c \
+  algo/blake/blake2s-hash-4way.c \
  algo/blake/blake2s.c \
+  algo/blake/blake2s-gate.c \
+  algo/blake/blake2s-4way.c \
+  algo/blake/blakecoin-gate.c \
+  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
+  algo/blake/blakecoin-4way.c \
+  algo/blake/decred-gate.c \
  algo/blake/decred.c \
+  algo/blake/decred-4way.c \
+  algo/blake/pentablake-gate.c \
+  algo/blake/pentablake-4way.c \
  algo/blake/pentablake.c \
+  algo/bmw/sph_bmw.c \
+  algo/bmw/bmw-hash-4way.c \
  algo/bmw/bmw256.c \
-  algo/cubehash/sse2/cubehash_sse2.c\
  algo/cryptonight/cryptolight.c \
  algo/cryptonight/cryptonight-common.c\
  algo/cryptonight/cryptonight-aesni.c\
  algo/cryptonight/cryptonight.c\
-  algo/drop.c \
+  algo/cubehash/sph_cubehash.c \
+  algo/cubehash/sse2/cubehash_sse2.c\
+  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
-  algo/fresh.c \
+  algo/gost/sph_gost.c \
+  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
+  algo/groestl/myrgr-gate.c \
+  algo/groestl/myrgr-4way.c \
  algo/groestl/myr-groestl.c \
  algo/groestl/aes_ni/hash-groestl.c \
  algo/groestl/aes_ni/hash-groestl256.c \
-  algo/haval/haval.c\
+  algo/fugue/sph_fugue.c \
+  algo/hamsi/sph_hamsi.c \
+  algo/hamsi/hamsi-hash-4way.c \
+  algo/haval/haval.c \
+  algo/haval/haval-hash-4way.c \
+  algo/heavy/sph_hefty1.c \
  algo/heavy/heavy.c \
  algo/heavy/bastion.c \
-  algo/hmq1725.c \
  algo/hodl/aes.c \
  algo/hodl/hodl-gate.c \
  algo/hodl/hodl-wolf.c \
  algo/hodl/sha512_avx.c \
  algo/hodl/sha512_avx2.c \
-  algo/lbry.c \
+  algo/jh/sph_jh.c \
+  algo/jh/jh-hash-4way.c \
+  algo/jh/jha-gate.c \
+  algo/jh/jha-4way.c \
+  algo/jh/jha.c \
+  algo/keccak/sph_keccak.c \
+  algo/keccak/keccak.c\
+  algo/keccak/keccak-hash-4way.c \
+  algo/keccak/keccak-4way.c\
+  algo/keccak/keccak-gate.c \
+  algo/keccak/sse2/keccak.c \
+  algo/luffa/sph_luffa.c \
  algo/luffa/luffa.c \
-  algo/luffa/sse2/luffa_for_sse2.c \
+  algo/luffa/luffa_for_sse2.c \
+  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
+  algo/lyra2/lyra2rev2-gate.c \
  algo/lyra2/lyra2rev2.c \
+  algo/lyra2/lyra2rev2-4way.c \
  algo/lyra2/lyra2re.c \
-  algo/lyra2/zcoin.c \
+  algo/lyra2/lyra2z-gate.c \
+  algo/lyra2/lyra2z.c \
+  algo/lyra2/lyra2z-4way.c \
  algo/lyra2/lyra2z330.c \
-  algo/keccak/sse2/keccak.c \
+  algo/lyra2/lyra2h-gate.c \
+  algo/lyra2/lyra2h.c \
+  algo/lyra2/lyra2h-4way.c \
+  algo/lyra2/allium-gate.c \
+  algo/lyra2/allium-4way.c \
+  algo/lyra2/allium.c \
  algo/m7m.c \
-  algo/neoscrypt.c \
-  algo/nist5.c \
+  algo/neoscrypt/neoscrypt.c \
+  algo/nist5/nist5-gate.c \
+  algo/nist5/nist5-4way.c \
+  algo/nist5/nist5.c \
+  algo/nist5/zr5.c \
  algo/pluck.c \
+  algo/quark/quark-gate.c \
  algo/quark/quark.c \
+  algo/quark/quark-4way.c \
+  algo/quark/anime-gate.c \
+  algo/quark/anime.c \
+  algo/quark/anime-4way.c \
+  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
+  algo/qubit/qubit-2way.c \
+  algo/qubit/deep-gate.c \
+  algo/qubit/deep-2way.c \
  algo/qubit/deep.c \
  algo/ripemd/sph_ripemd.c \
+  algo/ripemd/ripemd-hash-4way.c \
+  algo/ripemd/lbry-gate.c \
+  algo/ripemd/lbry.c \
+  algo/ripemd/lbry-4way.c \
  algo/scrypt.c \
  algo/scryptjane/scrypt-jane.c \
+  algo/sha/sph_sha2.c \
+  algo/sha/sph_sha2big.c \
+  algo/sha/sha2-hash-4way.c \
  algo/sha/sha2.c \
  algo/sha/sha256t.c \
-  algo/simd/sse2/nist.c \
-  algo/simd/sse2/vector.c \
+  algo/shabal/sph_shabal.c \
+  algo/shabal/shabal-hash-4way.c \
+  algo/shavite/sph_shavite.c \
+  algo/shavite/sph-shavite-aesni.c \
+  algo/shavite/shavite.c \
+  algo/simd/sph_simd.c \
+  algo/simd/nist.c \
+  algo/simd/vector.c \
+  algo/simd/simd-hash-2way.c \
+  algo/skein/sph_skein.c \
+  algo/skein/skein-hash-4way.c \
  algo/skein/skein.c \
+  algo/skein/skein-4way.c \
+  algo/skein/skein-gate.c \
  algo/skein/skein2.c \
-  algo/s3.c \
+  algo/skein/skein2-4way.c \
+  algo/skein/skein2-gate.c \
+  algo/sm3/sm3.c \
+  algo/sm3/sm3-hash-4way.c \
  algo/tiger/sph_tiger.c \
-  algo/timetravel.c \
-  algo/veltor.c \
+  algo/whirlpool/sph_whirlpool.c \
+  algo/whirlpool/whirlpool-hash-4way.c \
+  algo/whirlpool/whirlpool-gate.c \
+  algo/whirlpool/whirlpool-4way.c \
  algo/whirlpool/whirlpool.c \
  algo/whirlpool/whirlpoolx.c \
+  algo/x11/x11-gate.c \
  algo/x11/x11.c \
-  algo/x11/x11evo.c \
+  algo/x11/x11-4way.c \
+  algo/x11/x11gost-gate.c \
  algo/x11/x11gost.c \
+  algo/x11/x11gost-4way.c \
+  algo/x11/c11-gate.c \
  algo/x11/c11.c \
+  algo/x11/c11-4way.c \
+  algo/x11/tribus-gate.c \
+  algo/x11/tribus.c \
+  algo/x11/tribus-4way.c \
+  algo/x11/timetravel-gate.c \
+  algo/x11/timetravel.c \
+  algo/x11/timetravel-4way.c \
+  algo/x11/timetravel10-gate.c \
+  algo/x11/timetravel10.c \
+  algo/x11/timetravel10-4way.c \
+  algo/x11/fresh.c \
+  algo/x11/x11evo.c \
+  algo/x11/x11evo-4way.c \
+  algo/x11/x11evo-gate.c \
+  algo/x12/x12-gate.c \
+  algo/x12/x12.c \
+  algo/x12/x12-4way.c \
+  algo/x13/x13-gate.c \
  algo/x13/x13.c \
+  algo/x13/x13-4way.c \
+  algo/x13/x13sm3-gate.c \
+  algo/x13/x13sm3.c \
+  algo/x13/x13sm3-4way.c \
+  algo/x13/phi1612-gate.c \
+  algo/x13/phi1612.c \
+  algo/x13/phi1612-4way.c \
+  algo/x13/skunk-gate.c \
+  algo/x13/skunk-4way.c \
+  algo/x13/skunk.c \
+  algo/x13/drop.c \
+  algo/x14/x14-gate.c \
  algo/x14/x14.c \
+  algo/x14/x14-4way.c \
+  algo/x14/veltor-gate.c \
+  algo/x14/veltor.c \
+  algo/x14/veltor-4way.c \
+  algo/x14/polytimos-gate.c \
+  algo/x14/polytimos.c \
+  algo/x14/polytimos-4way.c \
+  algo/x14/axiom.c \
+  algo/x15/x15-gate.c \
  algo/x15/x15.c \
+  algo/x15/x15-4way.c \
+  algo/x17/x17-gate.c \
  algo/x17/x17.c \
-  algo/xevan.c \
+  algo/x17/x17-4way.c \
+  algo/x17/xevan-gate.c \
+  algo/x17/xevan.c \
+  algo/x17/xevan-4way.c \
+  algo/x17/x16r-gate.c \
+  algo/x17/x16r.c \
+  algo/x17/x16r-4way.c \
+  algo/x17/hmq1725.c \
  algo/yescrypt/yescrypt.c \
-  algo/yescrypt/yescrypt-common.c \
-  algo/yescrypt/sha256_Y.c\
-  algo/yescrypt/yescrypt-simd.c\
-  algo/zr5.c
-
+  algo/yescrypt/sha256_Y.c \
+  algo/yescrypt/yescrypt-simd.c

 disable_flags =

--- a/README.md
+++ b/README.md
@@ -13,66 +13,6 @@ mailto://jayddee246@gmail.com

 See file RELEASE_NOTES for change log and compile instructions.

-Supported Algorithms
--------------------
-
-                          argon2
-                          axiom        Shabal-256 MemoHash
-                          bastion
-                          blake        Blake-256 (SFR)
-                          blakecoin    blake256r8
-                          blake2s      Blake-2 S
-                          bmw          BMW 256
-                          c11          Chaincoin
-                          cryptolight  Cryptonight-light
-                          cryptonight  cryptonote, Monero (XMR)
-                          decred
-                          deep         Deepcoin (DCN)
-                          drop         Dropcoin
-                          fresh        Fresh
-                          groestl      dmd-gr, Groestl coin
-                          heavy        Heavy
-                          hmq1725      Espers
-                          hodl         Hodlcoin
-                          keccak       Keccak
-                          lbry         LBC, LBRY Credits
-                          luffa        Luffa
-                          lyra2re      lyra2
-                          lyra2rev2    lyrav2, Vertcoin
-                          lyra2z       Zcoin (XZC)
-                          lyra2z330    Lyra2 330 rows, Zoin (ZOI)
-                          m7m          Magi (XMG)
-                          myr-gr       Myriad-Groestl
-                          neoscrypt    NeoScrypt(128, 2, 1)
-                          nist5        Nist5
-                          pluck        Pluck:128 (Supcoin)
-                          pentablake   Pentablake
-                          quark        Quark
-                          qubit        Qubit
-                          scrypt       scrypt(1024, 1, 1) (default)
-                          scrypt:N     scrypt(N, 1, 1)
-                          scryptjane:nf
-                          sha256d      Double SHA-256
-                          sha256t      Triple SHA-256, Onecoin (OC)
-                          shavite3     Shavite3
-                          skein        Skein+Sha (Skeincoin)
-                          skein2       Double Skein (Woodcoin)
-                          timetravel   Machinecoin (MAC)
-                          vanilla      blake256r8vnl (VCash)
-                          veltor
-                          whirlpool
-                          whirlpoolx
-                          x11          Dash
-                          x11evo       Revolvercoin
-                          x11gost      sib (SibCoin)
-                          x13          X13
-                          x14          X14
-                          x15          X15
-                          x17
-                          xevan        Bitsend
-                          yescrypt
-                          zr5          Ziftr
-
 Requirements
 ------------

@@ -85,13 +25,92 @@ algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
 Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
 performance.

+ARM CPUs are not supported.
+
 2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
 Centos are known to work and have all dependencies in their repositories.
 Others may work but may require more effort.
 64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.

-3. Stratum pool, cpuminer-opt only supports stratum minning. Some algos
-may work wallet mining but there are no guarantees.
+macOS (OS X) is not supported.
+
+3. Stratum pool. Some algos may work wallet mining using getwork or GBT. YMMV.
+
+Supported Algorithms
+--------------------
+
+                          allium       Garlicoin
+                          anime        Animecoin
+                          argon2
+                          axiom        Shabal-256 MemoHash
+                          bastion
+                          blake        Blake-256 (SFR)
+                          blakecoin    blake256r8
+                          blake2s      Blake-2 S
+                          bmw          BMW 256
+                          c11          Chaincoin
+                          cryptolight  Cryptonight-light
+                          cryptonight  cryptonote, Monero (XMR)
+                          decred
+                          deep         Deepcoin (DCN)
+                          dmd-gr       Diamond-Groestl
+                          drop         Dropcoin
+                          fresh        Fresh
+                          groestl      Groestl coin
+                          heavy        Heavy
+                          hmq1725      Espers
+                          hodl         Hodlcoin
+                          jha          Jackpotcoin
+                          keccak       Maxcoin
+                          keccakc      Creative coin
+                          lbry         LBC, LBRY Credits
+                          luffa        Luffa
+                          lyra2h       Hppcoin
+                          lyra2re      lyra2
+                          lyra2rev2    lyra2v2, Vertcoin
+                          lyra2z       Zcoin (XZC)
+                          lyra2z330    Lyra2 330 rows, Zoin (ZOI)
+                          m7m          Magi (XMG)
+                          myr-gr       Myriad-Groestl
+                          neoscrypt    NeoScrypt(128, 2, 1)
+                          nist5        Nist5
+                          pentablake   Pentablake
+                          phi1612      phi, LUX coin
+                          pluck        Pluck:128 (Supcoin)
+                          polytimos    Ninja
+                          quark        Quark
+                          qubit        Qubit
+                          scrypt       scrypt(1024, 1, 1) (default)
+                          scrypt:N     scrypt(N, 1, 1)
+                          scryptjane:nf
+                          sha256d      Double SHA-256
+                          sha256t      Triple SHA-256, Onecoin (OC)
+                          shavite3     Shavite3
+                          skein        Skein+Sha (Skeincoin)
+                          skein2       Double Skein (Woodcoin)
+                          skunk        Signatum (SIGT)
+                          timetravel   Machinecoin (MAC)
+                          timetravel10 Bitcore
+                          tribus       Denarius (DNR)
+                          vanilla      blake256r8vnl (VCash)
+                          veltor       (VLT)
+                          whirlpool
+                          whirlpoolx
+                          x11          Dash
+                          x11evo       Revolvercoin
+                          x11gost      sib (SibCoin)
+                          x12          Galaxie Cash (GCH)
+                          x13          X13
+                          x13sm3       hsr (Hshare)
+                          x14          X14
+                          x15          X15
+                          x16r         Ravencoin
+                          x17
+                          xevan        Bitsend
+                          yescrypt     Globalboost-Y (BSTY)
+                          yescryptr8   BitZeny (ZNY)
+                          yescryptr16  Yenten (YTN)
+                          zr5          Ziftr

 Errata
 ------
@@ -114,13 +133,20 @@ forum at:

 https://bitcointalk.org/index.php?topic=1326803.0

+All problem reports must be accompanied by a proper definition.
+This should include how the problem occurred, the command line and
+output from the miner showing the startup and any errors.
+
 Donations
 ---------

-I do not do this for money but I have a donation address if users
-are so inclined.
+cpuminer-opt has no fees of any kind but donations are accepted.

-bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
+BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
+ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
+LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
+BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
+BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

 Happy mining!

--- a/README.txt
+++ b/README.txt
@@ -1,6 +1,9 @@
 This file is included in the Windows binary package. Compile instructions
 for Linux and Windows can be found in RELEASE_NOTES.

+cpuminer is a console program that is executed from a DOS command prompt.
+There is no GUI and no mouse support.
+
 Choose the exe that best matches you CPU's features or use trial and
 error to find the fastest one that doesn't crash. Pay attention to
 the features listed at cpuminer startup to ensure you are mining at
@@ -8,15 +11,26 @@ optimum speed using all the available features.

 Architecture names and compile options used are only provided for Intel
 Core series. Pentium and Celeron often have fewer features.
-AMD is YMMV, see previous paragraph.

-Exe name                  Compile opts       Arch name
-
-cpuminer-sse2.exe         -march=core2,      Core2   
-cpuminer-sse42.exe        -march=corei7,     Nehalem
-cpuminer-aes-sse42.exe    -maes -msse4.2     Westmere
-cpuminer-aes-avx.exe      -march=corei7-avx, Sandybridge, Ivybridge
-cpuminer-aes-avx2.exe     -march=core-avx2,  Haswell, Broadwell, Skylake, Kabylake
+AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
+supported by cpuminer-opt due to an incompatible implementation of SSE2 on
+these CPUs. Some algos may crash the miner with an invalid instruction.
+Users are recommended to use an unoptimized miner such as cpuminer-multi.

+Exe name                Compile flags              Arch name
+
+cpuminer-sse2.exe      "-march=core2"              Core2, Nehalem   
+cpuminer-aes-sse42.exe "-maes -msse4.2"            Westmere
+cpuminer-aes-avx.exe   "-march=corei7-avx"         Sandybridge, Ivybridge
+cpuminer-avx2.exe      "-march=core-avx2"          Haswell...
+cpuminer-avx2-sha.exe  "-march=core-avx2 -msha"    Ryzen
+
+If you like this software feel free to donate:
+
+BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
+ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
+LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
+BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
+BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ


--- a/233
+++ b/233
@@ -6,9 +6,31 @@ compile flag.
 HW SHA support is only available when compiled from source, Windows binaries
 are not yet available.

+cpuminer-opt is a console program, if you're using a mouse you're doing it
+wrong.
+
+Security warning
+----------------
+
+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive, they are flagged simply because they are miners. The source
+code is open for anyone to inspect. If you don't trust the software, don't use
+it.
+
+The cryptographic code has been taken from trusted sources but has been
+modified for speed at the expense of accepted security practices. This
+code should not be imported into applications where secure cryptography is
+required.
+
 Compile Instructions
 --------------------

+Requirements:
+
+Intel Core2 or newer, or AMD Steamroller or newer CPU. ARM CPUs are not
+supported.
+64 bit Linux or Windows operating system. Apple is not supported.
+
 Building on linux prerequisites:

 It is assumed users know how to install packages on their system and
@@ -25,14 +47,11 @@ are some of the ones that may not be in the default install and need to
 be installed manually. There may be others, read the error messages they
 will give a clue as to the missing package.

-The folliwing command should install everything you need on Debian based
-packages:
+The following command should install everything you need on Debian based
+distributions such as Ubuntu:

 sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake

-Building on Linux, see below for Windows.
-
-Dependencies

 build-essential  (for Ubuntu, Development Tools package group on Fedora)
 automake
@@ -44,9 +63,16 @@ pthreads
 zlib

 SHA support on AMD Ryzen CPUs requires gcc version 5 or higher and openssl 1.1
-or higher. Additional compile options may also be required such as
+or higher. Reports of improved performance on Ryzen when using openssl 1.0.2
+have been due to AVX and AVX2 optimizations added to that version.
+Additional improvements are expected on Ryzen with openssl 1.1. Additional compile options may also be required such as
 "-march-znver1" or "-msha".

+Additional instructions for static compilation can be found here:
+https://lxadm.com/Static_compilation_of_cpuminer
+Static builds should only be considered in a homogeneous HW and SW environment.
+Local builds will always have the best performance and compatibility.
+
 Extract cpuminer source.

 tar xvzf cpuminer-opt-x.y.z.tar.gz
@@ -58,9 +84,22 @@ Run ./build.sh to build on Linux or execute the following commands.
 CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
 make

+Additional optional compile flags, add the following to CFLAGS to activate:
+
+-DUSE_SPH_SHA
+
+SPH may give slightly better performance on algos that use sha256 when using
+openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA; both perform
+better than SPH.
+
 Start mining.

-./cpuminer -a algo ...
+./cpuminer -a algo -o url -u username -p password
+
+Windows
+
+Precompiled Windows binaries are built on a Linux host using Mingw
+with a more recent compiler than the following Windows hosted procedure.

 Building on Windows prerequisites:

@@ -92,17 +131,21 @@ or similar Windows program.
 In msys shell cd to miner directory.
 cd /c/path/to/cpuminer-opt

-Run winbuild.sh to build on Windows or execute the following commands.
+Run build.sh to build on Windows or execute the following commands.

 ./autogen.sh
-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make

+Start mining
+
+cpuminer.exe -a algo -o url -u user -p password
+
 The following tips may be useful for older AMD CPUs.

-AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
-supported by cpuminer-opt due to an incompatible implementation of SSE2 on
-these CPUs. Some algos may crash the miner with an invalid instruction.
+AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are
+not supported by cpuminer-opt due to an incompatible implementation of SSE2
+on these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.

 Some users with AMD CPUs without AES_NI have reported problems compiling
@@ -116,6 +159,172 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.8.3
+
+More restoration of lost lyra2 hash.
+8 way AVX2 and 4way AVX optimization for blakecoin, vanilla & blake2s.
+8 way AVX2 for lbry.
+Scaled hashrate for API output.
+A couple of GBT fixes.
+
+v3.8.2.1
+
+Fixed low difficulty rejects with allium.
+Fixed qubit AVX2.
+Restored lyra2z lost hash.
+Fixed build.sh
+
+v3.8.2
+
+Fixed and faster myr-gr.
+Added x12 algo (Galaxie Cash), allium algo (Garlicoin).
+Faster lyra2rev2, lbry, skein.
+Large reduction in compiler warnings.
+
+v3.8.1.1
+
+Fixed Windows AVX2 crash.
+
+v3.8.1
+
+Fixes x16r on CPUs with only SSE2.
+More Optimizations for X algos, qubit & deep.
+Corrected algo optimizations for scrypt and yescrypt, no new optimizations.
+
+v3.8.0.1
+
+Fixed x16r AVX2 low hash rate.
+
+v3.8.0
+
+4way is no longer a separate feature, it is included in AVX2.
+Added x16r algo for Ravencoin, anime algo for Animecoin.
+More 4way optimizations for X13 and up.
+Tweaked CPU affinity to better support more than 64 CPUs.
+Fixed compile problem on some old AMD CPUs.
+
+v3.7.10
+
+4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10
+   x11evo, blakecoin.
+Faster x13sm3 (hsr).
+Added share difficulty to accepted message.
+
+v3.7.9
+
+Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
+Additional 4way optimizations for X algos.
+New algo yescryptr8 for BitZeny, not to be confused with original
+yescrypt Globalboost-Y.
+
+v3.7.8
+
+Partial 4way optimization for most X algos including c11, xevan, phi, hsr
+
+v3.7.7
+
+Fixed regression caused by 64 CPU support.
+Fixed lyra2h.
+
+v3.7.6
+
+Added lyra2h algo for Hppcoin.
+Added support for more than 64 CPUs.
+Optimized shavite512 with AES, improves x11 etc.
+
+v3.7.5
+
+New algo keccakc for Creative coin with 4way optimizations
+
+Rewrote some AVX/AVX2 code for more consistent implementation and some
+optimizing.
+
+Enhanced capabilities check to support 4way, more precise reporting of
+features (not all algos use SSE2), and better error messages when using
+an incompatible pre-built version (Windows users).
+
+v3.7.4
+
+Removed unnecessary build options.
+
+Added 4way support for tribus and nist5.
+
+v3.7.3
+
+Added polytimos algo.
+
+Introducing 4-way AVX2 optimization giving up to 4x performance improvement
+on many compute bound algos. First supported algos: skein, skein2, blake &
+keccak. This feature is only available when compiled from source. See above
+for instructions how to enable 4-way during compilation.
+
+Updated Dockerfile.
+
+v3.7.2
+
+Fixed yescryptr16
+Changed default sha256 and sha512 to openssl. This should be used when
+compiling with openssl 1.0.2 or higher (Ubuntu 16.04).
+This should increase the hashrate for yescrypt, yescryptr16, m7m, xevan, skein,
+myr-gr & others  when openssl 1.0.2 is installed.
+Users with openssl 1.0.1 (Ubuntu 14.04) may get better performance by adding
+"-DUSE_SPH_SHA" to CFLAGS.
+Windows binaries are compiled with -DUSE_SPH_SHA and won't get the speedup.
+
+v3.7.1
+
+Added yescryptr16 algo for Yenten coin
+Added SHA support to yescrypt and yescryptr16
+Small code cleanup
+
+v3.7.0
+
+Fixed x14 misalignment bug.
+Fixed decred stake version bug.
+Getwork fixes for algos that use big endian data encoding: m7m, zr5, neoscrypt,
+decred.
+
+v3.6.10
+
+Fixed misalignment bug in hsr.
+
+v3.6.9
+
+Added phi1612 algo for LUX coin
+Added x13sm3 algo, alias hsr, for Hshare coin
+
+v3.6.8
+
+Fixed timetravel10 on Windows.
+
+v3.6.7
+
+Skunk algo added.
+Tribus a little faster.
+Minor restructuring.
+
+v3.6.6
+
+added tribus algo for Denarius (DNR)
+
+configure removed from .gitignore. This should allow git clone to compile
+on Windows/mingw.
+
+Fixed CPU temperature monitoring on some CPUs (Linux only).
+
+Fixed a compile error on FreeBSD (unsupported YMMV).
+
+v3.6.5
+
+Cryptonight a little faster.
+Added jha algo (Jackpotcoin) with AES optimizations.
+
+v3.6.4
+
+Added support for Bitcore (BTX) using the timetravel10 algo, optimized for
+AES and AVX2.
+"-a bitcore" works as an alias and is less typing than "-a timetravel10".
+
 v3.6.3

 Fixed all known issues with SHA support on AMD Ryzen CPUs, still no
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -16,7 +16,7 @@
 #include <memory.h>
 #include <unistd.h>
 #include <openssl/sha.h>
-#include "miner.h"
+//#include "miner.h"
 #include "algo-gate-api.h"

 // Define null and standard functions.
@@ -77,6 +77,12 @@ void algo_not_tested()
  applog(LOG_WARNING,"and bad things may happen. Use at your own risk.");
 }

+void four_way_not_tested()
+{
+  applog( LOG_WARNING,"Algo %s has not been tested using 4way. It may not", algo_names[opt_algo] );
+  applog( LOG_WARNING,"work or may be slower. Please report your results.");
+}
+
 void algo_not_implemented()
 {
  applog(LOG_ERR,"Algo %s has not been Implemented.",algo_names[opt_algo]);
@@ -114,8 +120,8 @@ void init_algo_gate( algo_gate_t* gate )
   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
   gate->set_target              = (void*)&std_set_target;
-   gate->work_decode             = (void*)&std_work_decode;
-   gate->submit_getwork_result   = (void*)&std_submit_getwork_result;
+   gate->work_decode             = (void*)&std_le_work_decode;
+   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
   gate->calc_network_diff       = (void*)&std_calc_network_diff;
@@ -124,7 +130,7 @@ void init_algo_gate( algo_gate_t* gate )
   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
   gate->stratum_handle_response = (void*)&std_stratum_handle_response;
-   gate->optimizations           = SSE2_OPT;
+   gate->optimizations           = EMPTY_SET;
   gate->ntime_index             = STD_NTIME_INDEX;
   gate->nbits_index             = STD_NBITS_INDEX;
   gate->nonce_index             = STD_NONCE_INDEX;
@@ -132,6 +138,10 @@ void init_algo_gate( algo_gate_t* gate )
   gate->work_cmp_size           = STD_WORK_CMP_SIZE;
 }

+// Ignore warnings for not yet defined register functions
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
+
 // called by each thread that uses the gate
 bool register_algo_gate( int algo, algo_gate_t *gate )
 {
@@ -145,72 +155,77 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   switch (algo)
   {
-
-// Ignore warnings for not yet defined register fucntions
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wimplicit-function-declaration"
-
-     case ALGO_ARGON2:      register_argon2_algo     ( gate ); break;
-     case ALGO_AXIOM:       register_axiom_algo      ( gate ); break;
-     case ALGO_BASTION:     register_bastion_algo    ( gate ); break;
-     case ALGO_BLAKE:       register_blake_algo      ( gate ); break;
-     case ALGO_BLAKECOIN:   register_blakecoin_algo  ( gate ); break;
-//     case ALGO_BLAKE2B:     register_blake2b_algo    ( gate ); break;
-     case ALGO_BLAKE2S:     register_blake2s_algo    ( gate ); break;
-     case ALGO_C11:         register_c11_algo        ( gate ); break;
-     case ALGO_CRYPTOLIGHT: register_cryptolight_algo( gate ); break;
-     case ALGO_CRYPTONIGHT: register_cryptonight_algo( gate ); break;
-     case ALGO_DECRED:      register_decred_algo     ( gate ); break;
-     case ALGO_DEEP:        register_deep_algo       ( gate ); break;
-     case ALGO_DMD_GR:      register_dmd_gr_algo     ( gate ); break;
-     case ALGO_DROP:        register_drop_algo       ( gate ); break;
-     case ALGO_FRESH:       register_fresh_algo      ( gate ); break;
-     case ALGO_GROESTL:     register_groestl_algo    ( gate ); break;
-     case ALGO_HEAVY:       register_heavy_algo      ( gate ); break;
-     case ALGO_HMQ1725:     register_hmq1725_algo    ( gate ); break;
-     case ALGO_HODL:        register_hodl_algo       ( gate ); break;
-     case ALGO_KECCAK:      register_keccak_algo     ( gate ); break;
-     case ALGO_LBRY:        register_lbry_algo       ( gate ); break;
-     case ALGO_LUFFA:       register_luffa_algo      ( gate ); break;
-     case ALGO_LYRA2RE:     register_lyra2re_algo    ( gate ); break;
-     case ALGO_LYRA2REV2:   register_lyra2rev2_algo  ( gate ); break;
-     case ALGO_LYRA2Z:      register_zcoin_algo      ( gate ); break;
-     case ALGO_LYRA2Z330:   register_lyra2z330_algo  ( gate ); break;
-     case ALGO_M7M:         register_m7m_algo        ( gate ); break;
-     case ALGO_MYR_GR:      register_myriad_algo     ( gate ); break;
-     case ALGO_NEOSCRYPT:   register_neoscrypt_algo  ( gate ); break;
-     case ALGO_NIST5:       register_nist5_algo      ( gate ); break;
-     case ALGO_PENTABLAKE:  register_pentablake_algo ( gate ); break;
-     case ALGO_PLUCK:       register_pluck_algo      ( gate ); break;
-     case ALGO_QUARK:       register_quark_algo      ( gate ); break;
-     case ALGO_QUBIT:       register_qubit_algo      ( gate ); break;
-     case ALGO_SCRYPT:      register_scrypt_algo     ( gate ); break;
-     case ALGO_SCRYPTJANE:  register_scryptjane_algo ( gate ); break;
-     case ALGO_SHA256D:     register_sha256d_algo    ( gate ); break;
-     case ALGO_SHA256T:     register_sha256t_algo    ( gate ); break;
-     case ALGO_SHAVITE3:    register_shavite_algo    ( gate ); break;
-     case ALGO_SKEIN:       register_skein_algo      ( gate ); break;
-     case ALGO_SKEIN2:      register_skein2_algo     ( gate ); break;
-     case ALGO_S3:          register_s3_algo         ( gate ); break;
-     case ALGO_TIMETRAVEL:  register_timetravel_algo ( gate ); break;
-     case ALGO_VANILLA:     register_vanilla_algo    ( gate ); break;
-     case ALGO_VELTOR:      register_veltor_algo     ( gate ); break;
-     case ALGO_WHIRLPOOL:   register_whirlpool_algo  ( gate ); break;
-     case ALGO_WHIRLPOOLX:  register_whirlpoolx_algo ( gate ); break;
-     case ALGO_X11:         register_x11_algo        ( gate ); break;
-     case ALGO_X11EVO:      register_x11evo_algo     ( gate ); break;
-     case ALGO_X11GOST:     register_sib_algo        ( gate ); break;
-     case ALGO_X13:         register_x13_algo        ( gate ); break;
-     case ALGO_X14:         register_x14_algo        ( gate ); break;
-     case ALGO_X15:         register_x15_algo        ( gate ); break;
-     case ALGO_X17:         register_x17_algo        ( gate ); break;
-     case ALGO_XEVAN:       register_xevan_algo      ( gate ); break;
-     case ALGO_YESCRYPT:    register_yescrypt_algo   ( gate ); break;
-     case ALGO_ZR5:         register_zr5_algo        ( gate ); break;
-
-// restore warnings
-#pragma GCC diagnostic pop
-
+     case ALGO_ALLIUM:       register_allium_algo      ( gate ); break;
+     case ALGO_ANIME:        register_anime_algo       ( gate ); break;
+     case ALGO_ARGON2:       register_argon2_algo      ( gate ); break;
+     case ALGO_AXIOM:        register_axiom_algo       ( gate ); break;
+     case ALGO_BASTION:      register_bastion_algo     ( gate ); break;
+     case ALGO_BLAKE:        register_blake_algo       ( gate ); break;
+     case ALGO_BLAKECOIN:    register_blakecoin_algo   ( gate ); break;
+//     case ALGO_BLAKE2B:      register_blake2b_algo    ( gate ); break;
+     case ALGO_BLAKE2S:      register_blake2s_algo     ( gate ); break;
+     case ALGO_C11:          register_c11_algo         ( gate ); break;
+     case ALGO_CRYPTOLIGHT:  register_cryptolight_algo ( gate ); break;
+     case ALGO_CRYPTONIGHT:  register_cryptonight_algo ( gate ); break;
+     case ALGO_DECRED:       register_decred_algo      ( gate ); break;
+     case ALGO_DEEP:         register_deep_algo        ( gate ); break;
+     case ALGO_DMD_GR:       register_dmd_gr_algo      ( gate ); break;
+     case ALGO_DROP:         register_drop_algo        ( gate ); break;
+     case ALGO_FRESH:        register_fresh_algo       ( gate ); break;
+     case ALGO_GROESTL:      register_groestl_algo     ( gate ); break;
+     case ALGO_HEAVY:        register_heavy_algo       ( gate ); break;
+     case ALGO_HMQ1725:      register_hmq1725_algo     ( gate ); break;
+     case ALGO_HODL:         register_hodl_algo        ( gate ); break;
+     case ALGO_JHA:          register_jha_algo         ( gate ); break;
+     case ALGO_KECCAK:       register_keccak_algo      ( gate ); break;
+     case ALGO_KECCAKC:      register_keccakc_algo     ( gate ); break;
+     case ALGO_LBRY:         register_lbry_algo        ( gate ); break;
+     case ALGO_LUFFA:        register_luffa_algo       ( gate ); break;
+     case ALGO_LYRA2H:       register_lyra2h_algo      ( gate ); break;
+     case ALGO_LYRA2RE:      register_lyra2re_algo     ( gate ); break;
+     case ALGO_LYRA2REV2:    register_lyra2rev2_algo   ( gate ); break;
+     case ALGO_LYRA2Z:       register_lyra2z_algo      ( gate ); break;
+     case ALGO_LYRA2Z330:    register_lyra2z330_algo   ( gate ); break;
+     case ALGO_M7M:          register_m7m_algo         ( gate ); break;
+     case ALGO_MYR_GR:       register_myriad_algo      ( gate ); break;
+     case ALGO_NEOSCRYPT:    register_neoscrypt_algo   ( gate ); break;
+     case ALGO_NIST5:        register_nist5_algo       ( gate ); break;
+     case ALGO_PENTABLAKE:   register_pentablake_algo  ( gate ); break;
+     case ALGO_PHI1612:      register_phi1612_algo     ( gate ); break;
+     case ALGO_PLUCK:        register_pluck_algo       ( gate ); break;
+     case ALGO_POLYTIMOS:    register_polytimos_algo   ( gate ); break;
+     case ALGO_QUARK:        register_quark_algo       ( gate ); break;
+     case ALGO_QUBIT:        register_qubit_algo       ( gate ); break;
+     case ALGO_SCRYPT:       register_scrypt_algo      ( gate ); break;
+     case ALGO_SCRYPTJANE:   register_scryptjane_algo  ( gate ); break;
+     case ALGO_SHA256D:      register_sha256d_algo     ( gate ); break;
+     case ALGO_SHA256T:      register_sha256t_algo     ( gate ); break;
+     case ALGO_SHAVITE3:     register_shavite_algo     ( gate ); break;
+     case ALGO_SKEIN:        register_skein_algo       ( gate ); break;
+     case ALGO_SKEIN2:       register_skein2_algo      ( gate ); break;
+     case ALGO_SKUNK:        register_skunk_algo       ( gate ); break;
+     case ALGO_TIMETRAVEL:   register_timetravel_algo  ( gate ); break;
+     case ALGO_TIMETRAVEL10: register_timetravel10_algo( gate ); break;
+     case ALGO_TRIBUS:       register_tribus_algo      ( gate ); break;
+     case ALGO_VANILLA:      register_vanilla_algo     ( gate ); break;
+     case ALGO_VELTOR:       register_veltor_algo      ( gate ); break;
+     case ALGO_WHIRLPOOL:    register_whirlpool_algo   ( gate ); break;
+     case ALGO_WHIRLPOOLX:   register_whirlpoolx_algo  ( gate ); break;
+     case ALGO_X11:          register_x11_algo         ( gate ); break;
+     case ALGO_X11EVO:       register_x11evo_algo      ( gate ); break;
+     case ALGO_X11GOST:      register_x11gost_algo     ( gate ); break;
+     case ALGO_X12:          register_x12_algo         ( gate ); break;
+     case ALGO_X13:          register_x13_algo         ( gate ); break;
+     case ALGO_X13SM3:       register_x13sm3_algo      ( gate ); break;
+     case ALGO_X14:          register_x14_algo         ( gate ); break;
+     case ALGO_X15:          register_x15_algo         ( gate ); break;
+     case ALGO_X16R:         register_x16r_algo        ( gate ); break;
+     case ALGO_X17:          register_x17_algo         ( gate ); break;
+     case ALGO_XEVAN:        register_xevan_algo       ( gate ); break;
+     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
+     case ALGO_YESCRYPTR8:   register_yescryptr8_algo  ( gate ); break;
+     case ALGO_YESCRYPTR16:  register_yescryptr16_algo ( gate ); break;
+     case ALGO_ZR5:          register_zr5_algo         ( gate ); break;
    default:
        applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
        return false;
@@ -225,6 +240,9 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
  return true;
 }

+// restore warnings
+#pragma GCC diagnostic pop
+
 // override std defaults with jr2 defaults
 bool register_json_rpc2( algo_gate_t *gate )
 {
@@ -253,42 +271,48 @@ void exec_hash_function( int algo, void *output, const void *pdata )
  gate.hash( output, pdata, 0 );  
 }

-// an algo can have multiple aliases but the aliases must be unique
-
 #define PROPER (1)
 #define ALIAS  (0)

 // The only difference between the alias and the proper algo name is the
-// proper name is the one that is defined in ALGO_NAMES, there may be
+// proper name is the one that is defined in ALGO_NAMES. There may be
 // multiple aliases that map to the same proper name.
 // New aliases can be added anywhere in the array as long as NULL is last.
 // Alphabetic order of alias is recommended.
 const char* const algo_alias_map[][2] =
 {
 //   alias                proper
-  { "blake256r8",        "blakecoin"   },
-  { "blake256r8vnl",     "vanilla"     },
-  { "sia",               "blake2b"     },
-  { "blake256r14",       "blake"       },
-  { "blake256r14dcr",    "decred"      },
-  { "cryptonote",        "cryptonight" },
-  { "cryptonight-light", "cryptolight" },
-  { "diamond",           "dmd-gr"      },
-  { "droplp",            "drop"        },
-  { "espers",            "hmq1725"     },
-  { "flax",              "c11"         },
-  { "jane",              "scryptjane"  }, 
-  { "lyra2",             "lyra2re"     },
-  { "lyra2v2",           "lyra2rev2"   },
-  { "lyra2zoin",         "lyra2z330"   },
-  { "myriad",            "myr-gr"      },
-  { "neo",               "neoscrypt"   },
-  { "sib",               "x11gost"     },
-  { "yes",               "yescrypt"    },
-  { "ziftr",             "zr5"         },
-  { "zcoin",             "lyra2z"      },
-  { "zoin",              "lyra2z330"   },
-  { NULL,                NULL          }   
+  { "bitcore",           "timetravel10" },
+  { "bitzeny",           "yescryptr8"   },
+  { "blake256r8",        "blakecoin"    },
+  { "blake256r8vnl",     "vanilla"      },
+  { "blake256r14",       "blake"        },
+  { "blake256r14dcr",    "decred"       },
+  { "cryptonote",        "cryptonight"  },
+  { "cryptonight-light", "cryptolight"  },
+  { "diamond",           "dmd-gr"       },
+  { "droplp",            "drop"         },
+  { "espers",            "hmq1725"      },
+  { "flax",              "c11"          },
+  { "hsr",               "x13sm3"       },
+  { "jackpot",           "jha"          },
+  { "jane",              "scryptjane"   }, 
+  { "lyra2",             "lyra2re"      },
+  { "lyra2v2",           "lyra2rev2"    },
+  { "lyra2zoin",         "lyra2z330"    },
+  { "myrgr",             "myr-gr"       },
+  { "myriad",            "myr-gr"       },
+  { "neo",               "neoscrypt"    },
+  { "phi",               "phi1612"      },
+//  { "sia",               "blake2b"      },
+  { "sib",               "x11gost"      },
+  { "timetravel8",       "timetravel"   },
+  { "ziftr",             "zr5"          },
+  { "yenten",            "yescryptr16"  },
+  { "yescryptr8k",       "yescrypt"     },
+  { "zcoin",             "lyra2z"       },
+  { "zoin",              "lyra2z330"    },
+  { NULL,                NULL           }   
 };

 // if arg is a valid alias for a known algo it is updated with the proper name.
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -1,7 +1,6 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <stdint.h>
-
 #include "miner.h"

 /////////////////////////////
@@ -85,12 +84,13 @@

 typedef  uint32_t set_t;

-#define EMPTY_SET 0
-#define SSE2_OPT  1
-#define AES_OPT   2
-#define AVX_OPT   4
-#define AVX2_OPT  8
-#define SHA_OPT  16
+#define EMPTY_SET       0
+#define SSE2_OPT        1
+#define AES_OPT         2  
+#define AVX_OPT         4
+#define AVX2_OPT        8
+#define SHA_OPT      0x10
+//#define FOUR_WAY_OPT 0x20

 // return set containing all elements from sets a & b
 inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -156,7 +156,7 @@ bool return_false();
 void *return_null();
 void algo_not_tested();
 void algo_not_implemented();
-
+void four_way_not_tested();

 // Warning: algo_gate.nonce_index should only be used in targetted code
 // due to different behaviours by different targets. The JR2 index uses an
@@ -212,21 +212,24 @@ int64_t get_max64_0x3fffffLL();
 int64_t get_max64_0x1ffff();
 int64_t get_max64_0xffffLL();

-void std_set_target   ( struct work *work, double job_diff );
+void std_set_target(    struct work *work, double job_diff );
+void alt_set_target(    struct work* work, double job_diff );
 void scrypt_set_target( struct work *work, double job_diff );

-bool std_work_decode( const json_t *val, struct work *work );
+bool std_le_work_decode( const json_t *val, struct work *work );
+bool std_be_work_decode( const json_t *val, struct work *work );
 bool jr2_work_decode( const json_t *val, struct work *work );

-bool std_submit_getwork_result( CURL *curl, struct work *work );
+bool std_le_submit_getwork_result( CURL *curl, struct work *work );
+bool std_be_submit_getwork_result( CURL *curl, struct work *work );
 bool jr2_submit_getwork_result( CURL *curl, struct work *work );

 void std_le_build_stratum_request( char *req, struct work *work );
 void std_be_build_stratum_request( char *req, struct work *work );
 void jr2_build_stratum_request   ( char *req, struct work *work );

-// set_work_data_endian target, default is do_nothing;
-void swab_work_data( struct work *work );
+// Default is do_nothing (assumed LE)
+void set_work_data_big_endian( struct work *work );

 double std_calc_network_diff( struct work *work );

--- a/algo/argon2/argon2a.c
+++ b/algo/argon2/argon2a.c
@@ -1,5 +1,3 @@
-#include "miner.h"
-
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -0,0 +1,143 @@
+#include "blake-gate.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+#if defined (BLAKE_4WAY)
+
+static __thread blake256r14_4way_context blake_4w_ctx;   // per-thread midstate, set by scanhash_blake_4way
+// Finish 4 interleaved blake256 (14 round) hashes from the midstate; state gets 4 digests, 32 bytes apart.
+void blakehash_4way(void *state, const void *input)
+{
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256r14_4way_context ctx;
+     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );   // work on a copy so the midstate can be reused
+     blake256r14_4way( &ctx, input + (64<<2), 16 );   // skip the 64 bytes x 4 lanes already absorbed
+     blake256r14_4way_close( &ctx, vhash );
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+}
+
+// Scan 4 nonces per pass from work->data[19]; found nonces go to work->nonces, their count is returned.
+int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   if (opt_benchmark)
+      HTarget = 0x7f;
+
+   // Big endian header copied into all 4 lanes; its first 64 bytes become the shared midstate.
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   blake256r14_4way_init( &blake_4w_ctx );
+   blake256r14_4way( &blake_4w_ctx, vdata, 64 );
+
+   uint32_t *noncep = vdata + 76;   // 19*4
+   do {
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      pdata[19] = n;   // record progress in the work, as the 8-way and blake2s scanners do
+      blakehash_4way( hash, vdata );
+
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
+#if defined(BLAKE_8WAY)
+
+static __thread blake256r14_8way_context blake_8w_ctx;   // per-thread midstate, set by scanhash_blake_8way
+// Finish 8 interleaved blake256 (14 round) hashes from the midstate; state gets 8 digests, 32 bytes apart.
+void blakehash_8way( void *state, const void *input )
+{
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+     blake256r14_8way_context ctx;
+     memcpy( &ctx, &blake_8w_ctx, sizeof ctx );   // work on a copy so the midstate can be reused
+     blake256r14_8way( &ctx, input + (64<<3), 16 );   // skip the 64 bytes x 8 lanes already absorbed
+     blake256r14_8way_close( &ctx, vhash );
+     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
+                              state+128, state+160, state+192, state+224,
+                              vhash, 256 );
+}
+
+// Scan 8 nonces per pass from work->data[19]; found nonces go to work->nonces, their count is returned.
+int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   if (opt_benchmark)
+      HTarget = 0x7f;
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+
+   blake256r14_8way_init( &blake_8w_ctx );
+   blake256r14_8way( &blake_8w_ctx, vdata, 64 );
+
+   uint32_t *noncep = vdata + 152;   // 19*8
+   do {
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;
+
+      blakehash_8way( hash, vdata );
+
+      for ( int i = 0; i < 8; i++ )
+      if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      {
+          // blakehash_8way writes one 8-word digest per lane, so lane i starts at
+          // hash + 8*i; found nonces are packed from nonces[0], as in the 4-way scanner.
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -0,0 +1,26 @@
+#include "blake-gate.h"
+
+int64_t blake_get_max64 ()   // installed as gate->get_max64 by register_blake_algo; always 0x7ffff
+{
+  return 0x7ffffLL;
+}
+
+bool register_blake_algo( algo_gate_t* gate )   // fills the blake entries of the gate; always returns true
+{
+  gate->optimizations = AVX2_OPT;
+  gate->get_max64 = (void*)&blake_get_max64;
+//#if defined (__AVX2__) && defined (FOUR_WAY)       (8-way registration, currently disabled)
+//   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
+//  gate->scanhash  = (void*)&scanhash_blake_8way;
+//  gate->hash      = (void*)&blakehash_8way;
+#if defined(BLAKE_4WAY)
+  four_way_not_tested();   // logs a warning that 4way is untested for the selected algo
+  gate->scanhash  = (void*)&scanhash_blake_4way;
+  gate->hash      = (void*)&blakehash_4way;
+#else
+  gate->scanhash  = (void*)&scanhash_blake;   // non-4way implementation
+  gate->hash      = (void*)&blakehash;
+#endif
+  return true;
+}
+
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -0,0 +1,21 @@
+#ifndef __BLAKE_GATE_H__
+#define __BLAKE_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__)
+  #define BLAKE_4WAY   // the 4-way blake path is compiled only when AVX2 is enabled
+#endif
+
+#if defined (BLAKE_4WAY)
+void blakehash_4way(void *state, const void *input);
+int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+// Always declared: gate registration (defined in blake-gate.c) and the non-4way implementation.
+bool register_blake_algo( algo_gate_t* gate );
+void blakehash( void *state, const void *input );
+int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+#endif
--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -0,0 +1,143 @@
+/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
+/**
+ * BLAKE interface. BLAKE is a family of functions which differ by their
+ * output size; this implementation defines BLAKE for output sizes 224,
+ * 256, 384 and 512 bits. This implementation conforms to the "third
+ * round" specification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_blake.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef __BLAKE_HASH_4WAY__
+#define __BLAKE_HASH_4WAY__ 1
+
+#ifdef __AVX__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_blake256   256
+
+#define SPH_SIZE_blake512   512
+
+// With AVX only Blake-256 4 way is available.
+// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
+
+// Blake-256 4 way
+
+typedef struct {
+   __m128i buf[16] __attribute__ ((aligned (64)));
+   __m128i H[8];
+   __m128i S[4];    
+   size_t ptr;
+   sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
+} blake_4way_small_context;
+
+// Default 14 rounds
+typedef blake_4way_small_context blake256_4way_context;
+void blake256_4way_init(void *cc);
+void blake256_4way(void *cc, const void *data, size_t len);
+void blake256_4way_close(void *cc, void *dst);
+
+// 14 rounds, blake, decred
+typedef blake_4way_small_context blake256r14_4way_context;
+void blake256r14_4way_init(void *cc);
+void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_4way_small_context blake256r8_4way_context;
+void blake256r8_4way_init(void *cc);
+void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_close(void *cc, void *dst);
+
+#ifdef __AVX2__
+
+// Blake-256 8 way
+
+typedef struct {
+   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i H[8];
+   __m256i S[4];
+   size_t ptr;
+   sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
+} blake_8way_small_context;
+
+// Default 14 rounds
+typedef blake_8way_small_context blake256_8way_context;
+void blake256_8way_init(void *cc);
+void blake256_8way(void *cc, const void *data, size_t len);
+void blake256_8way_close(void *cc, void *dst);
+
+// 14 rounds, blake, decred
+typedef blake_8way_small_context blake256r14_8way_context;
+void blake256r14_8way_init(void *cc);
+void blake256r14_8way(void *cc, const void *data, size_t len);
+void blake256r14_8way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_8way_small_context blake256r8_8way_context;
+void blake256r8_8way_init(void *cc);
+void blake256r8_8way(void *cc, const void *data, size_t len);
+void blake256r8_8way_close(void *cc, void *dst);
+
+// Blake-512 4 way
+
+typedef struct {
+   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i H[8];
+   __m256i S[4];   
+   size_t ptr;
+   sph_u64 T0, T1;
+} blake_4way_big_context;
+
+typedef blake_4way_big_context blake512_4way_context;
+
+void blake512_4way_init(void *cc);
+void blake512_4way(void *cc, const void *data, size_t len);
+void blake512_4way_close(void *cc, void *dst);
+void blake512_4way_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/blake/blake.c
+++ b/algo/blake/blake.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"
 #include "sph_blake.h"

@@ -90,18 +89,3 @@ int scanhash_blake( int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 }

-// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
-int64_t blake_get_max64 ()
-{
-  return 0x7ffffLL;
-}
-
-bool register_blake_algo( algo_gate_t* gate )
-{
-  gate->scanhash  = (void*)&scanhash_blake;
-  gate->hash      = (void*)&blakehash;
-  gate->get_max64 = (void*)&blake_get_max64;
-  return true;
-}
-
-
--- a/algo/blake/blake2b.c
+++ b/algo/blake/blake2b.c
@@ -3,16 +3,13 @@
 * tpruvot@github 2015-2016
 */

-#include "miner.h"
 #include "algo-gate-api.h"
 #include <string.h>
 #include <stdint.h>
-
 #include "algo/blake/sph_blake2b.h"

-
-static __thread sph_blake2b_ctx s_midstate;
-static __thread sph_blake2b_ctx s_ctx;
+//static __thread sph_blake2b_ctx s_midstate;
+//static __thread sph_blake2b_ctx s_ctx;
 #define MIDLEN 76
 #define A 64

@@ -28,6 +25,7 @@ void blake2b_hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

+/*
 static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
 {
 	s_ctx.outlen = MIDLEN;
@@ -35,6 +33,7 @@ static void blake2b_hash_end(uint32_t *output, const uint32_t *input)
 	sph_blake2b_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
 	sph_blake2b_final(&s_ctx, (uint8_t*) output);
 }
+*/

 int scanhash_blake2b( int thr_id, struct work *work, uint32_t max_nonce,
                      uint64_t *hashes_done )
@@ -220,6 +219,8 @@ bool register_blake2b_algo( algo_gate_t* gate )
  gate->hash                  = (void*)&blake2b_hash;
  gate->calc_network_diff     = (void*)&blake2b_calc_network_diff;
  gate->build_stratum_request = (void*)&blake2b_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->build_extraheader     = (void*)&blake2b_build_extraheader;
  gate->get_new_work          = (void*)&blake2b_get_new_work;
  gate->get_max64             = (void*)&blake2b_get_max64;
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -0,0 +1,134 @@
+#include "blake2s-gate.h"
+#include "blake2s-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+
+#if defined(BLAKE2S_8WAY)
+
+static __thread blake2s_8way_state blake2s_8w_ctx;   // per-thread midstate, set by scanhash_blake2s_8way
+// Finish 8 interleaved blake2s hashes from the midstate; output receives 8 digests, 32 bytes apart.
+void blake2s_8way_hash( void *output, const void *input )
+{
+   uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+   blake2s_8way_state ctx;
+   memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );
+   // the scanner already absorbed 64 bytes per lane; only the trailing 16 bytes per lane are hashed here
+   blake2s_8way_update( &ctx, input + (64<<3), 16 );
+   blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
+   // split the interleaved result into one contiguous digest per lane
+   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
+                            output+128, output+160, output+192, output+224,
+                            vhash, 256 );
+}
+
+// Scan 8 nonces per pass from work->data[19]; found nonces go to work->nonces, their count is returned.
+int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+   uint32_t lanes[20*8] __attribute__ ((aligned (64)));
+   uint32_t digests[8*8] __attribute__ ((aligned (32)));
+   uint32_t _ALIGN(64) be_header[20];
+   uint32_t *header = work->data;
+   uint32_t *target = work->target;
+   uint32_t *out_nonces = work->nonces;
+   uint32_t *lane_nonce = lanes + 152;   // 19*8
+   const uint32_t start = header[19];
+   const uint32_t target_hi = target[7];
+   uint32_t nonce = start;
+   int hits = 0;
+
+   // Byte-swap the header, copy it into every lane and absorb the first 64 bytes once.
+   swab32_array( be_header, header, 20 );
+   mm256_interleave_8x32( lanes, be_header, be_header, be_header, be_header,
+                                 be_header, be_header, be_header, be_header, 640 );
+   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_8way_update( &blake2s_8w_ctx, lanes, 64 );
+
+   do
+   {
+      // lane k hashes nonce + k
+      for ( int k = 0; k < 8; k++ )
+         be32enc( lane_nonce + k, nonce + k );
+      header[19] = nonce;
+
+      blake2s_8way_hash( digests, lanes );
+
+      for ( int k = 0; k < 8; k++ )
+      {
+         uint32_t *lane_hash = digests + ( k << 3 );
+         // cheap top-word test first, full comparison only when it passes
+         if ( lane_hash[7] > target_hi || !fulltest( lane_hash, target ) )
+            continue;
+         out_nonces[ hits++ ] = nonce + k;
+         work_set_target_ratio( work, lane_hash );
+      }
+      nonce += 8;
+
+   } while ( hits == 0 && nonce < max_nonce
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = nonce - start + 1;
+   return hits;
+}
+
+#elif defined(BLAKE2S_4WAY)
+
+static __thread blake2s_4way_state blake2s_4w_ctx;   // per-thread midstate, set by scanhash_blake2s_4way
+// Finish 4 interleaved blake2s hashes from the midstate; output receives 4 digests, 32 bytes apart.
+void blake2s_4way_hash( void *output, const void *input )
+{
+   uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+   blake2s_4way_state ctx;
+   memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );
+   // the scanner already absorbed 64 bytes per lane; only the trailing 16 bytes per lane are hashed here
+   blake2s_4way_update( &ctx, input + (64<<2), 16 );
+   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
+   // split the interleaved result into one contiguous digest per lane
+   mm_deinterleave_4x32( output, output+32, output+64, output+96, vhash, 256 );
+}
+
+// Scan 4 nonces per pass from work->data[19]; found nonces go to work->nonces, their count is returned.
+int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+   uint32_t lanes[20*4] __attribute__ ((aligned (64)));
+   uint32_t digests[8*4] __attribute__ ((aligned (32)));
+   uint32_t _ALIGN(64) be_header[20];
+   uint32_t *header = work->data;
+   uint32_t *target = work->target;
+   uint32_t *out_nonces = work->nonces;
+   uint32_t *lane_nonce = lanes + 76;   // 19*4
+   const uint32_t start = header[19];
+   const uint32_t target_hi = target[7];
+   uint32_t nonce = start;
+   int hits = 0;
+
+   // Byte-swap the header, copy it into every lane and absorb the first 64 bytes once.
+   swab32_array( be_header, header, 20 );
+   mm_interleave_4x32( lanes, be_header, be_header, be_header, be_header, 640 );
+   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_4way_update( &blake2s_4w_ctx, lanes, 64 );
+
+   do
+   {
+      for ( int k = 0; k < 4; k++ )
+         be32enc( lane_nonce + k, nonce + k );
+      header[19] = nonce;
+      blake2s_4way_hash( digests, lanes );
+
+      for ( int k = 0; k < 4; k++ )
+      {
+         uint32_t *lane_hash = digests + ( k << 3 );
+         if ( lane_hash[7] > target_hi || !fulltest( lane_hash, target ) )
+            continue;
+         out_nonces[ hits++ ] = nonce + k;
+         work_set_target_ratio( work, lane_hash );
+      }
+      nonce += 4;
+   } while ( hits == 0 && nonce < max_nonce
+             && !work_restart[thr_id].restart );
+   *hashes_done = nonce - start + 1;
+   return hits;
+}
+
+#endif
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -0,0 +1,27 @@
+#include "blake2s-gate.h"
+
+
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blake2s_get_max64 ()
+{  // Installed as gate->get_max64 by register_blake2s_algo; takes no input.
+   return 0x7ffffLL;   // fixed value (524287)
+}
+
+bool register_blake2s_algo( algo_gate_t* gate )
+{  // Install the widest blake2s scanhash/hash pair compiled in (8-way, 4-way or scalar); always returns true.
+#if defined(BLAKE2S_8WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_8way;
+  gate->hash      = (void*)&blake2s_8way_hash;
+#elif defined(BLAKE2S_4WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_4way;
+  gate->hash      = (void*)&blake2s_4way_hash;
+#else
+  gate->scanhash  = (void*)&scanhash_blake2s;
+  gate->hash      = (void*)&blake2s_hash;
+#endif
+  gate->get_max64 = (void*)&blake2s_get_max64;
+  gate->optimizations = AVX_OPT | AVX2_OPT;
+  return true;
+}   // no ';' here: a stray semicolon is an empty file-scope declaration
+
+
--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -0,0 +1,35 @@
+#ifndef __BLAKE2S_GATE_H__
+#define __BLAKE2S_GATE_H__ 1
+
+#include <stdint.h>
+#include "algo-gate-api.h"
+
+#if defined(__AVX__)
+  #define BLAKE2S_4WAY
+#endif
+#if defined(__AVX2__)
+  #define BLAKE2S_8WAY
+#endif
+
+bool register_blake2s_algo( algo_gate_t* gate );
+
+#if defined(BLAKE2S_8WAY)
+
+void blake2s_8way_hash( void *state, const void *input );
+int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#elif defined (BLAKE2S_4WAY)
+
+void blake2s_4way_hash( void *state, const void *input );
+int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#else
+
+void blake2s_hash( void *state, const void *input );
+int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
+
+#endif
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -0,0 +1,362 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include "blake2s-hash-4way.h"
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(__AVX__)
+
+static const uint32_t blake2s_IV[8] =   // initial chaining words; also loaded into v[8..15] by the compress functions
+{
+	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static const uint8_t blake2s_sigma[10][16] =   // per-round message word order: row = round, entry = index into m[]
+{
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+};
+
+// define a constant for initial param.
+
+int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
+{  // Reset S for unkeyed hashing (fanout 1, depth 1) with an outlen-byte digest in all four lanes; returns 0.
+   blake2s_nway_param P[1];
+
+   P->digest_length = outlen;
+   P->key_length    = 0;
+   P->fanout        = 1;
+   P->depth         = 1;
+   P->leaf_length   = 0;
+   memset( P->node_offset, 0, sizeof( P->node_offset ) );   // byte array: avoid a type-punned 64-bit store
+   P->node_depth    = 0;
+   P->inner_length  = 0;
+   memset( P->salt,     0, sizeof( P->salt ) );
+   memset( P->personal, 0, sizeof( P->personal ) );
+
+   memset( S, 0, sizeof( blake2s_4way_state ) );
+   for( int i = 0; i < 8; ++i )
+      S->h[i] = _mm_set1_epi32( blake2s_IV[i] );   // same IV word in every lane
+
+   uint32_t p[8];   // the 32-byte packed parameter block as eight words
+   memcpy( p, P, sizeof p );   // copy instead of casting: P is packed and only byte-aligned
+   /* IV XOR ParamBlock */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm_xor_si128( S->h[i], _mm_set1_epi32( p[i] ) );
+   return 0;
+}
+
+int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
+{  // Run the ten BLAKE2s rounds over one 16-vector block, four lanes at once, and fold the result into S->h; returns 0.
+   __m128i m[16];   // message words
+   __m128i v[16];   // working state
+
+   memcpy_128( m, block, 16 );
+   memcpy_128( v, S->h, 8 );   // v[0..7] start as the chaining value
+
+   v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
+   v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
+   v[10] = _mm_set1_epi32( blake2s_IV[2] );
+   v[11] = _mm_set1_epi32( blake2s_IV[3] );
+   v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),   // counter low word
+                          _mm_set1_epi32( blake2s_IV[4] ) );
+   v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),   // counter high word
+                          _mm_set1_epi32( blake2s_IV[5] ) );
+   v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),   // finalization flag (set in blake2s_4way_final)
+                          _mm_set1_epi32( blake2s_IV[6] ) );
+   v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),   // last-node flag
+                          _mm_set1_epi32( blake2s_IV[7] ) );
+
+#define G4W(r,i,a,b,c,d) \
+do { \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
+   d = mm_rotr_32( _mm_xor_si128( d, a ), 16 ); \
+   c = _mm_add_epi32( c, d ); \
+   b = mm_rotr_32( _mm_xor_si128( b, c ), 12 ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
+   d = mm_rotr_32( _mm_xor_si128( d, a ),  8 ); \
+   c = _mm_add_epi32( c, d ); \
+   b = mm_rotr_32( _mm_xor_si128( b, c ),  7 ); \
+} while(0)
+
+#define ROUND4W(r)  \
+do { \
+   G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
+   G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
+   G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
+   G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
+   G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
+   G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
+   G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
+   G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+
+   ROUND4W( 0 );   // the argument selects the blake2s_sigma row
+   ROUND4W( 1 );
+   ROUND4W( 2 );
+   ROUND4W( 3 );
+   ROUND4W( 4 );
+   ROUND4W( 5 );
+   ROUND4W( 6 );
+   ROUND4W( 7 );
+   ROUND4W( 8 );
+   ROUND4W( 9 );
+
+   for( size_t i = 0; i < 8; ++i )   // feed-forward: h ^= low half ^ high half of v
+      S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );
+
+#undef G4W
+#undef ROUND4W
+   return 0;
+}
+
+int blake2s_4way_update( blake2s_4way_state *S, const void *in,
+                         uint64_t inlen )
+{  // Absorb inlen bytes per lane (a multiple of 4) from the 4-lane interleaved buffer in; returns 0.
+  __m128i *input = (__m128i*)in;
+  __m128i *buf = (__m128i*)S->buf;
+  const int bsize = BLAKE2S_BLOCKBYTES;
+
+   while( inlen > 0 )
+   {
+      size_t left = S->buflen;      // bytes per lane already buffered
+      size_t fill = bsize - left;   // bytes per lane still needed to complete the block
+      if( inlen >= fill )           // a full block is compressed right away (used for the midstate)
+      {
+         memcpy_128( buf + (left>>2), input, fill >> 2 );   // one vector per 4 lane-bytes
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry into the high counter word
+         blake2s_4way_compress( S, buf );
+         S->buflen = 0;
+         input += ( fill >> 2 );   // advance by what was consumed, not a whole block
+         inlen -= fill;            // cannot underflow: inlen >= fill
+      }
+      else
+      {
+          memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
+          S->buflen += (size_t) inlen;
+          input += ( inlen>>2 );
+          inlen = 0;
+      }
+   }
+   return 0;
+}
+
+int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )
+{  // Pad and compress the last block, then store the eight state vectors to out; outlen is not used. Returns 0.
+   __m128i *buf = (__m128i*)S->buf;
+
+   S->t[0] += S->buflen;   // count only the bytes actually buffered
+   S->t[1] += ( S->t[0] < S->buflen );   // carry into the high counter word
+   if ( S->last_node ) 
+      S->f[1] = ~0U;
+   S->f[0] = ~0U;   // mark this compression as the final one
+
+   memset_zero_128( buf + ( S->buflen>>2 ),   // zero-fill the unused tail of the block
+                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );      
+   blake2s_4way_compress( S, buf );
+
+   for ( int i = 0; i < 8; ++i )   // output stays interleaved, one vector per digest word
+      casti_m128i( out, i ) = S->h[ i ];
+   return 0;
+}
+
+#if defined(__AVX2__)
+
+int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )
+{  // Run the ten BLAKE2s rounds over one 16-vector block, eight lanes at once, and fold the result into S->h; returns 0.
+   __m256i m[16];   // message words
+   __m256i v[16];   // working state
+
+   memcpy_256( m, block, 16 );
+   memcpy_256( v, S->h, 8 );   // v[0..7] start as the chaining value
+
+   v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
+   v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
+   v[10] = _mm256_set1_epi32( blake2s_IV[2] );
+   v[11] = _mm256_set1_epi32( blake2s_IV[3] );
+   v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),   // counter low word
+                             _mm256_set1_epi32( blake2s_IV[4] ) );
+   v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),   // counter high word
+                             _mm256_set1_epi32( blake2s_IV[5] ) );
+   v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),   // finalization flag (set in blake2s_8way_final)
+                             _mm256_set1_epi32( blake2s_IV[6] ) );
+   v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),   // last-node flag
+                             _mm256_set1_epi32( blake2s_IV[7] ) );
+
+#define G8W(r,i,a,b,c,d) \
+do { \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                          m[ blake2s_sigma[r][2*i+0] ] ); \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                         m[ blake2s_sigma[r][2*i+1] ] ); \
+   d = mm256_rotr_32( _mm256_xor_si256( d, a ),  8 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_rotr_32( _mm256_xor_si256( b, c ),  7 ); \
+} while(0)
+
+#define ROUND8W(r)  \
+do { \
+   G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
+   G8W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
+   G8W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
+   G8W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
+   G8W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
+   G8W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
+   G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
+   G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+
+   ROUND8W( 0 );   // the argument selects the blake2s_sigma row
+   ROUND8W( 1 );
+   ROUND8W( 2 );
+   ROUND8W( 3 );
+   ROUND8W( 4 );
+   ROUND8W( 5 );
+   ROUND8W( 6 );
+   ROUND8W( 7 );
+   ROUND8W( 8 );
+   ROUND8W( 9 );
+
+   for( size_t i = 0; i < 8; ++i )   // feed-forward: h ^= low half ^ high half of v
+      S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );
+
+#undef G8W
+#undef ROUND8W
+   return 0;
+}
+
+int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
+{  // Reset S for unkeyed hashing (fanout 1, depth 1) with an outlen-byte digest in all eight lanes; returns 0.
+   blake2s_nway_param P[1];
+
+   P->digest_length = outlen;
+   P->key_length    = 0;
+   P->fanout        = 1;
+   P->depth         = 1;
+   P->leaf_length   = 0;
+   memset( P->node_offset, 0, sizeof( P->node_offset ) );   // byte array: avoid a type-punned 64-bit store
+   P->node_depth    = 0;
+   P->inner_length  = 0;
+   memset( P->salt,     0, sizeof( P->salt ) );
+   memset( P->personal, 0, sizeof( P->personal ) );
+
+   memset( S, 0, sizeof( blake2s_8way_state ) );
+   for( int i = 0; i < 8; ++i )
+      S->h[i] = _mm256_set1_epi32( blake2s_IV[i] );   // same IV word in every lane
+
+   uint32_t p[8];   // the 32-byte packed parameter block as eight words
+   memcpy( p, P, sizeof p );   // copy instead of casting: P is packed and only byte-aligned
+   /* IV XOR ParamBlock */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm256_xor_si256( S->h[i], _mm256_set1_epi32( p[i] ) );
+   return 0;
+}
+
+int blake2s_8way_update( blake2s_8way_state *S, const void *in,
+                         uint64_t inlen )
+{  // Absorb inlen bytes per lane (a multiple of 4) from the 8-lane interleaved buffer in; returns 0.
+  __m256i *input = (__m256i*)in;
+  __m256i *buf = (__m256i*)S->buf;
+  const int bsize = BLAKE2S_BLOCKBYTES;
+
+   while( inlen > 0 )
+   {
+      size_t left = S->buflen;      // bytes per lane already buffered
+      size_t fill = bsize - left;   // bytes per lane still needed to complete the block
+      if( inlen >= fill )           // a full block is compressed right away (used for the midstate)
+      {
+         memcpy_256( buf + (left>>2), input, fill >> 2 );   // one vector per 4 lane-bytes
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry into the high counter word
+         blake2s_8way_compress( S, buf );
+         S->buflen = 0;
+         input += ( fill >> 2 );   // advance by what was consumed, not a whole block
+         inlen -= fill;            // cannot underflow: inlen >= fill
+      }
+      else
+      {
+          memcpy_256( buf + ( left>>2 ), input, inlen>>2 );
+          S->buflen += (size_t) inlen;
+          input += ( inlen>>2 );
+          inlen = 0;
+      }
+   }
+   return 0;
+}
+
+int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )
+{  // Pad and compress the last block, then store the eight state vectors to out; outlen is not used. Returns 0.
+   __m256i *buf = (__m256i*)S->buf;
+
+   S->t[0] += S->buflen;   // count only the bytes actually buffered
+   S->t[1] += ( S->t[0] < S->buflen );   // carry into the high counter word
+   if ( S->last_node )
+      S->f[1] = ~0U;
+   S->f[0] = ~0U;   // mark this compression as the final one
+
+   memset_zero_256( buf + ( S->buflen>>2 ),   // zero-fill the unused tail of the block
+                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );
+   blake2s_8way_compress( S, buf );
+
+   for ( int i = 0; i < 8; ++i )   // output stays interleaved, one vector per digest word
+      casti_m256i( out, i ) = S->h[ i ];
+   return 0;
+}
+
+
+#endif // __AVX2__
+
+#if 0
+int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+{
+	blake2s_state S[1];
+
+	/* Verify parameters */
+	if ( NULL == in ) return -1;
+
+	if ( NULL == out ) return -1;
+
+	if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */
+
+	if( keylen > 0 )
+	{
+		if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+	}
+	else
+	{
+		if( blake2s_init( S, outlen ) < 0 ) return -1;
+	}
+
+	blake2s_update( S, ( uint8_t * )in, inlen );
+	blake2s_final( S, out, outlen );
+	return 0;
+}
+#endif
+
+#endif // __AVX__
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -0,0 +1,112 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+//#pragma once
+#ifndef __BLAKE2S_HASH_4WAY_H__
+#define __BLAKE2S_HASH_4WAY_H__ 1
+
+#if defined(__AVX__)
+
+#include "avxdefs.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+enum blake2s_constant   // sizes in bytes, per lane
+{
+   BLAKE2S_BLOCKBYTES = 64,   // compression block
+   BLAKE2S_OUTBYTES   = 32,   // digest
+   BLAKE2S_KEYBYTES   = 32,   // not referenced by the n-way code in this file
+   BLAKE2S_SALTBYTES  = 8,    // size of blake2s_nway_param.salt
+   BLAKE2S_PERSONALBYTES = 8  // size of blake2s_nway_param.personal
+};
+
+#pragma pack(push, 1)
+typedef struct __blake2s_nway_param   // 32-byte parameter block; the init functions XOR it into the IV as eight 32-bit words
+{
+   uint8_t  digest_length; // running size: 1
+   uint8_t  key_length;    // running size: 2
+   uint8_t  fanout;        // running size: 3
+   uint8_t  depth;         // running size: 4
+   uint32_t leaf_length;   // running size: 8
+   uint8_t  node_offset[6];// running size: 14
+   uint8_t  node_depth;    // running size: 15
+   uint8_t  inner_length;  // running size: 16
+   // uint8_t  reserved[0];
+   uint8_t  salt[BLAKE2S_SALTBYTES]; // running size: 24
+   uint8_t  personal[BLAKE2S_PERSONALBYTES];  // running size: 32
+} blake2s_nway_param;
+#pragma pack(pop)
+
+ALIGN( 64 ) typedef struct __blake2s_4way_state   // hashing state for four interleaved lanes
+{
+   __m128i h[8];                           // chaining value, one vector per word
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 4 ]; // one pending block per lane, accessed as __m128i
+   uint32_t t[2];                          // byte counter (low, high), shared by all lanes
+   uint32_t f[2];                          // f[0] final-block flag, f[1] last-node flag
+   size_t   buflen;                        // bytes per lane currently held in buf
+   uint8_t  last_node;                     // non-zero makes final() set f[1]
+} blake2s_4way_state ;
+
+int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
+int blake2s_4way_update( blake2s_4way_state *S, const void *in,
+                         uint64_t inlen );
+int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
+
+#if defined(__AVX2__)
+
+ALIGN( 64 ) typedef struct __blake2s_8way_state   // hashing state for eight interleaved lanes
+{
+   __m256i h[8];                           // chaining value, one vector per word
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 8 ]; // one pending block per lane, accessed as __m256i
+   uint32_t t[2];                          // byte counter (low, high), shared by all lanes
+   uint32_t f[2];                          // f[0] final-block flag, f[1] last-node flag
+   size_t   buflen;                        // bytes per lane currently held in buf
+   uint8_t  last_node;                     // non-zero makes final() set f[1]
+} blake2s_8way_state ;
+
+int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
+int blake2s_8way_update( blake2s_8way_state *S, const void *in,
+                         uint64_t inlen );
+int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
+
+#endif
+
+#if 0
+	// Simple API
+//	int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+
+	// Direct Hash Mining Helpers
+	#define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */
+	#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // __AVX__
+
+#endif
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -1,27 +1,29 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "blake2s-gate.h"

 #include <string.h>
 #include <stdint.h>

-#include "crypto/blake2s.h"
+#include "sph-blake2s.h"

-static __thread blake2s_state s_midstate;
-static __thread blake2s_state s_ctx;
+static __thread blake2s_state blake2s_ctx;
+//static __thread blake2s_state s_ctx;
 #define MIDLEN 76

-void blake2s_hash(void *output, const void *input)
+void blake2s_hash( void *output, const void *input )
 {
-	unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
-	blake2s_state blake2_ctx __attribute__ ((aligned (64)));
-
-	blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
-	blake2s_update(&blake2_ctx, input, 80);
-	blake2s_final(&blake2_ctx, hash, BLAKE2S_OUTBYTES);
+   unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
+   blake2s_state ctx __attribute__ ((aligned (64)));
+  
+   memcpy( &ctx, &blake2s_ctx, sizeof ctx );
+   blake2s_update( &ctx, input+64, 16 );
+ 
+//	blake2s_init(&ctx, BLAKE2S_OUTBYTES);
+//	blake2s_update(&ctx, input, 80);
+	blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );

 	memcpy(output, hash, 32);
 }
-
+/*
 static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 {
 	s_ctx.buflen = MIDLEN;
@@ -29,7 +31,7 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
 	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
 }
-
+*/
 int scanhash_blake2s(int thr_id, struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done)
 {
@@ -47,13 +49,12 @@ int scanhash_blake2s(int thr_id, struct work *work,
        swab32_array( endiandata, pdata, 20 );

 	// midstate
-	blake2s_init(&s_midstate, BLAKE2S_OUTBYTES);
-	blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
-	memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
+	blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
+	blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );

 	do {
 		be32enc(&endiandata[19], n);
-		blake2s_hash_end(hash64, endiandata);
+		blake2s_hash( hash64, endiandata );
 		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
 			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
@@ -68,7 +69,7 @@ int scanhash_blake2s(int thr_id, struct work *work,

 	return 0;
 }
-
+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blake2s_get_max64 ()
 {
@@ -82,4 +83,4 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->get_max64 = (void*)&blake2s_get_max64;
  return true;
 };
-
+*/
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -0,0 +1,139 @@
+#include "blakecoin-gate.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+#if defined (BLAKECOIN_4WAY)
+
+static __thread blake256r8_4way_context blakecoin_4w_ctx;   // per-thread midstate: each scanhash call re-initialises it, so it must not be shared
+
+void blakecoin_4way_hash(void *state, const void *input)
+{  // Finish four hashes from the midstate in blakecoin_4w_ctx: absorb 16 more bytes per lane of input and write the digests to state.
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));   // 8 words x 4 lanes, still interleaved
+     blake256r8_4way_context ctx;
+
+     memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );   // work on a copy so the midstate can be reused
+     blake256r8_4way( &ctx, input + (64<<2), 16 );   // skip the 64 bytes per lane (x4 lanes) already absorbed
+     blake256r8_4way_close( &ctx, vhash );
+
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );   // presumably one 32-byte digest per lane; confirm helper semantics
+}
+
+int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{  // Scan nonces from work->data[19] toward max_nonce, four per pass; returns how many nonces were stored in work->nonces.
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   if ( opt_benchmark )
+      HTarget = 0x7f;   // fixed target used in benchmark mode
+
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );   // same 20-word header in all four lanes
+   blake256r8_4way_init( &blakecoin_4w_ctx );
+   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );   // midstate: first 64 bytes per lane, absorbed once
+
+   uint32_t *noncep = vdata + 76;   // 19*4
+   do {
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      pdata[19] = n;
+      blakecoin_4way_hash( hash, vdata );
+
+      for ( int i = 0; i < 4; i++ )   // lane i's digest is hash[8*i .. 8*i+7]
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce;   // n is already past the last batch, so no +1
+   return num_found;
+}
+
+#endif
+
+#if defined(BLAKECOIN_8WAY)
+
+static __thread blake256r8_8way_context blakecoin_8w_ctx;   // per-thread midstate: each scanhash call re-initialises it, so it must not be shared
+
+void blakecoin_8way_hash( void *state, const void *input )
+{  // Finish eight hashes from the midstate in blakecoin_8w_ctx: absorb 16 more bytes per lane of input and write the digests to state.
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));   // 8 words x 8 lanes, still interleaved
+     blake256r8_8way_context ctx;
+
+     memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );   // work on a copy so the midstate can be reused
+     blake256r8_8way( &ctx, input + (64<<3), 16 );   // skip the 64 bytes per lane (x8 lanes) already absorbed
+     blake256r8_8way_close( &ctx, vhash );
+
+     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,   // presumably one 32-byte digest per lane; confirm helper semantics
+                              state+128, state+160, state+192, state+224,
+                              vhash, 256 );
+}
+
+int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{  // Scan nonces from work->data[19] toward max_nonce, eight per pass; returns how many nonces were stored in work->nonces.
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   uint32_t *noncep = vdata + 152;   // 19*8
+   int num_found = 0;
+   if ( opt_benchmark )
+      HTarget = 0x7f;   // fixed target used in benchmark mode
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+   blake256r8_8way_init( &blakecoin_8w_ctx );
+   blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );   // midstate: first 64 bytes per lane, absorbed once
+
+   do {
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;
+      blakecoin_8way_hash( hash, vdata );
+
+      for ( int i = 0; i < 8; i++ )   // lane i's digest is hash[8*i .. 8*i+7]
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce;   // n is already past the last batch, so no +1
+   return num_found;
+}
+
+#endif
+
--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -0,0 +1,36 @@
+#include "blakecoin-gate.h"
+#include <memory.h>
+
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blakecoin_get_max64 ()
+{  // Installed as gate->get_max64 by register_vanilla_algo; takes no input.
+  return 0x7ffffLL;   // fixed value (524287)
+//  return 0x3fffffLL;
+}
+
+// vanilla uses default gen merkle root, otherwise identical to blakecoin
+bool register_vanilla_algo( algo_gate_t* gate )
+{  // Install the widest blakecoin scanhash/hash pair compiled in (8-way, 4-way or scalar); always returns true.
+#if defined(BLAKECOIN_8WAY)
+  gate->scanhash  = (void*)&scanhash_blakecoin_8way;
+  gate->hash      = (void*)&blakecoin_8way_hash;
+
+#elif defined(BLAKECOIN_4WAY)
+  gate->scanhash  = (void*)&scanhash_blakecoin_4way;
+  gate->hash      = (void*)&blakecoin_4way_hash;
+#else
+  gate->scanhash = (void*)&scanhash_blakecoin;
+  gate->hash     = (void*)&blakecoinhash;
+#endif
+  gate->optimizations = AVX_OPT | AVX2_OPT;
+  gate->get_max64 = (void*)&blakecoin_get_max64;
+  return true;
+}
+
+bool register_blakecoin_algo( algo_gate_t* gate )
+{  // Same gate setup as register_vanilla_algo, plus a SHA256 merkle-root generator; always returns true.
+  register_vanilla_algo( gate );
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;   // the only difference from vanilla
+  return true;
+}
+
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -0,0 +1,30 @@
+#ifndef __BLAKECOIN_GATE_H__
+#define __BLAKECOIN_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX__)
+  #define BLAKECOIN_4WAY
+#endif
+#if defined(__AVX2__)
+  #define BLAKECOIN_8WAY
+#endif
+
+#if defined (BLAKECOIN_8WAY)
+void blakecoin_8way_hash(void *state, const void *input);
+int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
+#if defined (BLAKECOIN_4WAY)
+void blakecoin_4way_hash(void *state, const void *input);
+int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
+void blakecoinhash( void *state, const void *input );
+int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "blakecoin-gate.h"
 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"

@@ -99,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
 SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
 }
 */
-
+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blakecoin_get_max64 ()
 {
@@ -122,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  return true;
 }
-
+*/
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -0,0 +1,75 @@
+#include "decred-gate.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+#include <unistd.h>
+
+#if defined (DECRED_4WAY)
+
+static __thread blake256_4way_context blake_mid;   // per-thread midstate, filled in by scanhash_decred_4way
+
+void decred_hash_4way( void *state, const void *input )
+{  // Finish four hashes from blake_mid: absorb the bytes after DECRED_MIDSTATE_LEN per lane and write the digests to state.
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));   // 8 words x 4 lanes, still interleaved
+//     uint32_t hash0[8] __attribute__ ((aligned (32)));
+//     uint32_t hash1[8] __attribute__ ((aligned (32)));
+//     uint32_t hash2[8] __attribute__ ((aligned (32)));
+//     uint32_t hash3[8] __attribute__ ((aligned (32)));
+     const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );   // x4 because the four lanes are interleaved
+     int tail_len = 180 - DECRED_MIDSTATE_LEN;   // bytes per lane left of the 180-byte header
+     blake256_4way_context ctx __attribute__ ((aligned (64)));
+
+     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );   // work on a copy so the midstate can be reused
+     blake256_4way( &ctx, tail, tail_len );
+     blake256_4way_close( &ctx, vhash );
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );   // presumably one 32-byte digest per lane; confirm helper semantics
+}
+
+int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done)
+{  // Scan nonces from work->data[DECRED_NONCE_INDEX] toward max_nonce, four per pass; returns how many nonces were stored in work->nonces.
+   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t _ALIGN(64) edata[48];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
+   uint32_t n = first_nonce;
+   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+
+   // copy to buffer guaranteed to be aligned.
+   memcpy( edata, pdata, 180 );
+
+   // use the old way until  new way updated for size.
+   mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
+
+   blake256_4way_init( &blake_mid );
+   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );   // midstate: absorbed once per scan
+
+   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;   // nonce word of lane 0; lanes 1..3 follow
+   do {
+      * noncep    = n;
+      *(noncep+1) = n+1;
+      *(noncep+2) = n+2;
+      *(noncep+3) = n+3;
+
+      decred_hash_4way( hash, vdata );
+
+      for ( int i = 0; i < 4; i++ )   // lane i's digest is hash[8*i .. 8*i+7]
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+  } while ( (num_found == 0) && (n < max_nonce)
+            && !work_restart[thr_id].restart );
+
+  *hashes_done = n - first_nonce;   // n is already past the last batch, so no +1
+  return num_found;
+}
+
+#endif
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -0,0 +1,172 @@
+#include "decred-gate.h"
+#include <unistd.h>
+#include <memory.h>
+#include <string.h>
+
+uint32_t *decred_get_nonceptr( uint32_t *work_data )
+{  // Returns the address of the nonce word (index DECRED_NONCE_INDEX) inside work_data.
+   return &work_data[ DECRED_NONCE_INDEX ];
+}
+
+double decred_calc_network_diff( struct work* work )
+{  // Derive the network difficulty from the nbits word at work->data[DECRED_NBITS_INDEX] and return it.
+   // sample for diff 43.281 : 1c05ea29
+   // todo: endian reversed on longpoll could be zr5 specific...
+   uint32_t nbits = work->data[ DECRED_NBITS_INDEX ];
+   uint32_t bits = ( nbits & 0xffffff );
+   int16_t shift = ( swab32(nbits) & 0xff ); // 0x1c = 28
+   int m;
+   double d = (double)0x0000ffff / (double)bits;
+
+   for ( m = shift; m < 29; m++ )   // scale by 256 for each step the shift is below 29
+       d *= 256.0;
+   for ( m = 29; m < shift; m++ )   // or divide for each step above it
+       d /= 256.0;
+   if ( shift == 28 )
+       d *= 256.0; // testnet
+   if ( opt_debug_diff )
+       applog( LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d,
+                           shift, bits );
+   return d;   // return the computed value, not the stale global net_diff
+}
+
+void decred_decode_extradata( struct work* work, uint64_t* net_blocks )
+{  // Randomise the extranonce word, read the block height from work->data[32], and log when the height moves past *net_blocks + 1.
+   // some random extradata to make the work unique
+   work->data[ DECRED_XNONCE_INDEX ] = (uint32_t)rand() * 4;   // unsigned multiply: rand()*4 can overflow int
+   work->height = work->data[32];
+   if (!have_longpoll && work->height > *net_blocks + 1)
+   {
+      char netinfo[64] = { 0 };
+      if (opt_showdiff && net_diff > 0.)
+      {
+         if (net_diff != work->targetdiff)
+            snprintf(netinfo, sizeof netinfo, ", diff %.3f, target %.1f", net_diff,   // bounded: %f of a large double is long
+                   work->targetdiff);
+         else
+             snprintf(netinfo, sizeof netinfo, ", diff %.3f", net_diff);
+       }
+       applog(LOG_BLUE, "%s block %d%s", algo_names[opt_algo], work->height,
+                       netinfo);
+       *net_blocks = work->height - 1;
+   }
+}
+
+void decred_be_build_stratum_request( char *req, struct work *work,
+                                      struct stratum_ctx *sctx )
+{  // Format a "mining.submit" JSON request into req (at most JSON_BUF_LEN bytes) from the work's ntime, nonce and extranonce.
+   unsigned char *xnonce2str;
+   uint32_t ntime, nonce;
+   char ntimestr[9], noncestr[9];   // 8 hex digits + NUL
+
+   be32enc( &ntime, work->data[ DECRED_NTIME_INDEX ] );
+   be32enc( &nonce, work->data[ DECRED_NONCE_INDEX ] );
+   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
+   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
+   xnonce2str = abin2hex( (char*)( &work->data[ DECRED_XNONCE_INDEX ] ),   // extranonce is read from the work data,
+                                     sctx->xnonce1_size );                  // using xnonce1_size as its length
+   snprintf( req, JSON_BUF_LEN,
+        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
+         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
+   free(xnonce2str);   // presumably heap-allocated by abin2hex; confirm
+}
+#define min(a,b) (a>b ? (b) :(a))
+
+void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
+{  // Rebuild g_work->data from the stratum job: version, prevhash, merkle root, extra header, extranonce and stake version.
+   uchar merkle_root[64] = { 0 };
+   uint32_t extraheader[32] = { 0 };
+   int headersize = 0;
+   uint32_t* extradata = (uint32_t*) sctx->xnonce1;
+   size_t t;
+   int i;
+
+   // getwork over stratum, getwork merkle + header passed in coinb1
+   memcpy(merkle_root, sctx->job.coinbase, 32);   // assumes coinbase holds at least 32 bytes; TODO confirm caller validates
+   headersize = (int)sctx->job.coinbase_size - 32;   // negative for a short coinbase
+   headersize = headersize < 0 ? 0 : min( headersize, (int)sizeof(extraheader) );   // clamp in signed arithmetic
+   memcpy( extraheader, &sctx->job.coinbase[32], headersize );
+
+   // Increment extranonce2 
+   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );   // byte-wise increment with carry
+
+   // Assemble block header 
+   memset( g_work->data, 0, sizeof(g_work->data) );
+   g_work->data[0] = le32dec( sctx->job.version );
+   for ( i = 0; i < 8; i++ )
+      g_work->data[1 + i] = swab32(
+                              le32dec( (uint32_t *) sctx->job.prevhash + i ) );
+   for ( i = 0; i < 8; i++ )
+      g_work->data[9 + i] = swab32( be32dec( (uint32_t *) merkle_root + i ) );
+
+//   for ( i = 0; i < 8; i++ ) // prevhash
+//      g_work->data[1 + i] = swab32( g_work->data[1 + i] );
+//   for ( i = 0; i < 8; i++ ) // merkle
+//      g_work->data[9 + i] = swab32( g_work->data[9 + i] );
+
+   for ( i = 0; i < headersize/4; i++ ) // header
+      g_work->data[17 + i] = extraheader[i];
+   // extradata
+
+   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
+      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
+   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
+      g_work->data[i] = 0;
+   g_work->data[37] = ((uint32_t)rand() * 4) << 8;   // unsigned multiply: rand()*4 can overflow int
+   // block header suffix from coinb2 (stake version)
+   memcpy( &g_work->data[44],
+           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
+   sctx->bloc_height = g_work->data[32];
+   //applog_hex(work->data, 180);
+   //applog_hex(&work->data[36], 36);
+}
+
+#undef min
+
+bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
+                           int thr_id )
+{  // Returns false when the work is stale or empty; otherwise bumps the extranonce words and returns true.
+   if ( have_stratum && strcmp(stratum->job.job_id, work->job_id)  )
+      // need to regen g_work..
+      return false;
+   if ( have_stratum && !work->data[0] && !opt_benchmark )   // first work word still zero
+   {
+      sleep(1);
+      return false;
+   }
+   // extradata: prevent duplicates
+   work->data[ DECRED_XNONCE_INDEX     ] += 1;
+   work->data[ DECRED_XNONCE_INDEX + 1 ] |= thr_id;   // fold the thread id into the next word
+   return true;
+}
+
+
+bool register_decred_algo( algo_gate_t* gate )
+{  // Fill the gate with the decred hash functions, work-handling hooks and header layout; always returns true.
+#if defined(DECRED_4WAY)
+  four_way_not_tested();   // helper defined elsewhere; by its name it flags the 4-way path as untested
+  gate->scanhash  = (void*)&scanhash_decred_4way;
+  gate->hash      = (void*)&decred_hash_4way;
+#else
+  gate->scanhash  = (void*)&scanhash_decred;
+  gate->hash      = (void*)&decred_hash;
+#endif
+  gate->optimizations = AVX2_OPT;
+  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
+  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
+  gate->display_extra_data    = (void*)&decred_decode_extradata;
+  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
+  gate->build_extraheader     = (void*)&decred_build_extraheader;
+  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
+  gate->nbits_index           = DECRED_NBITS_INDEX;
+  gate->ntime_index           = DECRED_NTIME_INDEX;
+  gate->nonce_index           = DECRED_NONCE_INDEX;
+  gate->work_data_size        = DECRED_DATA_SIZE;
+  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE;
+  allow_mininginfo            = false;   // global side effect, not a gate field
+  have_gbt                    = false;   // global side effect, not a gate field
+  return true;
+}
+
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -0,0 +1,36 @@
+#ifndef __DECRED_GATE_H__
+#define __DECRED_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// Positions of header fields, as indexes into the 32 bit work data words.
+#define DECRED_NBITS_INDEX  29
+#define DECRED_NTIME_INDEX  34
+#define DECRED_NONCE_INDEX  35
+#define DECRED_XNONCE_INDEX 36
+
+// Sizes handed to the algo gate (presumably in bytes - confirm).
+#define DECRED_DATA_SIZE         192
+#define DECRED_WORK_COMPARE_SIZE 140
+// Number of leading input bytes decred_hash absorbs into its cached midstate.
+#define DECRED_MIDSTATE_LEN      128
+
+// The 4-way variant is built whenever the compiler targets AVX2.
+#ifdef __AVX2__
+  #define DECRED_4WAY
+#endif
+
+#ifdef DECRED_4WAY
+void decred_hash_4way( void *state, const void *input );
+int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+#endif
+
+// Scalar implementation, always available.
+void decred_hash( void *state, const void *input );
+int scanhash_decred( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+#endif  // __DECRED_GATE_H__
+
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "decred-gate.h"
 #include "sph_blake.h"

 #include <string.h>
@@ -15,33 +14,33 @@
 #define max(a,b) (a<b ? b : a)
 #endif
 */
-
+/*
 #define DECRED_NBITS_INDEX 29
 #define DECRED_NTIME_INDEX 34
 #define DECRED_NONCE_INDEX 35
 #define DECRED_XNONCE_INDEX 36
 #define DECRED_DATA_SIZE 192
 #define DECRED_WORK_COMPARE_SIZE 140
-
+*/
 static __thread sph_blake256_context blake_mid;
 static __thread bool ctx_midstate_done = false;

 void decred_hash(void *state, const void *input)
 {
-        #define MIDSTATE_LEN 128
+//        #define MIDSTATE_LEN 128
        sph_blake256_context ctx __attribute__ ((aligned (64)));

        uint8_t *ending = (uint8_t*) input;
-        ending += MIDSTATE_LEN;
+        ending += DECRED_MIDSTATE_LEN;

        if (!ctx_midstate_done) {
                sph_blake256_init(&blake_mid);
-                sph_blake256(&blake_mid, input, MIDSTATE_LEN);
+                sph_blake256(&blake_mid, input, DECRED_MIDSTATE_LEN);
                ctx_midstate_done = true;
        }
        memcpy(&ctx, &blake_mid, sizeof(blake_mid));

-        sph_blake256(&ctx, ending, (180 - MIDSTATE_LEN));
+        sph_blake256(&ctx, ending, (180 - DECRED_MIDSTATE_LEN));
        sph_blake256_close(&ctx, state);
 }

@@ -60,9 +59,9 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

-        #define DCR_NONCE_OFT32 35
+//        #define DCR_NONCE_OFT32 35

-        const uint32_t first_nonce = pdata[DCR_NONCE_OFT32];
+        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];

        uint32_t n = first_nonce;
@@ -82,7 +81,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t

        do {
                //be32enc(&endiandata[DCR_NONCE_OFT32], n);
-                endiandata[DCR_NONCE_OFT32] = n;
+                endiandata[DECRED_NONCE_INDEX] = n;
                decred_hash(hash32, endiandata);

                if (hash32[7] <= HTarget && fulltest(hash32, ptarget)) {
@@ -93,7 +92,7 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
                        applog_hash(ptarget);
                        applog_compare_hash(hash32, ptarget);
 #endif
-                        pdata[DCR_NONCE_OFT32] = n;
+                        pdata[DECRED_NONCE_INDEX] = n;
                        return 1;
                }

@@ -102,24 +101,17 @@ int scanhash_decred(int thr_id, struct work *work, uint32_t max_nonce, uint64_t
        } while (n < max_nonce && !work_restart[thr_id].restart);

        *hashes_done = n - first_nonce + 1;
-        pdata[DCR_NONCE_OFT32] = n;
+        pdata[DECRED_NONCE_INDEX] = n;
        return 0;
 }

+/*
 uint32_t *decred_get_nonceptr( uint32_t *work_data )
 {
   return &work_data[ DECRED_NONCE_INDEX ];
 }

-// does decred need a custom stratum_get_g_work to fix nicehash
-//  bad extranonce2 size?
-// 
-// does decred need a custom init_nonce?
-// does it need to increment nonce, seems not because gen_work_now always
-// returns true
-
 double decred_calc_network_diff( struct work* work )
-//void decred_calc_network_diff( struct work* work )
 {
   // sample for diff 43.281 : 1c05ea29
   // todo: endian reversed on longpoll could be zr5 specific...
@@ -181,7 +173,7 @@ void decred_be_build_stratum_request( char *req, struct work *work,
         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
   free(xnonce2str);
 }
-
+*/
 /*
 // data shared between gen_merkle_root and build_extraheader.
 __thread uint32_t decred_extraheader[32] = { 0 };
@@ -197,7 +189,7 @@ void decred_gen_merkle_root( char* merkle_root, struct stratum_ctx* sctx )
 }
 */

-
+/*
 #define min(a,b) (a>b ? (b) :(a))

 void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
@@ -235,11 +227,15 @@ void decred_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
   for ( i = 0; i < headersize/4; i++ ) // header
      g_work->data[17 + i] = extraheader[i];
   // extradata
+
   for ( i = 0; i < sctx->xnonce1_size/4; i++ )
      g_work->data[ DECRED_XNONCE_INDEX + i ] = extradata[i];
   for ( i = DECRED_XNONCE_INDEX + sctx->xnonce1_size/4; i < 45; i++ )
      g_work->data[i] = 0;
   g_work->data[37] = (rand()*4) << 8;
+   // block header suffix from coinb2 (stake version)
+   memcpy( &g_work->data[44],
+           &sctx->job.coinbase[ sctx->job.coinbase_size-4 ], 4 );
   sctx->bloc_height = g_work->data[32];
   //applog_hex(work->data, 180);
   //applog_hex(&work->data[36], 36);
@@ -274,6 +270,8 @@ bool register_decred_algo( algo_gate_t* gate )
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
  gate->display_extra_data    = (void*)&decred_decode_extradata;
  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
+  gate->work_decode           = (void*)&std_be_work_decode;
+  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->build_extraheader     = (void*)&decred_build_extraheader;
  gate->ready_to_mine         = (void*)&decred_ready_to_mine;
  gate->nbits_index           = DECRED_NBITS_INDEX;
@@ -285,4 +283,4 @@ bool register_decred_algo( algo_gate_t* gate )
  have_gbt                    = false;
  return true;
 }
-
+*/
--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -0,0 +1,157 @@
+#include "pentablake-gate.h"
+
+#if defined (__AVX2__)
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake-hash-4way.h"
+#include "sph_blake.h"
+
+//#define DEBUG_ALGO
+
+// Pentablake for four lanes at once: five chained Blake-512 passes.
+//
+// input  - four 80 byte headers interleaved 4x64, the layout produced by
+//          mm256_interleave_4x64 in scanhash_pentablake_4way.
+// output - 128 bytes; the first 32 digest bytes of lane k are written at
+//          byte offset 32*k.
+//
+// Uncomment DEBUG_ALGO above to cross-check lane 0 of the first pass
+// against the scalar sph_blake512 implementation.
+extern void pentablakehash_4way( void *output, const void *input )
+{
+     uint64_t hash0[8] __attribute__ ((aligned (64)));
+     uint64_t hash1[8] __attribute__ ((aligned (64)));
+     uint64_t hash2[8] __attribute__ ((aligned (64)));
+     uint64_t hash3[8] __attribute__ ((aligned (64)));
+     uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake512_4way_context ctx;
+     uint8_t *out = (uint8_t*)output;
+
+     // pass 1 consumes the 80 byte header
+     blake512_4way_init( &ctx );
+     blake512_4way( &ctx, input, 80 );
+     blake512_4way_close( &ctx, vhash );
+
+#if defined(DEBUG_ALGO)
+     {
+        uint64_t sin0[10], sin1[10], sin2[10], sin3[10];
+        uint64_t ref[8] __attribute__ ((aligned (64)));
+        sph_blake512_context ctx2_blake;
+
+        mm256_deinterleave_4x64( sin0, sin1, sin2, sin3, input, 640 );
+        sph_blake512_init( &ctx2_blake );
+        sph_blake512( &ctx2_blake, sin0, 80 );
+        sph_blake512_close( &ctx2_blake, ref );
+
+        mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+        for ( unsigned int i = 0; i < 8; i++ )
+           if ( hash0[i] != ref[i] )
+              printf("hash mismatch %u\n",i);
+     }
+#endif
+
+     // passes 2..5 each rehash the previous 64 byte digest in place
+     for ( int pass = 2; pass <= 5; pass++ )
+     {
+        blake512_4way_init( &ctx );
+        blake512_4way( &ctx, vhash, 64 );
+        blake512_4way_close( &ctx, vhash );
+     }
+
+     // split the lanes apart; 256 bits of each digest are returned
+     mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+     memcpy( out,      hash0, 32 );
+     memcpy( out + 32, hash1, 32 );
+     memcpy( out + 64, hash2, 32 );
+     memcpy( out + 96, hash3, 32 );
+}
+
+// Scan nonces in batches of four for hashes that meet the work target.
+//
+// Nonces that pass are stored in work->nonces[]; scanning stops after the
+// first batch that yields any, so at most four entries are written.
+// *hashes_done is set to n - first_nonce + 1 and pdata[19] to n.
+// Returns the number of nonces stored, 0 if none were found.
+int scanhash_pentablake_4way( int thr_id, struct work *work,
+                              uint32_t max_nonce, uint64_t *hashes_done )
+{
+    uint32_t hash[4*8] __attribute__ ((aligned (64)));
+    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+    uint32_t endiandata[32] __attribute__ ((aligned (64)));
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    // NOTE(review): the scan starts one below pdata[19]; confirm this is
+    // what the caller expects.
+    uint32_t n = pdata[19] - 1;
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t Htarg = ptarget[7];
+    uint32_t *nonces = work->nonces;
+    int num_found = 0;
+    // After 4x64 interleaving, lane 0's header word 19 (the nonce) is at
+    // 32 bit index 9*8 + 1; the other lanes follow every 2 words.
+    uint32_t *noncep = vdata + 73;
+
+    // When Htarg <= htmax[m], masks[m] pre-filters hash word 7 so that
+    // fulltest only runs on likely candidates.
+    static const uint64_t htmax[] = {
+        0,
+        0xF,
+        0xFF,
+        0xFFF,
+        0xFFFF,
+        0x10000000
+    };
+    static const uint32_t masks[] = {
+        0xFFFFFFFF,
+        0xFFFFFFF0,
+        0xFFFFFF00,
+        0xFFFFF000,
+        0xFFFF0000,
+        0
+    };
+
+    // we need bigendian data...
+    swab32_array( endiandata, pdata, 20 );
+
+    uint64_t *edata = (uint64_t*)endiandata;
+    mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+    for ( int m = 0; m < 6; m++ )
+    {
+        if ( Htarg <= htmax[m] )
+        {
+           uint32_t mask = masks[m];
+           do {
+              be32enc( noncep,   n   );
+              be32enc( noncep+2, n+1 );
+              be32enc( noncep+4, n+2 );
+              be32enc( noncep+6, n+3 );
+
+              pentablakehash_4way( hash, vdata );
+
+              for ( int i = 0; i < 4; i++ )
+              if ( !( (hash+(i<<3))[7] & mask )
+                  && fulltest( hash+(i<<3), ptarget ) )
+              {
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, hash+(i<<3) );
+              }
+              n += 4;
+
+           // leave the loop on the first hit so the nonces get reported
+           } while ( num_found == 0 && n < max_nonce
+                     && !work_restart[thr_id].restart );
+           break;
+        }
+    }
+
+    *hashes_done = n - first_nonce + 1;
+    pdata[19] = n;
+    return num_found;
+}
+
+#endif
--- a/algo/blake/pentablake-gate.c
+++ b/algo/blake/pentablake-gate.c
@@ -0,0 +1,16 @@
+#include "pentablake-gate.h"
+
+// Register pentablake: the 4-way handlers when PENTABLAKE_4WAY is defined,
+// otherwise the scalar ones. Always returns true.
+bool register_pentablake_algo( algo_gate_t* gate )
+{
+    gate->optimizations = AVX2_OPT;
+    gate->get_max64     = (void*)&get_max64_0x3ffff;
+#if defined (PENTABLAKE_4WAY)
+    gate->scanhash      = (void*)&scanhash_pentablake_4way;
+    gate->hash          = (void*)&pentablakehash_4way;
+#else
+    gate->scanhash      = (void*)&scanhash_pentablake;
+    gate->hash          = (void*)&pentablakehash;
+#endif
+    return true; }
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -0,0 +1,21 @@
+#ifndef __PENTABLAKE_GATE_H__
+#define __PENTABLAKE_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// The 4-way variant is built whenever the compiler targets AVX2.
+#ifdef __AVX2__
+  #define PENTABLAKE_4WAY
+#endif
+
+void pentablakehash( void *state, const void *input );
+int scanhash_pentablake( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#ifdef PENTABLAKE_4WAY
+void pentablakehash_4way( void *state, const void *input );
+int scanhash_pentablake_4way( int thr_id, struct work *work,
+                              uint32_t max_nonce, uint64_t *hashes_done );
+#endif  // PENTABLAKE_4WAY
+#endif  // __PENTABLAKE_GATE_H__
--- a/algo/blake/pentablake.c
+++ b/algo/blake/pentablake.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "pentablake-gate.h"
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -111,11 +110,3 @@ int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce,
 	return 0;
 } 

-bool register_pentablake_algo( algo_gate_t* gate )
-{
-    gate->scanhash  = (void*)&scanhash_pentablake;
-    gate->hash      = (void*)&pentablakehash;
-    gate->get_max64 = (void*)&get_max64_0x3ffff;
-    return true;
-};
-
--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -16,7 +16,7 @@
 #include <stdio.h>

 #include "algo/sha/sph_types.h"
-#include "crypto/blake2s.h"
+#include "sph-blake2s.h"

 static const uint32_t blake2s_IV[8] =
 {
--- a/algo/blake/sph-blake2s.h
+++ b/algo/blake/sph-blake2s.h
--- a/algo/blake/sph_blake.c
+++ b/algo/blake/sph_blake.c
@@ -813,6 +813,7 @@ blake32(sph_blake_small_context *sc, const void *data, size_t len)

 	buf = sc->buf;
 	ptr = sc->ptr;
+
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
@@ -871,6 +872,7 @@ blake32_close(sph_blake_small_context *sc,
 	} else {
 		sc->T0 -= 512 - bit_len;
 	}
+
 	if (bit_len <= 446) {
 		memset(u.buf + ptr + 1, 0, 55 - ptr);
 		if (out_size_w32 == 8)
@@ -890,9 +892,9 @@ blake32_close(sph_blake_small_context *sc,
 		sph_enc32be_aligned(u.buf + 60, tl);
 		blake32(sc, u.buf, 64);
 	}
-	out = dst;
-	for (k = 0; k < out_size_w32; k ++)
-		sph_enc32be(out + (k << 2), sc->H[k]);
+        out = dst;
+        for (k = 0; k < out_size_w32; k ++)
+                sph_enc32be(out + (k << 2), sc->H[k]);
 }

 #if SPH_64
@@ -982,9 +984,11 @@ blake64_close(sph_blake_big_context *sc,
 			u.buf[111] |= 1;
 		sph_enc64be_aligned(u.buf + 112, th);
 		sph_enc64be_aligned(u.buf + 120, tl);
+
 		blake64(sc, u.buf + ptr, 128 - ptr);
 	} else {
 		memset(u.buf + ptr + 1, 0, 127 - ptr);
+
 		blake64(sc, u.buf + ptr, 128 - ptr);
 		sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00);
 		sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF);
@@ -993,6 +997,7 @@ blake64_close(sph_blake_big_context *sc,
 			u.buf[111] = 1;
 		sph_enc64be_aligned(u.buf + 112, th);
 		sph_enc64be_aligned(u.buf + 120, tl);
+
 		blake64(sc, u.buf, 128);
 	}
 	out = dst;
--- a/algo/bmw/bmw-hash-4way.c
+++ b/algo/bmw/bmw-hash-4way.c
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -0,0 +1,95 @@
+/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * BMW interface. BMW (aka "Blue Midnight Wish") is a family of
+ * functions which differ by their output size; this implementation
+ * defines BMW for output sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     bmw-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef BMW_HASH_H__
+#define BMW_HASH_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#if defined(__AVX2__)
+
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+// Digest sizes, in bits, of the two variants declared below.
+#define SPH_SIZE_bmw256   256
+#define SPH_SIZE_bmw512   512
+
+// Context of the 256 bit variant, built from 128 bit vectors.
+typedef struct {
+   __m128i buf[64];
+   __m128i H[16];
+   size_t ptr;
+   sph_u32 bit_count;  // assume bit_count fits in 32 bits
+} bmw_4way_small_context;
+
+typedef bmw_4way_small_context bmw256_4way_context;
+
+// Context of the 512 bit variant, built from 256 bit vectors.
+typedef struct {
+   __m256i buf[16];
+   __m256i H[16];
+   size_t ptr;
+   sph_u64 bit_count;
+} bmw_4way_big_context;
+
+typedef bmw_4way_big_context bmw512_4way_context;
+
+// 256 bit API: init a context, feed it data (len is presumably a byte
+// count), close into dst. cc is untyped; presumably a bmw256_4way_context.
+void bmw256_4way_init( void *cc );
+void bmw256_4way( void *cc, const void *data, size_t len );
+void bmw256_4way_close( void *cc, void *dst );
+// NOTE(review): ub/n look like the sph "extra bits" convention - confirm.
+void bmw256_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                    void *dst );
+
+// 512 bit API, same call sequence; cc is presumably a bmw512_4way_context.
+void bmw512_4way_init( void *cc );
+void bmw512_4way( void *cc, const void *data, size_t len );
+void bmw512_4way_close( void *cc, void *dst );
+void bmw512_4way_addbits_and_close( void *cc, unsigned ub, unsigned n,
+                                    void *dst );
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/bmw/bmw.test
+++ b/algo/bmw/bmw.test
--- a/algo/bmw/bmw256.c
+++ b/algo/bmw/bmw256.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <string.h>
--- a/algo/bmw/sse2/bmw.c
+++ b/algo/bmw/sse2/bmw.c
@@ -477,7 +477,7 @@ do { \
        for (u = 0; u < 16; u ++) \
        sph_enc64le_aligned(data + 8 * u, h2[u]); \
        dh = h1; \
-        h = final_b; \
+        h = (sph_u64*)final_b; \
    } \
    /* end wrapped for break loop */ \
    out = dst; \
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -2,7 +2,6 @@
 // Distributed under the MIT/X11 software license, see the accompanying
 // file COPYING or http://www.opensource.org/licenses/mit-license.php.

-#include "miner.h"
 #include "algo-gate-api.h"

 #if defined(__arm__) || defined(_MSC_VER)
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -3,7 +3,8 @@
 #include "cryptonight.h"
 #include "miner.h"
 #include "crypto/c_keccak.h"
-#include "avxdefs.h"
+#include <immintrin.h>
+//#include "avxdefs.h"

 void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
 void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
@@ -109,43 +110,43 @@ static __thread cryptonight_ctx ctx;
 void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 {
 #ifndef NO_AES_NI
-    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+
    uint8_t ExpandedKey[256] __attribute__((aligned(64)));
+    __m128i *longoutput, *expkey, *xmminput;
    size_t i, j;
    
-    memcpy(ctx.text, ctx.state.init, INIT_SIZE_BYTE);
-    memcpy(ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE);
-    ExpandAESKey256(ExpandedKey);
+    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+    memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
+    ExpandAESKey256( ExpandedKey );
+    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    
-    __m128i *longoutput, *expkey, *xmminput;
-    longoutput = (__m128i *)ctx.long_state;
-    expkey     = (__m128i *)ExpandedKey;
-    xmminput   = (__m128i *)ctx.text;
+    longoutput = (__m128i*)ctx.long_state;
+    xmminput   = (__m128i*)ctx.text;
+    expkey     = (__m128i*)ExpandedKey;
    
-    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
-    //    aesni_parallel_noxor(&ctx->long_state[i], ctx->text, ExpandedKey);
-    
-    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
+    // prefetch expkey, xmminput and enough longoutput for 4 iterations
    _mm_prefetch( xmminput,     _MM_HINT_T0 );
    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
-    for ( i = 0; i < 64; i += 16 )
-    {
-       _mm_prefetch( longoutput + i,      _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i +  4, _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i +  8, _MM_HINT_T0 );
-       _mm_prefetch( longoutput + i + 12, _MM_HINT_T0 );
-    }
    _mm_prefetch( expkey,     _MM_HINT_T0 );
    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
    _mm_prefetch( expkey + 8, _MM_HINT_T0 );
-
-    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    for ( i = 0; i < 64; i += 16 )
    {
-        // prefetch 4 loops ahead,
+        __builtin_prefetch( longoutput + i,      1, 0 );
+        __builtin_prefetch( longoutput + i +  4, 1, 0 );
+        __builtin_prefetch( longoutput + i +  8, 1, 0 );
+        __builtin_prefetch( longoutput + i + 12, 1, 0 );
+    }
+
+    // n-4 iterations
+    for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
+                         i += INIT_SIZE_M128I )
+    {
+        // prefetch 4 iterations ahead.
        __builtin_prefetch( longoutput + i + 64, 1, 0 );
        __builtin_prefetch( longoutput + i + 68, 1, 0 );

-	for (j = 0; j < 10; j++ )
+	for ( j = 0; j < 10; j++ )
 	{
 		xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
 		xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
@@ -165,84 +166,99 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	_mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
 	_mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
    }
+    // last 4 iterations
+    for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    {
+        for ( j = 0; j < 10; j++ )
+        {
+                xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
+                xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
+                xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
+                xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
+                xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
+                xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
+                xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
+                xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
+        }
+        _mm_store_si128( &( longoutput[i  ] ), xmminput[0] );
+        _mm_store_si128( &( longoutput[i+1] ), xmminput[1] );
+        _mm_store_si128( &( longoutput[i+2] ), xmminput[2] );
+        _mm_store_si128( &( longoutput[i+3] ), xmminput[3] );
+        _mm_store_si128( &( longoutput[i+4] ), xmminput[4] );
+        _mm_store_si128( &( longoutput[i+5] ), xmminput[5] );
+        _mm_store_si128( &( longoutput[i+6] ), xmminput[6] );
+        _mm_store_si128( &( longoutput[i+7] ), xmminput[7] );
+    }

-//     cast_m128i( ctx.a ) = _mm_xor_si128( casti_m128i( ctx.state.k, 0 ) ,
-//                                          casti_m128i( ctx.state.k, 2 ) );
-//     cast_m128i( ctx.b ) = _mm_xor_si128( casti_m128i( ctx.state.k, 1 ),
-//                                          casti_m128i( ctx.state.k, 3 ) );
+    ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
+    ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
+    ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
+    ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];

-     ctx.a[0] = ((uint64_t *)ctx.state.k)[0] ^ ((uint64_t *)ctx.state.k)[4];
-     ctx.b[0] = ((uint64_t *)ctx.state.k)[2] ^ ((uint64_t *)ctx.state.k)[6];
-     ctx.a[1] = ((uint64_t *)ctx.state.k)[1] ^ ((uint64_t *)ctx.state.k)[5];
-     ctx.b[1] = ((uint64_t *)ctx.state.k)[3] ^ ((uint64_t *)ctx.state.k)[7];
-
-//    for (i = 0; i < 2; i++) 
-//    {
-//     ctx.a[i] = ((uint64_t *)ctx.state.k)[i] ^  ((uint64_t *)ctx.state.k)[i+4];
-//     ctx.b[i] = ((uint64_t *)ctx.state.k)[i+2] ^ ((uint64_t *)ctx.state.k)[i+6];
-//    }
-
-    __m128i b_x = _mm_load_si128((__m128i *)ctx.b);
-    uint64_t a[2] __attribute((aligned(16))), b[2] __attribute((aligned(16)));
+    uint64_t a[2] __attribute((aligned(16))),
+             b[2] __attribute((aligned(16))),
+             c[2] __attribute((aligned(16)));
    a[0] = ctx.a[0];
    a[1] = ctx.a[1];
-	
-    for(i = 0; __builtin_expect(i < 0x80000, 1); i++)
+    __m128i b_x = _mm_load_si128( (__m128i*)ctx.b );
+    __m128i a_x = _mm_load_si128( (__m128i*)a );
+    __m128i* lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
+    __m128i c_x = _mm_load_si128( lsa );
+    uint64_t *nextblock;
+    uint64_t hi, lo;
+
+    // n-1 iterations
+    for( i = 0; __builtin_expect( i < 0x7ffff, 1 ); i++ )
    {	  
-        uint64_t c[2];
-        __builtin_prefetch( &ctx.long_state[c[0] & 0x1FFFF0], 0, 1 );
-
-	__m128i c_x = _mm_load_si128( 
-                              (__m128i *)&ctx.long_state[a[0] & 0x1FFFF0]);
-	__m128i a_x = _mm_load_si128((__m128i *)a);
-	c_x = _mm_aesenc_si128(c_x, a_x);
-	_mm_store_si128((__m128i *)c, c_x);
-	
-	b_x = _mm_xor_si128(b_x, c_x);
-	_mm_store_si128((__m128i *)&ctx.long_state[a[0] & 0x1FFFF0], b_x);
-
-	uint64_t *nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
-//	uint64_t b[2];
+	c_x = _mm_aesenc_si128( c_x, a_x );
+	_mm_store_si128( (__m128i*)c, c_x );
+        b_x = _mm_xor_si128( b_x, c_x );
+        nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
+	_mm_store_si128( lsa, b_x );
 	b[0] = nextblock[0];
 	b[1] = nextblock[1];

-	{
-	  uint64_t hi, lo;
-	 // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
+        // hi,lo = 64bit x 64bit multiply of c[0] and b[0]
+	__asm__( "mulq %3\n\t"
+	         : "=d" ( hi ),
+	           "=a" ( lo )
+	         : "%a" ( c[0] ),
+	           "rm" ( b[0] )
+		 : "cc" );

-	  __asm__("mulq %3\n\t"
-		  : "=d" (hi),
-		"=a" (lo)
-		  : "%a" (c[0]),
-		"rm" (b[0])
-		  : "cc" );
-	  
-	  a[0] += hi;
-	  a[1] += lo;
-	}
-	uint64_t *dst = (uint64_t*)&ctx.long_state[c[0] & 0x1FFFF0];
-//        __m128i *dst = (__m128i*)&ctx.long_state[c[0] & 0x1FFFF0];
-
-//        *dst = cast_m128i( a ); 
-	dst[0] = a[0];
-	dst[1] = a[1];
-
-//        cast_m128i( a ) = _mm_xor_si128( cast_m128i( a ), cast_m128i( b ) );
-	a[0] ^= b[0];
-	a[1] ^= b[1];
-	b_x = c_x;
-	__builtin_prefetch( &ctx.long_state[a[0] & 0x1FFFF0], 0, 3 );
+        b_x = c_x;
+        nextblock[0] = a[0] + hi;
+        nextblock[1] = a[1] + lo;
+        a[0] = b[0] ^ nextblock[0];
+        a[1] = b[1] ^ nextblock[1];
+        lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
+        a_x = _mm_load_si128( (__m128i*)a );
+        c_x = _mm_load_si128( lsa );
    }
+    // abbreviated nth iteration
+    c_x = _mm_aesenc_si128( c_x, a_x );
+    _mm_store_si128( (__m128i*)c, c_x );
+    b_x = _mm_xor_si128( b_x, c_x );
+    nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
+    _mm_store_si128( lsa, b_x );
+    b[0] = nextblock[0];
+    b[1] = nextblock[1];
+
+    __asm__( "mulq %3\n\t"
+             : "=d" ( hi ),
+               "=a" ( lo )
+             : "%a" ( c[0] ),
+               "rm" ( b[0] )
+             : "cc" );
+
+    nextblock[0] = a[0] + hi;
+    nextblock[1] = a[1] + lo;

-    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
    ExpandAESKey256( ExpandedKey );
-    
-    //for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE)
-    //    aesni_parallel_xor(&ctx->text, ExpandedKey, &ctx->long_state[i]);
+    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
    
    // prefetch expkey, all of xmminput and enough longoutput for 4 loops
-
    _mm_prefetch( xmminput,     _MM_HINT_T0 );
    _mm_prefetch( xmminput + 4, _MM_HINT_T0 );
    for ( i = 0; i < 64; i += 16 )
@@ -256,9 +272,11 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
    _mm_prefetch( expkey + 4, _MM_HINT_T0 );
    _mm_prefetch( expkey + 8, _MM_HINT_T0 );

-    for ( i = 0; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    // n-4 iterations
+    for ( i = 0; likely( i < MEMORY_M128I - 4*INIT_SIZE_M128I );
+                         i += INIT_SIZE_M128I )
    {
-        // stay 4 loops ahead,
+        // stay 4 iterations ahead.
        _mm_prefetch( longoutput + i + 64, _MM_HINT_T0 );
        _mm_prefetch( longoutput + i + 68, _MM_HINT_T0 );

@@ -283,10 +301,34 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	    xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
        }
    }
-        
+    // last 4 iterations 
+    for ( ; likely( i < MEMORY_M128I ); i += INIT_SIZE_M128I )
+    {
+        xmminput[0] = _mm_xor_si128( longoutput[i  ], xmminput[0] );
+        xmminput[1] = _mm_xor_si128( longoutput[i+1], xmminput[1] );
+        xmminput[2] = _mm_xor_si128( longoutput[i+2], xmminput[2] );
+        xmminput[3] = _mm_xor_si128( longoutput[i+3], xmminput[3] );
+        xmminput[4] = _mm_xor_si128( longoutput[i+4], xmminput[4] );
+        xmminput[5] = _mm_xor_si128( longoutput[i+5], xmminput[5] );
+        xmminput[6] = _mm_xor_si128( longoutput[i+6], xmminput[6] );
+        xmminput[7] = _mm_xor_si128( longoutput[i+7], xmminput[7] );
+
+        for( j = 0; j < 10; j++ )
+        {
+            xmminput[0] = _mm_aesenc_si128( xmminput[0], expkey[j] );
+            xmminput[1] = _mm_aesenc_si128( xmminput[1], expkey[j] );
+            xmminput[2] = _mm_aesenc_si128( xmminput[2], expkey[j] );
+            xmminput[3] = _mm_aesenc_si128( xmminput[3], expkey[j] );
+            xmminput[4] = _mm_aesenc_si128( xmminput[4], expkey[j] );
+            xmminput[5] = _mm_aesenc_si128( xmminput[5], expkey[j] );
+            xmminput[6] = _mm_aesenc_si128( xmminput[6], expkey[j] );
+            xmminput[7] = _mm_aesenc_si128( xmminput[7], expkey[j] );
+        }
+    }
+
    memcpy( ctx.state.init, ctx.text, INIT_SIZE_BYTE);
    keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
-
    extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
+
 #endif
 }
--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -5,7 +5,6 @@
 // Modified for CPUminer by Lucas Jones

 #include "cpuminer-config.h"
-//#include "miner.h"
 #include "algo-gate-api.h"

 #ifndef NO_AES_NI
--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -10,6 +10,10 @@
 #endif
 #include "cubehash_sse2.h"
 #include "algo/sha/sha3-defs.h"
+#include <stdbool.h>
+#include <unistd.h>
+#include <memory.h>
+#include "avxdefs.h"

 static void transform( cubehashParam *sp )
 {
@@ -125,6 +129,18 @@ static void transform( cubehashParam *sp )
 #endif
 }  // transform

+// Cubehash context initializing is very expensive.
+// Cache the initial value for faster reinitializing.
+cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
+
+// Fast reinit: overwrite *sp with the state cached in cube_ctx_cache.
+int cubehashReinit( cubehashParam *sp )
+{
+   memcpy( sp, &cube_ctx_cache, sizeof *sp );
+   return SUCCESS;
+}
+
+// Initialize the cache then copy to sp.
 int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
 {
    int i;
@@ -135,24 +151,26 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)

    /* Sanity checks */
    if ( rounds <= 0 || rounds > 32 )
-         rounds = CUBEHASH_ROUNDS;
+       rounds = CUBEHASH_ROUNDS;
    if ( blockbytes <= 0 || blockbytes >= 256)
-         blockbytes = CUBEHASH_BLOCKBYTES;
+       blockbytes = CUBEHASH_BLOCKBYTES;

    // all sizes of __m128i
-    sp->hashlen   = hashbitlen/128;
-    sp->blocksize = blockbytes/16;
-    sp->rounds    = rounds;
-    sp->pos       = 0;
+    cube_ctx_cache.hashlen   = hashbitlen/128;
+    cube_ctx_cache.blocksize = blockbytes/16;
+    cube_ctx_cache.rounds    = rounds;
+    cube_ctx_cache.pos       = 0;

    for ( i = 0; i < 8; ++i )
-         sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
+       cube_ctx_cache.x[i] = _mm_setzero_si128();;

-    sp->x[0] = _mm_set_epi32( 0, rounds, blockbytes, hashbitlen / 8 );
+    cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
+                                         hashbitlen / 8 );

    for ( i = 0; i < 10; ++i )
-         transform(sp);
-//    sp->pos = 0;
+       transform( &cube_ctx_cache );
+
+    memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
    return SUCCESS;
 }

--- a/algo/cubehash/sse2/cubehash_sse2.h
+++ b/algo/cubehash/sse2/cubehash_sse2.h
@@ -29,6 +29,8 @@ extern "C" {
 #endif

 int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
+// reinitialize context with same parameters, much faster.
+int cubehashReinit( cubehashParam* sp );

 int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);

--- a/algo/echo/aes_ni/architectures
+++ b/algo/echo/aes_ni/architectures
@@ -1,2 +0,0 @@
-amd64
-x86
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -14,18 +14,20 @@
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */
+#if defined(__AES__)

 #include <memory.h>
 #include "miner.h"
 #include "hash_api.h"
-#include "vperm.h"
-
+//#include "vperm.h"
+#include <immintrin.h>
+/*
 #ifndef NO_AES_NI
 #include <wmmintrin.h>
 #else
 #include <tmmintrin.h>
 #endif
-
+*/

 MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
 MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -246,7 +248,8 @@ void DumpState(__m128i *ps)
 void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
 {
 	unsigned int r, b, i, j;
-	__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
+//      __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
+	__m128i t1, t2, s2, k1;
 	__m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 


@@ -396,7 +399,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
 {
 	int i, j;

-	ctx->k = _mm_xor_si128(ctx->k, ctx->k);
+        ctx->k = _mm_setzero_si128(); 
 	ctx->processed_bits = 0;
 	ctx->uBufferBytes = 0;

@@ -742,4 +745,4 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
 	return SUCCESS;
 }

-
+#endif
--- a/algo/echo/aes_ni/implementors
+++ b/algo/echo/aes_ni/implementors
@@ -1 +0,0 @@
-Çağdaş Çalık
--- a/algo/echo/aes_ni/vperm.h
+++ b/algo/echo/aes_ni/vperm.h
@@ -1,119 +0,0 @@
-/*
- * file        : vperm.h
- * version     : 1.0.208
- * date        : 14.12.2010
- * 
- * vperm implementation of AES s-box 
- *
- * Credits: Adapted from Mike Hamburg's AES implementation, http://crypto.stanford.edu/vpaes/
- *
- * Cagdas Calik
- * ccalik@metu.edu.tr
- * Institute of Applied Mathematics, Middle East Technical University, Turkey.
- *
- */
-
-#ifndef VPERM_H
-#define VPERM_H
-
-#include "algo/sha/sha3_common.h"
-#include <tmmintrin.h>
-
-/*
-extern const unsigned int _k_s0F[];
-extern const unsigned int _k_ipt[];
-extern const unsigned int _k_opt[];
-extern const unsigned int _k_inv[];
-extern const unsigned int _k_sb1[];
-extern const unsigned int _k_sb2[];
-extern const unsigned int _k_sb3[];
-extern const unsigned int _k_sb4[];
-extern const unsigned int _k_sb5[];
-extern const unsigned int _k_sb7[];
-extern const unsigned int _k_sbo[];
-extern const unsigned int _k_h63[];
-extern const unsigned int _k_hc6[];
-extern const unsigned int _k_h5b[];
-extern const unsigned int _k_h4e[];
-extern const unsigned int _k_h0e[];
-extern const unsigned int _k_h15[];
-extern const unsigned int _k_aesmix1[];
-extern const unsigned int _k_aesmix2[];
-extern const unsigned int _k_aesmix3[];
-extern const unsigned int _k_aesmix4[];
-*/
-
-// input: x, table
-// output: x
-#define TRANSFORM(x, table, t1, t2)\
-	t1 = _mm_andnot_si128(M128(_k_s0F), x);\
-	t1 = _mm_srli_epi32(t1, 4);\
-	x  = _mm_and_si128(x, M128(_k_s0F));\
-	t1 = _mm_shuffle_epi8(*((__m128i*)table + 1), t1);\
-	x  = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
-	x  = _mm_xor_si128(x, t1)
-
-// compiled erroneously with 32-bit msc compiler
-	//t2 = _mm_shuffle_epi8(table[0], x);\
-	//x  = _mm_shuffle_epi8(table[1], t1);\
-	//x  = _mm_xor_si128(x, t2)
-
-
-// input: x
-// output: t2, t3
-#define SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4)\
-	t1 = _mm_andnot_si128(M128(_k_s0F), x);\
-	t1 = _mm_srli_epi32(t1, 4);\
-	x  = _mm_and_si128(x, M128(_k_s0F));\
-	t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 1), x);\
-	x  = _mm_xor_si128(x, t1);\
-	t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t1);\
-	t3 = _mm_xor_si128(t3, t2);\
-	t4 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), x);\
-	t4 = _mm_xor_si128(t4, t2);\
-	t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t3);\
-	t2 = _mm_xor_si128(t2, x);\
-	t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t4);\
-	t3 = _mm_xor_si128(t3, t1);\
-
-
-// input: x1, x2, table
-// output: y
-#define VPERM_LOOKUP(x1, x2, table, y, t)\
-	t = _mm_shuffle_epi8(*((__m128i*)table + 0), x1);\
-	y = _mm_shuffle_epi8(*((__m128i*)table + 1), x2);\
-	y = _mm_xor_si128(y, t)
-
-
-// input: x
-// output: x
-#define SUBSTITUTE_VPERM(x, t1, t2, t3, t4)  \
-	TRANSFORM(x, _k_ipt, t1, t2);\
-	SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
-	VPERM_LOOKUP(t2, t3, _k_sbo, x, t1);\
-	x = _mm_xor_si128(x, M128(_k_h63))
-
-
-// input: x
-// output: x
-#define AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3) \
-	SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
-	VPERM_LOOKUP(t2, t3, _k_sb1, s1, t1);\
-	VPERM_LOOKUP(t2, t3, _k_sb2, s2, t1);\
-	s3 = _mm_xor_si128(s1, s2);\
-	x = _mm_shuffle_epi8(s2, M128(_k_aesmix1));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s3, M128(_k_aesmix2)));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix3)));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix4)));\
-	x = _mm_xor_si128(x, M128(_k_h5b))
-
-
-// input: x
-// output: x
-#define AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3) \
-	TRANSFORM(x, _k_ipt, t1, t2);\
-	AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3);\
-	TRANSFORM(x, _k_opt, t1, t2)
-
-#endif // VPERM_H
-
--- a/algo/groestl/aes_ni/hash-groestl.h
+++ b/algo/groestl/aes_ni/hash-groestl.h
@@ -21,7 +21,7 @@

 #include "brg_endian.h"
 #define NEED_UINT_64T
-#include "brg_types.h"
+#include "algo/sha/brg_types.h"

 /* some sizes (number of bytes) */
 #define ROWS (8)
--- a/algo/groestl/aes_ni/hash-groestl256.h
+++ b/algo/groestl/aes_ni/hash-groestl256.h
@@ -35,7 +35,7 @@ typedef crypto_uint64 u64;

 #include "brg_endian.h"
 #define NEED_UINT_64T
-#include "brg_types.h"
+#include "algo/sha/brg_types.h"

 #ifdef IACA_TRACE
  #include IACA_MARKS
--- a/algo/groestl/groestl.c
+++ b/algo/groestl/groestl.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
@@ -99,22 +98,21 @@ void groestl_set_target( struct work* work, double job_diff )
 work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
 }

-bool register_groestl_algo( algo_gate_t* gate )
+bool register_dmd_gr_algo( algo_gate_t* gate )
 {
    init_groestl_ctx();
    gate->optimizations   = SSE2_OPT | AES_OPT;
    gate->scanhash        = (void*)&scanhash_groestl;
    gate->hash            = (void*)&groestlhash;
    gate->set_target      = (void*)&groestl_set_target;
-    gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
    gate->get_max64       = (void*)&get_max64_0x3ffff;
    return true;
 };

-bool register_dmd_gr_algo( algo_gate_t* gate )
+bool register_groestl_algo( algo_gate_t* gate )
 {
-    register_groestl_algo( gate );
-    gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
+    register_dmd_gr_algo( gate );
+    gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
    return true;
 };

--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -1,5 +1,4 @@
-#include "miner.h"
-#include "algo-gate-api.h"
+#include "myrgr-gate.h"

 #include <stdio.h>
 #include <stdlib.h>
@@ -11,12 +10,7 @@
 #else
  #include "aes_ni/hash-groestl.h"
 #endif
-
-#if defined __SHA__
-  #include <openssl/sha.h>
-#else
-  #include "algo/sha/sph_sha2.h"
-#endif
+#include "algo/sha/sph_sha2.h"

 typedef struct {
 #ifdef NO_AES_NI
@@ -24,11 +18,7 @@ typedef struct {
 #else
    hashState_groestl       groestl;
 #endif
-#if defined __SHA__
-   SHA256_CTX         sha;
-#else
-   sph_sha256_context sha;
-#endif
+    sph_sha256_context sha;
 } myrgr_ctx_holder;

 myrgr_ctx_holder myrgr_ctx;
@@ -40,44 +30,37 @@ void init_myrgr_ctx()
 #else
     init_groestl (&myrgr_ctx.groestl, 64 );
 #endif
-#if defined __SHA__
-   SHA256_Init( &myrgr_ctx.sha );
-#else
-   sph_sha256_init( &myrgr_ctx.sha );
-#endif
+     sph_sha256_init(&myrgr_ctx.sha);
 }

-void myriadhash( void *output, const void *input )
+void myriad_hash(void *output, const void *input)
 {
-     myrgr_ctx_holder ctx __attribute__ ((aligned (64)));
-     memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
-     uint32_t hash[16] __attribute__ ((aligned (64))); 
+        myrgr_ctx_holder ctx;
+        memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
+
+ 	uint32_t _ALIGN(32) hash[16];

 #ifdef NO_AES_NI
-     sph_groestl512(&ctx.groestl, input, 80);
-     sph_groestl512_close(&ctx.groestl, hash);
+	sph_groestl512(&ctx.groestl, input, 80);
+	sph_groestl512_close(&ctx.groestl, hash);
 #else
-     update_and_final_groestl( &ctx.groestl, (char*)input,
-                               (const char*)input, 640 );
+        update_groestl( &ctx.groestl, (char*)input, 640 );
+        final_groestl( &ctx.groestl, (char*)hash);
 #endif

-#if defined __SHA__
-     SHA256_Update( &ctx.sha, hash, 64 );
-     SHA256_Final( (unsigned char*) hash, &ctx.sha );
-#else
-     sph_sha256(&ctx.sha, hash, 64);
-     sph_sha256_close(&ctx.sha, hash);
-#endif
-     memcpy(output, hash, 32);
+	sph_sha256(&ctx.sha, hash, 64);
+	sph_sha256_close(&ctx.sha, hash);
+
+	memcpy(output, hash, 32);
 }

-int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done)
+int scanhash_myriad(int thr_id, struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done)
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

-	uint32_t endiandata[20] __attribute__ ((aligned (64)));
+	uint32_t _ALIGN(64) endiandata[20];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;

@@ -88,9 +71,9 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,

 	do {
 		const uint32_t Htarg = ptarget[7];
-		uint32_t hash[8] __attribute__ ((aligned (64)));
+		uint32_t hash[8];
 		be32enc(&endiandata[19], nonce);
-		myriadhash(hash, endiandata);
+		myriad_hash(hash, endiandata);

 		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			pdata[19] = nonce;
@@ -105,14 +88,15 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
-
+/*
 bool register_myriad_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | SHA_OPT;
+    gate->optimizations = SSE2_OPT | AES_OPT;
    init_myrgr_ctx();
    gate->scanhash = (void*)&scanhash_myriad;
    gate->hash     = (void*)&myriadhash;
+//    gate->hash_alt = (void*)&myriadhash;
    gate->get_max64 = (void*)&get_max64_0x3ffff;
    return true;
 };
-
+*/
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -0,0 +1,107 @@
+#include "myrgr-gate.h"
+
+#if defined(MYRGR_4WAY)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aes_ni/hash-groestl.h"
+#include "algo/sha/sha2-hash-4way.h"
+
+typedef struct {
+    hashState_groestl       groestl;   // Groestl state, reused for one lane at a time
+    sha256_4way_context     sha;       // SHA-256 state passed to the 4-way routines
+} myrgr_4way_ctx_holder;
+
+myrgr_4way_ctx_holder myrgr_4way_ctx;  // template: set up once, copied by each hash call
+
+void init_myrgr_4way_ctx( void )
+{    // One-time setup of the template context that each hash call copies.
+     init_groestl( &myrgr_4way_ctx.groestl, 64 );
+     sha256_4way_init( &myrgr_4way_ctx.sha );
+}
+
+void myriad_4way_hash( void *output, const void *input )
+{    // Four lanes: Groestl on each lane in turn, then one 4-way SHA-256.
+     uint32_t hash0[20] __attribute__ ((aligned (64)));
+     uint32_t hash1[20] __attribute__ ((aligned (64)));
+     uint32_t hash2[20] __attribute__ ((aligned (64)));
+     uint32_t hash3[20] __attribute__ ((aligned (64)));
+     uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+     myrgr_4way_ctx_holder ctx;
+     memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
+     // Split the interleaved input into one 20-word buffer per lane.
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
+     // Groestl each lane in place, reloading the template state between lanes.
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
+     // Interleave the four Groestl results again for the 4-way SHA-256.
+     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     // SHA-256 over all lanes at once; the result is written back into vhash.
+     sha256_4way( &ctx.sha, vhash, 64 );
+     sha256_4way_close( &ctx.sha, vhash );
+     // Separate the digests: lane i goes to output + 32*i (GNU void* arithmetic).
+     mm_deinterleave_4x32( output, output+32, output+64, output+96,
+                           vhash, 256 );
+}
+
+int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76; // 19*4
+
+/*
+   Scan nonces four at a time. vdata holds four interleaved copies of the
+   20-word header after swab32_array; noncep points at the four lane slots
+   of word 19. Each pass stores n..n+3 there, hashes all lanes at once and
+   tests each lane's 8-word hash against the target. Matching nonces are
+   appended to work->nonces. Returns the number found; sets hashes_done.
+   NOTE(review): max_nonce-4 wraps around if max_nonce < 4 - confirm callers.
+*/
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   do {
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );
+
+      myriad_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/groestl/myrgr-gate.c
+++ b/algo/groestl/myrgr-gate.c
@@ -0,0 +1,18 @@
+#include "myrgr-gate.h"
+
+bool register_myriad_algo( algo_gate_t* gate )
+{   // Install the myriad-groestl handlers in gate; the 4-way set when MYRGR_4WAY.
+#if defined (MYRGR_4WAY)
+  init_myrgr_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_myriad_4way;
+  gate->hash      = (void*)&myriad_4way_hash;
+#else
+  init_myrgr_ctx();
+  gate->scanhash  = (void*)&scanhash_myriad;
+  gate->hash      = (void*)&myriad_hash;
+#endif
+  gate->optimizations = AES_OPT | AVX2_OPT;
+  gate->get_max64 = (void*)&get_max64_0x3ffff;
+  return true;
+}
+
--- a/algo/groestl/myrgr-gate.h
+++ b/algo/groestl/myrgr-gate.h
@@ -0,0 +1,30 @@
+#ifndef MYRGR_GATE_H__
+#define MYRGR_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define MYRGR_4WAY    // build the 4-way variant only with AVX2 and AES support
+#endif
+
+#if defined(MYRGR_4WAY)
+// 4-way variant: input is 4x32 interleaved, output holds four 32-byte digests.
+void myriad_4way_hash( void *output, const void *input );
+
+int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+void init_myrgr_4way_ctx( void );
+
+#endif
+// Single-hash variant: output receives one 32-byte digest.
+void myriad_hash( void *output, const void *input );
+
+int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+void init_myrgr_ctx( void );
+
+#endif
+
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -0,0 +1,935 @@
+/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
+/*
+ * Hamsi implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+//#include "miner.h"
+#include "hamsi-hash-4way.h"
+
+#if defined(__AVX2__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
+ * table lookup during message expansion (1 to 8, inclusive). If we note
+ * w the number of bits per message word (w=32 for Hamsi-224/256, w=64
+ * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
+ * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
+ * then we will get t tables (where t=ceil(w/n)) of individual size
+ * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
+ * n=5, there are 7 tables, but the last one uses only two bits on
+ * input, not five).
+ *
+ * Also, we read t rows of r words from RAM. Words in a given row are
+ * concatenated in RAM in that order, so most of the cost is about
+ * reading the first row word; comparatively, cache misses are thus
+ * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
+ *
+ * When n=1, tables are "special" in that we omit the first entry of
+ * each table (which always contains 0), so that total table size is
+ * halved.
+ *
+ * We thus have the following (size1 is the cumulative table size of
+ * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
+ * are for Hamsi-224/256 and Hamsi-384/512, respectively).
+ *
+ *   n      size1      size2    t1    t2
+ * ---------------------------------------
+ *   1       1024       4096    32    64
+ *   2       2048       8192    16    32
+ *   3       2688      10880    11    22
+ *   4       4096      16384     8    16
+ *   5       6272      25600     7    13
+ *   6      10368      41984     6    11
+ *   7      16896      73856     5    10
+ *   8      32768     131072     4     8
+ *
+ * So there is a trade-off: a lower n makes the tables fit better in
+ * L1 cache, but increases the number of memory accesses. The optimal
+ * value depends on the amount of available L1 cache and the relative
+ * impact of a cache miss.
+ *
+ * Experimentally, in ideal benchmark conditions (which are not necessarily
+ * realistic with regards to L1 cache contention), it seems that n=8 is
+ * the best value on "big" architectures (those with 32 kB or more of L1
+ * cache), while n=4 is better on "small" architectures. This was tested
+ * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
+ * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
+ * (8 kB L1 cache).
+ *
+ * Note: with n=1, the 32 tables (actually implemented as one big table)
+ * are read entirely and sequentially, regardless of the input data,
+ * thus avoiding any data-dependent table access pattern.
+ */
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+//#include "hamsi-helper-4way.c"
+/* 16 words; as bytes they spell "steelpark Arenberg 10, bus 2446, B-3001 Leuven-Heverlee, Belgium". */
+static const sph_u32 IV512[] = {
+	SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
+	SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
+	SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
+	SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
+	SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
+	SPH_C32(0x6769756d)
+};
+
+static const sph_u32 alpha_n[] = {
+	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
+};
+
+static const sph_u32 alpha_f[] = {
+	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
+	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
+	SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
+	SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
+};
+
+// imported from hamsi helper
+
+/* Note: this table lists bits within each byte from least
+   significant to most significant. */
+static const sph_u32 T512[64][16] = {
+	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
+	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
+	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
+	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
+	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
+	  SPH_C32(0x9e69af68) },
+	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
+	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
+	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
+	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
+	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
+	  SPH_C32(0x0c26f262) },
+	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
+	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
+	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
+	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
+	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
+	  SPH_C32(0xdc24e61f) },
+	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
+	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
+	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
+	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
+	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
+	  SPH_C32(0x3daac2da) },
+	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
+	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
+	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
+	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
+	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
+	  SPH_C32(0x78cace29) },
+	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
+	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
+	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
+	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
+	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
+	  SPH_C32(0x2dd1f9ab) },
+	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
+	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
+	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
+	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
+	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
+	  SPH_C32(0xbf2c0be2) },
+	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
+	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
+	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
+	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
+	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
+	  SPH_C32(0x32219526) },
+	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
+	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
+	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
+	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
+	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
+	  SPH_C32(0xac8e6c88) },
+	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
+	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
+	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
+	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
+	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
+	  SPH_C32(0x7b1bd6b9) },
+	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
+	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
+	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
+	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
+	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
+	  SPH_C32(0xf746c320) },
+	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
+	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
+	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
+	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
+	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
+	  SPH_C32(0x69505b3a) },
+	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
+	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
+	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
+	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
+	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
+	  SPH_C32(0x8a341574) },
+	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
+	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
+	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
+	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
+	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
+	  SPH_C32(0x450360bf) },
+	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
+	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
+	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
+	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
+	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
+	  SPH_C32(0xf3d45758) },
+	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
+	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
+	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
+	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
+	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
+	  SPH_C32(0x925c44e9) },
+	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
+	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
+	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
+	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
+	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
+	  SPH_C32(0xa123ff9f) },
+	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
+	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
+	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
+	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
+	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
+	  SPH_C32(0x1568ff0f) },
+	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
+	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
+	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
+	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
+	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
+	  SPH_C32(0xc5c1eb3e) },
+	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
+	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
+	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
+	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
+	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
+	  SPH_C32(0x1af21fe1) },
+	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
+	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
+	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
+	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
+	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
+	  SPH_C32(0x857f3c2b) },
+	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
+	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
+	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
+	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
+	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
+	  SPH_C32(0x2ba05a55) },
+	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
+	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
+	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
+	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
+	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
+	  SPH_C32(0xfeabf254) },
+	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
+	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
+	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
+	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
+	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
+	  SPH_C32(0xfe1cdc7f) },
+	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
+	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
+	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
+	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
+	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
+	  SPH_C32(0xb0a51834) },
+	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
+	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
+	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
+	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
+	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
+	  SPH_C32(0xa6b8c28d) },
+	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
+	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
+	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
+	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
+	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
+	  SPH_C32(0x3a4e99d7) },
+	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
+	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
+	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
+	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
+	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
+	  SPH_C32(0xe1844257) },
+	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
+	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
+	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
+	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
+	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
+	  SPH_C32(0x2c3b504e) },
+	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
+	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
+	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
+	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
+	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
+	  SPH_C32(0x524a0d59) },
+	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
+	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
+	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
+	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
+	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
+	  SPH_C32(0x378dd173) },
+	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
+	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
+	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
+	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
+	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
+	  SPH_C32(0x8b6c72bd) },
+	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
+	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
+	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
+	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
+	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
+	  SPH_C32(0x8e67b7fa) },
+	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
+	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
+	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
+	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
+	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
+	  SPH_C32(0x443d3004) },
+	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
+	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
+	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
+	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
+	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
+	  SPH_C32(0xf4f6ea7b) },
+	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
+	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
+	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
+	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
+	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
+	  SPH_C32(0x979961d0) },
+	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
+	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
+	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
+	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
+	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
+	  SPH_C32(0x98aa496e) },
+	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
+	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
+	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
+	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
+	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
+	  SPH_C32(0x094e3198) },
+	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
+	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
+	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
+	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
+	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
+	  SPH_C32(0xe86cba2e) },
+	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
+	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
+	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
+	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
+	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
+	  SPH_C32(0x4b7eec55) },
+	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
+	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
+	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
+	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
+	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
+	  SPH_C32(0x1e7536a6) },
+	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
+	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
+	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
+	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
+	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
+	  SPH_C32(0x24314f17) },
+	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
+	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
+	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
+	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
+	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
+	  SPH_C32(0x9075b1ce) },
+	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
+	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
+	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
+	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
+	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
+	  SPH_C32(0x9b6ef888) },
+	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
+	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
+	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
+	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
+	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
+	  SPH_C32(0xd8b61463) },
+	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
+	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
+	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
+	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
+	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
+	  SPH_C32(0x3ea660f7) },
+	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
+	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
+	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
+	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
+	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
+	  SPH_C32(0x7f975691) },
+	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
+	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
+	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
+	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
+	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
+	  SPH_C32(0x2c94459e) },
+	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
+	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
+	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
+	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
+	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
+	  SPH_C32(0x56a7b19f) },
+	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
+	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
+	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
+	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
+	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
+	  SPH_C32(0x81fdf908) },
+	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
+	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
+	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
+	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
+	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
+	  SPH_C32(0x5bd61539) },
+	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
+	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
+	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
+	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
+	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
+	  SPH_C32(0x15b961e7) },
+	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
+	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
+	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
+	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
+	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
+	  SPH_C32(0x2a2c18f0) },
+	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
+	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
+	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
+	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
+	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
+	  SPH_C32(0x551e3d6e) },
+	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
+	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
+	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
+	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
+	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
+	  SPH_C32(0x33c5244f) },
+	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
+	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
+	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
+	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
+	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
+	  SPH_C32(0x8a58e6a4) },
+	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
+	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
+	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
+	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
+	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
+	  SPH_C32(0xda878000) },
+	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
+	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
+	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
+	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
+	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
+	  SPH_C32(0x3c5dfffe) },
+	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
+	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
+	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
+	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
+	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
+	  SPH_C32(0x7b1675d7) },
+	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
+	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
+	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
+	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
+	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
+	  SPH_C32(0x2879ebac) },
+	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
+	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
+	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
+	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
+	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
+	  SPH_C32(0xbe0a679e) },
+	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
+	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
+	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
+	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
+	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
+	  SPH_C32(0x30aebcf7) },
+	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
+	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
+	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
+	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
+	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
+	  SPH_C32(0xc7ff60f0) },
+	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
+	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
+	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
+	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
+	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
+	  SPH_C32(0xe7e00a94) }
+};
+
+#define INPUT_BIG \
+do { /* expand the 64 message bits held in each 64-bit lane of *buf into m0..m7 */ \
+  __m256i db = *buf; \
+  const sph_u32 *tp = &T512[0][0]; /* walks T512 one 16-word row per message bit */ \
+  m0 = m256_zero; \
+  m1 = m256_zero; \
+  m2 = m256_zero; \
+  m3 = m256_zero; \
+  m4 = m256_zero; \
+  m5 = m256_zero; \
+  m6 = m256_zero; \
+  m7 = m256_zero; \
+  for ( int u = 0; u < 64; u++ ) /* one pass per message bit, lowest bit of each lane first */ \
+  { \
+     __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; /* current bit; assumes m256_one_64 holds 1 in every 64-bit lane */ \
+     dm = mm256_negate_32( _mm256_or_si256( dm, \
+                         _mm256_slli_epi64( dm, 32 ) ) ); /* bit copied into both 32-bit halves, then negated: presumably a 0 / all-ones mask */ \
+     m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, /* mN ^= mask & row words 2N, 2N+1 (same pair in every lane) */ \
+                  _mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \
+                                    tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \
+     m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \
+                                    tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \
+     m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \
+                                    tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \
+     m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \
+                                    tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \
+     m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \
+                                    tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \
+     m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \
+                                    tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \
+     m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \
+                                    tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \
+     m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \
+                                    tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \
+     tp += 0x10; /* advance to the next 16-word table row */ \
+     db = _mm256_srli_epi64( db, 1 ); /* bring the next message bit down to position 0 */ \
+  } \
+} while (0)
+
+#define SBOX( a, b, c, d ) \
+do { /* bitwise substitution step on four vectors; a, b, c, d must be lvalues and are all rewritten in place */ \
+  __m256i t; \
+  t = a; /* keep the incoming a, which is overwritten on the next line */ \
+  a = _mm256_and_si256( a, c ); \
+  a = _mm256_xor_si256( a, d ); \
+  c = _mm256_xor_si256( c, b ); \
+  c = _mm256_xor_si256( c, a ); \
+  d = _mm256_or_si256( d, t ); \
+  d = _mm256_xor_si256( d, b ); \
+  t = _mm256_xor_si256( t, c ); \
+  b = d; \
+  d = _mm256_or_si256( d, t ); \
+  d = _mm256_xor_si256( d, a ); \
+  a = _mm256_and_si256( a, b ); \
+  t = _mm256_xor_si256( t, a ); \
+  b = _mm256_xor_si256( b, d ); \
+  b = _mm256_xor_si256( b, t ); \
+  a = c; /* the four results are handed back in rotated positions */ \
+  c = b; \
+  b = d; \
+  d = mm256_not( t ); /* mm256_not is defined outside this chunk; presumably a bitwise complement */ \
+} while (0)
+
+#define L( a, b, c, d ) \
+do { /* linear mixing of four vectors; a, b, c, d must be lvalues and are all rewritten in place */ \
+   a = mm256_rotl_32( a, 13 ); /* mm256_rotl_32 is defined outside this chunk; presumably rotates each 32-bit word left */ \
+   c = mm256_rotl_32( c,  3 ); \
+   b = _mm256_xor_si256( b, _mm256_xor_si256( a, c ) ); \
+   d = _mm256_xor_si256( d, _mm256_xor_si256( c, \
+                                              _mm256_slli_epi32( a, 3 ) ) ); /* plain left shift here, not a rotate */ \
+   b = mm256_rotl_32( b, 1 ); \
+   d = mm256_rotl_32( d, 7 ); \
+   a = _mm256_xor_si256( a, _mm256_xor_si256( b, d ) ); \
+   c = _mm256_xor_si256( c, _mm256_xor_si256( d, \
+                                              _mm256_slli_epi32( b, 7 ) ) ); /* plain left shift here, not a rotate */ \
+   a = mm256_rotl_32( a,  5 ); \
+   c = mm256_rotl_32( c, 22 ); \
+} while (0)
+
+#define DECL_STATE_BIG /* declares the chaining registers c0..c7; carries its own ';', so use it without one */ \
+   __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+#define READ_STATE_BIG(sc) \
+do { /* load (sc)->h[0..7] into c0..c7 (declared by DECL_STATE_BIG); sc is evaluated 8 times */ \
+   c0 = (sc)->h[0x0]; \
+   c1 = (sc)->h[0x1]; \
+   c2 = (sc)->h[0x2]; \
+   c3 = (sc)->h[0x3]; \
+   c4 = (sc)->h[0x4]; \
+   c5 = (sc)->h[0x5]; \
+   c6 = (sc)->h[0x6]; \
+   c7 = (sc)->h[0x7]; \
+} while (0)
+
+#define WRITE_STATE_BIG(sc) \
+do { /* store c0..c7 back into (sc)->h[0..7]; sc is evaluated 8 times */ \
+   (sc)->h[0x0] = c0; \
+   (sc)->h[0x1] = c1; \
+   (sc)->h[0x2] = c2; \
+   (sc)->h[0x3] = c3; \
+   (sc)->h[0x4] = c4; \
+   (sc)->h[0x5] = c5; \
+   (sc)->h[0x6] = c6; \
+   (sc)->h[0x7] = c7; \
+} while (0)
+
+#define s0   m0   /* s0..sF name the 16 round-state vectors; each is only an alias for a message (m) or chaining (c) register */
+#define s1   c0
+#define s2   m1
+#define s3   c1
+#define s4   c2
+#define s5   m2
+#define s6   c3
+#define s7   m3
+#define s8   m4   /* second half of the state: the m/c alternation starts over here */
+#define s9   c4
+#define sA   m5
+#define sB   c5
+#define sC   c6
+#define sD   m6
+#define sE   c7
+#define sF   m7
+
+#define ROUND_BIG(rc, alpha) \
+do { /* one round: XOR in the alpha constants, SBOX layer, then six L invocations (even/odd below = low/high 32-bit word of each 64-bit lane) */ \
+  __m256i t0, t1, t2, t3; \
+  s0 = _mm256_xor_si256( s0, _mm256_set_epi32( /* sK ^= (alpha[2K] in even, alpha[2K+1] in odd words); rc is mixed into alpha[1] only */ \
+        alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \
+        alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \
+  s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \
+                     alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \
+                     alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \
+  s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \
+                     alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \
+                     alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \
+  s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \
+                     alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \
+                     alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \
+  s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \
+                     alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \
+                     alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \
+  s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \
+                     alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \
+                     alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \
+  s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \
+                     alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \
+                     alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \
+  s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \
+                     alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \
+                     alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \
+  s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \
+                     alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \
+                     alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \
+  s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \
+                     alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \
+                     alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \
+  sA = _mm256_xor_si256( sA, _mm256_set_epi32( \
+                     alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \
+                     alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \
+  sB = _mm256_xor_si256( sB, _mm256_set_epi32( \
+                     alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \
+                     alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \
+  sC = _mm256_xor_si256( sC, _mm256_set_epi32( \
+                     alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \
+                     alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \
+  sD = _mm256_xor_si256( sD, _mm256_set_epi32( \
+                     alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \
+                     alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \
+  sE = _mm256_xor_si256( sE, _mm256_set_epi32( \
+                     alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \
+                     alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \
+  sF = _mm256_xor_si256( sF, _mm256_set_epi32( \
+                     alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \
+                     alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \
+  /* substitution layer: one SBOX per group of four state vectors */ \
+  SBOX( s0, s4, s8, sC ); \
+  SBOX( s1, s5, s9, sD ); \
+  SBOX( s2, s6, sA, sE ); \
+  SBOX( s3, s7, sB, sF ); \
+  /* diffusion: t1/t3 pair the odd words of one vector with the even words of the next; the blends after L scatter the results back */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
+                           _mm256_bslli_epi128( s5, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
+                           _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
+  L( s0, t1, s9, t3 ); \
+  s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  /* same gather / L / scatter pattern: s5.odd+s6.even and sE.odd+sF.even */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
+                           _mm256_bslli_epi128( s6, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
+                           _mm256_bslli_epi128( sF, 4 ), 0xAA ); \
+  L( s1, t1, sA, t3 ); \
+  s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  /* same pattern: s6.odd+s7.even and sF.odd+sC.even */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
+                           _mm256_bslli_epi128( s7, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
+                           _mm256_bslli_epi128( sC, 4 ), 0xAA ); \
+  L( s2, t1, sB, t3 ); \
+  s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  /* same pattern, wrapping around: s7.odd+s4.even and sC.odd+sD.even */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
+                           _mm256_bslli_epi128( s4, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
+                           _mm256_bslli_epi128( sD, 4 ), 0xAA ); \
+  L( s3, t1, s8, t3 ); \
+  s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  /* t0..t3 = (s0.even,s8.even) (s1.even,s9.odd) (s2.odd,sA.odd) (s3.odd,sB.even), mixed by L and written back */ \
+  t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
+  t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
+  t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
+                           _mm256_bslli_epi128( sB, 4 ), 0xAA ); \
+  L( t0, t1, t2, t3 ); \
+  s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
+  s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
+  s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
+  s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
+  s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
+  sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
+  s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
+  sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
+  /* t0..t3 = (s4.odd,sC.odd) (s5.odd,sD.even) (s6.even,sE.even) (s7.even,sF.odd), mixed by L and written back */ \
+  t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
+                           _mm256_bslli_epi128( sD, 4 ), 0xAA ); \
+  t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
+  L( t0, t1, t2, t3 ); \
+  s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
+  sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
+  s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
+  sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
+  s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
+  sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
+  s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
+  sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
+} while (0)
+
+#define P_BIG \
+do { /* six rounds using the alpha_n constants, round counter 0..5 */ \
+   ROUND_BIG(0, alpha_n); \
+   ROUND_BIG(1, alpha_n); \
+   ROUND_BIG(2, alpha_n); \
+   ROUND_BIG(3, alpha_n); \
+   ROUND_BIG(4, alpha_n); \
+   ROUND_BIG(5, alpha_n); \
+} while (0)
+
+#define PF_BIG \
+do { /* twelve rounds using the alpha_f constants, round counter 0..11; used by hamsi_big_final */ \
+   ROUND_BIG( 0, alpha_f); \
+   ROUND_BIG( 1, alpha_f); \
+   ROUND_BIG( 2, alpha_f); \
+   ROUND_BIG( 3, alpha_f); \
+   ROUND_BIG( 4, alpha_f); \
+   ROUND_BIG( 5, alpha_f); \
+   ROUND_BIG( 6, alpha_f); \
+   ROUND_BIG( 7, alpha_f); \
+   ROUND_BIG( 8, alpha_f); \
+   ROUND_BIG( 9, alpha_f); \
+   ROUND_BIG(10, alpha_f); \
+   ROUND_BIG(11, alpha_f); \
+} while (0)
+
+#define T_BIG \
+do { /* order is important: sB, s9, s3, s1 alias c5, c4, c1, c0, so each of those must be read before the line that overwrites it */ \
+   c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
+   c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
+   c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
+   c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
+   c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
+   c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
+   c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
+   c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
+} while (0)
+
+// Absorb num consecutive message vectors starting at buf: the bit counter grows
+// by 64 per vector and each vector is mixed into the state with P_BIG.
+void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
+{
+   DECL_STATE_BIG
+   const sph_u32 added_bits = SPH_T32( (sph_u32)num << 6 );
+
+   // The bit count is kept as two 32-bit halves; carry when the low half wraps.
+   sc->count_high += (sph_u32)( (num >> 13) >> 13 );
+   sc->count_low = SPH_T32( sc->count_low + added_bits );
+   if ( sc->count_low < added_bits )
+      sc->count_high++;
+
+   READ_STATE_BIG( sc );
+   for ( ; num > 0; num--, buf++ )
+   {
+      __m256i m0, m1, m2, m3, m4, m5, m6, m7;
+      INPUT_BIG;   // expand *buf into m0..m7
+      P_BIG;       // round function over the combined m/c state
+      T_BIG;       // fold the result into sc->h and c0..c7
+   }
+   WRITE_STATE_BIG( sc );
+}
+
+void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
+{  // Final compression of one vector: like one hamsi_big step but with PF_BIG; the bit counters are not touched.
+   __m256i m0, m1, m2, m3, m4, m5, m6, m7;
+   DECL_STATE_BIG
+   READ_STATE_BIG( sc );
+   INPUT_BIG;   // expands *buf into m0..m7
+   PF_BIG;      // twelve-round variant of the permutation
+   T_BIG;       // folds the result into sc->h and c0..c7
+   WRITE_STATE_BIG( sc );
+}
+
+// Prepare a context for a new message: zero the counters and load IV512.
+void hamsi512_4way_init( hamsi_4way_big_context *sc )
+{
+   sc->count_low = sc->count_high = 0;
+   sc->partial_len = 0;
+   // h[k] holds IV512[2k] in the even and IV512[2k+1] in the odd 32-bit slots,
+   // repeated across the whole vector.
+   for ( int k = 0; k < 8; k++ )
+   {
+      const sph_u32 w0 = 2*k, w1 = w0 + 1;
+      sc->h[k] = _mm256_set_epi32( IV512[w1], IV512[w0], IV512[w1], IV512[w0],
+                                   IV512[w1], IV512[w0], IV512[w1], IV512[w0] );
+   }
+}
+
+void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
+{
+   __m256i *vdata = (__m256i*)data;
+   const size_t blocks = len >> 3;   // whole 8-byte blocks, one __m256i each
+// Absorbs len bytes, consuming one vector per 8 bytes. A tail shorter than
+// 8 bytes is not hashed here; only its length is recorded in partial_len.
+// The old partial-block path (reachable only after a sub-block call) is disabled.
+/*
+   if ( sc->partial_len != 0 )
+   {
+      size_t mlen;
+
+      mlen = 8 - sc->partial_len;
+      if ( len < mlen )
+      {
+         memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 );
+         sc->partial_len += len;
+         return;
+      }
+      else
+      {
+         memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 );
+         len -= mlen;
+         vdata += mlen>>3;
+         hamsi_big( sc, sc->partial, 1 );
+         sc->partial_len = 0;
+      }
+   }
+*/
+   // (len & ~7) >> 3 equals len >> 3, so one block count serves both steps.
+   hamsi_big( sc, vdata, blocks );
+   vdata += blocks;
+   len &= (size_t)7;
+   memcpy_256( sc->buf, vdata, len>>3 );   // len < 8 here, so the count is always 0
+   sc->partial_len = len;
+}
+
+// Finish the hash: absorb a 0x80 padding vector, then the encoded bit length via
+// hamsi_big_final, and write the 8 state vectors to dst through mm256_bswap_32.
+void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
+{
+   __m256i *digest = (__m256i*)dst;
+   int be_hi, be_lo;   // sph_enc32be output for the two bit-count halves
+   sph_enc32be( &be_hi, sc->count_high );
+   sph_enc32be( &be_lo, sc->count_low + ( sc->partial_len << 3 ) );
+   __m256i len_block = _mm256_set_epi32( be_lo, be_hi, be_lo, be_hi,
+                                         be_lo, be_hi, be_lo, be_hi );
+   sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
+                                  0UL, 0x80UL, 0UL, 0x80UL );
+   hamsi_big( sc, sc->buf, 1 );
+   hamsi_big_final( sc, &len_block );
+   for ( int i = 0; i < 8; i++ )
+      digest[i] = mm256_bswap_32( sc->h[i] );
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -0,0 +1,72 @@
+/* $Id: sph_hamsi.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Hamsi interface. This code implements Hamsi with the recommended
+ * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     hamsi-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef HAMSI_4WAY_H__
+#define HAMSI_4WAY_H__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+
+#if defined (__AVX__)   // NOTE(review): the implementation uses 256-bit integer intrinsics (e.g. _mm256_blend_epi32), which are AVX2; confirm this guard should not be __AVX2__
+
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define SPH_SIZE_hamsi512   512
+
+// Hashing state for the 4-way Hamsi-512 functions declared below.
+// partial_len is deprecated: the implementation only hashes whole 8-byte blocks.
+typedef struct {
+   __m256i h[8];          // chaining value, XOR-updated after every absorbed vector
+   __m256i buf[1];        // single block vector; a one-element array so it can be passed as a pointer
+   size_t partial_len;    // len & 7 from the last update call; added (in bits) to the encoded length at close
+   sph_u32 count_high, count_low;   // bit count, split into two 32-bit halves
+} hamsi_4way_big_context;
+
+typedef hamsi_4way_big_context hamsi512_4way_context;
+
+void hamsi512_4way_init( hamsi512_4way_context *sc );   // zero the counters and load the initial value
+void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );   // absorb len>>3 vectors from data
+void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );   // pad, finalize, write 8 __m256i to dst
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/haval/haval-4way-helper.c
+++ b/algo/haval/haval-4way-helper.c
@@ -0,0 +1,115 @@
+/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
+/*
+ * Helper code, included by the 4-way HAVAL implementation (once, with PASSES = 5).
+ *
+ * TODO: try to merge this with md_helper.c.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)   a ## b
+/* Absorb len bytes per lane into sc; data is read as __m128i words (one 32-bit word for each of 4 lanes). */
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
+( haval_4way_context *sc, const void *data, size_t len )
+{
+   __m128i *vdata = (__m128i*)data;
+   unsigned current;
+   // current: per-lane byte offset already filled in the 128-byte block
+   current = (unsigned)sc->count_low & 127U;
+   while ( len > 0 )
+   {
+      unsigned clen;
+      sph_u32 clow, clow2;
+      // clen: bytes consumed this round, capped at the space left in the block
+      clen = 128U - current;
+      if ( clen > len )
+         clen = len;
+      memcpy_128( sc->buf + (current>>2), vdata, clen>>2 );  // presumably copies clen>>2 vectors; assumes len % 4 == 0 - TODO confirm
+      vdata += clen>>2;
+      current += clen;
+      len -= clen;
+      if ( current == 128U )
+      {
+         DSTATE;
+         IN_PREPARE(sc->buf);
+         RSTATE;
+         SPH_XCAT(CORE, PASSES)(INW);  // block is full: run the core on it
+         WSTATE;
+         current = 0;
+      }
+      clow = sc->count_low;
+      clow2 = SPH_T32(clow + clen);
+      sc->count_low = clow2;
+      if ( clow2 < clow )  // low counter went backwards, i.e. it wrapped: carry into count_high
+         sc->count_high ++;
+   }
+}
+/* Finalize: append the padding word and the 12-byte trailer, run the core on the last block(s), write the state to dst. */
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
+                                                void *dst)
+{
+   unsigned current;
+   DSTATE;
+   // current: per-lane byte offset of the next free position in the 128-byte block
+   current = (unsigned)sc->count_low & 127UL;
+   // the padding marker takes a whole 32-bit word per lane (m128_one_32 is defined outside this file)
+   sc->buf[ current>>2 ] = m128_one_32;
+   current += 4;   
+   RSTATE;
+   if ( current > 116UL )  // trailer (offsets 116..127) does not fit: pad out this block and process it first
+   {
+      memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
+      do
+      {
+         IN_PREPARE(sc->buf);
+         SPH_XCAT(CORE, PASSES)(INW);
+      } while (0);
+      current = 0;
+   }
+   // trailer: word 116 holds t1 in bits 16..23 and t2 in bits 24..31; words 120 and 124 hold the byte count * 8
+   uint32_t t1, t2;
+   memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
+   t1 = 0x01 | (PASSES << 3);
+   t2 = sc->olen << 3;
+   sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
+   sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 );
+   sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3)
+                                     | (sc->count_low >> 29) );
+   do
+   {
+      IN_PREPARE(sc->buf);
+      SPH_XCAT(CORE, PASSES)(INW);
+   } while (0);
+   WSTATE;
+   haval_4way_out( sc, dst );
+}
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -0,0 +1,522 @@
+/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * HAVAL implementation.
+ *
+ * The HAVAL reference paper is of questionable clarity with regards to
+ * some details such as endianness of bits within a byte, bytes within
+ * a 32-bit word, or the actual ordering of words within a stream of
+ * words. This implementation has been made compatible with the reference
+ * implementation available on: http://labs.calyptix.com/haval.php
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include "haval-hash-4way.h"
+
+#if defined (__AVX__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL
+#define SPH_SMALL_FOOTPRINT_HAVAL   1
+//#endif
+/* F1, F2: boolean functions of seven __m128i inputs, built from SSE2 and/xor/andnot intrinsics (no trailing '\' on the last line). */
+#define F1(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( x0, \
+       _mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \
+                      _mm_xor_si128( _mm_and_si128( x2, x5 ), \
+                                     _mm_and_si128( x3, x6 ) ) ) )
+
+#define F2(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( \
+      _mm_and_si128( x2, \
+         _mm_xor_si128( _mm_andnot_si128( x3, x1 ), \
+                        _mm_xor_si128( _mm_and_si128( x4, x5 ), \
+                                       _mm_xor_si128( x6, x0 ) ) ) ), \
+         _mm_xor_si128( \
+             _mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \
+             _mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) )
+
+#define F3(x6, x5, x4, x3, x2, x1, x0) \
+  _mm_xor_si128( \
+    _mm_and_si128( x3, \
+      _mm_xor_si128( _mm_and_si128( x1, x2 ), \
+                     _mm_xor_si128( x6, x0 ) ) ), \
+      _mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \
+                                   _mm_and_si128( x2, x5 ) ), x0 ) )
+
+#define F4(x6, x5, x4, x3, x2, x1, x0) \
+  _mm_xor_si128( \
+     _mm_xor_si128( \
+        _mm_and_si128( x3, \
+           _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
+                                         _mm_or_si128( x4, x6 ) ), x5 ) ), \
+        _mm_and_si128( x4, \
+           _mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm_not(x2), x5 ), \
+                          _mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
+     _mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )
+
+
+#define F5(x6, x5, x4, x3, x2, x1, x0) \
+   _mm_xor_si128( \
+       _mm_and_si128( x0, \
+            mm_not( _mm_xor_si128( \
+                    _mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \
+      _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
+                                    _mm_and_si128( x2, x5 ) ), \
+                                    _mm_and_si128( x3, x6 ) ) )
+
+/*
+ * The macros below integrate the phi() permutations, depending on the
+ * pass and the total number of passes.
+ */
+
+#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x1, x0, x3, x5, x6, x2, x4)
+#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x4, x2, x1, x0, x5, x3, x6)
+#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x6, x1, x2, x3, x4, x5, x0)
+
+#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x2, x6, x1, x4, x5, x3, x0)
+#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x3, x5, x2, x0, x1, x6, x4)
+#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x1, x4, x3, x6, x0, x2, x5)
+#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x6, x4, x0, x5, x2, x1, x3)
+
+#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \
+	F1(x3, x4, x1, x0, x5, x2, x6)
+#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \
+	F2(x6, x2, x1, x0, x3, x4, x5)
+#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \
+	F3(x2, x6, x0, x4, x3, x1, x5)
+#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \
+	F4(x1, x5, x3, x2, x0, x4, x6)
+#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \
+	F5(x2, x5, x0, x6, x4, x3, x1)
+
+/*
+ * One step, for "n" passes, pass number "p" (1 <= p <= n), using
+ * input word number "w" and step constant "c".
+ */
+#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
+do { \
+   __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
+   x7 = _mm_add_epi32( _mm_add_epi32( mm_rotr_32( t, 7 ), \
+                                      mm_rotr_32( x7, 11 ) ), \
+                       _mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
+} while (0)
+
+/*
+ * PASSy(n, in) computes pass number "y", for a total of "n", using the
+ * one-argument macro "in" to access input words. Current state is assumed
+ * to be held in variables "s0" to "s7".
+ */
+
+//#if SPH_SMALL_FOOTPRINT_HAVAL
+
+#define PASS1(n, in)   do { \
+		unsigned pass_count; \
+		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+			STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(pass_count + 0), SPH_C32(0x00000000)); \
+			STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(pass_count + 1), SPH_C32(0x00000000)); \
+			STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(pass_count + 2), SPH_C32(0x00000000)); \
+			STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(pass_count + 3), SPH_C32(0x00000000)); \
+			STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(pass_count + 4), SPH_C32(0x00000000)); \
+			STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(pass_count + 5), SPH_C32(0x00000000)); \
+			STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(pass_count + 6), SPH_C32(0x00000000)); \
+			STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(pass_count + 7), SPH_C32(0x00000000)); \
+   		} \
+	} while (0)
+
+#define PASSG(p, n, in)   do { \
+		unsigned pass_count; \
+		for (pass_count = 0; pass_count < 32; pass_count += 8) { \
+			STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \
+				in(MP ## p[pass_count + 0]), \
+				RK ## p[pass_count + 0]); \
+			STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \
+				in(MP ## p[pass_count + 1]), \
+				RK ## p[pass_count + 1]); \
+			STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \
+				in(MP ## p[pass_count + 2]), \
+				RK ## p[pass_count + 2]); \
+			STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \
+				in(MP ## p[pass_count + 3]), \
+				RK ## p[pass_count + 3]); \
+			STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \
+				in(MP ## p[pass_count + 4]), \
+				RK ## p[pass_count + 4]); \
+			STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \
+				in(MP ## p[pass_count + 5]), \
+				RK ## p[pass_count + 5]); \
+			STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \
+				in(MP ## p[pass_count + 6]), \
+				RK ## p[pass_count + 6]); \
+			STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \
+				in(MP ## p[pass_count + 7]), \
+				RK ## p[pass_count + 7]); \
+   		} \
+	} while (0)
+
+#define PASS2(n, in)    PASSG(2, n, in)
+#define PASS3(n, in)    PASSG(3, n, in)
+#define PASS4(n, in)    PASSG(4, n, in)
+#define PASS5(n, in)    PASSG(5, n, in)
+
+static const unsigned MP2[32] = {
+	 5, 14, 26, 18, 11, 28,  7, 16,
+	 0, 23, 20, 22,  1, 10,  4,  8,
+	30,  3, 21,  9, 17, 24, 29,  6,
+	19, 12, 15, 13,  2, 25, 31, 27
+};
+
+static const unsigned MP3[32] = {
+	19,  9,  4, 20, 28, 17,  8, 22,
+	29, 14, 25, 12, 24, 30, 16, 26,
+	31, 15,  7,  3,  1,  0, 18, 27,
+	13,  6, 21, 10, 23, 11,  5,  2
+};
+
+static const unsigned MP4[32] = {
+	24,  4,  0, 14,  2,  7, 28, 23,
+	26,  6, 30, 20, 18, 25, 19,  3,
+	22, 11, 31, 21,  8, 27, 12,  9,
+	 1, 29,  5, 15, 17, 10, 16, 13
+};
+
+static const unsigned MP5[32] = {
+	27,  3, 21, 26, 17, 11, 20, 29,
+	19,  0, 12,  7, 13,  8, 31, 10,
+	 5,  9, 14, 30, 18,  6, 28, 24,
+	 2, 23, 16, 22,  4,  1, 25, 15
+};
+
+static const sph_u32 RK2[32] = {
+	SPH_C32(0x452821E6), SPH_C32(0x38D01377),
+	SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C),
+	SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD),
+	SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917),
+	SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B),
+	SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC),
+	SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7),
+	SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96),
+	SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99),
+	SPH_C32(0x24A19947), SPH_C32(0xB3916CF7),
+	SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16),
+	SPH_C32(0x636920D8), SPH_C32(0x71574E69),
+	SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E),
+	SPH_C32(0x0D95748F), SPH_C32(0x728EB658),
+	SPH_C32(0x718BCD58), SPH_C32(0x82154AEE),
+	SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5)
+};
+
+static const sph_u32 RK3[32] = {
+	SPH_C32(0x9C30D539), SPH_C32(0x2AF26013),
+	SPH_C32(0xC5D1B023), SPH_C32(0x286085F0),
+	SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF),
+	SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E),
+	SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E),
+	SPH_C32(0xD71577C1), SPH_C32(0xBD314B27),
+	SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60),
+	SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94),
+	SPH_C32(0x57489862), SPH_C32(0x63E81440),
+	SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6),
+	SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE),
+	SPH_C32(0xA15486AF), SPH_C32(0x7C72E993),
+	SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A),
+	SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6),
+	SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E),
+	SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C)
+};
+
+static const sph_u32 RK4[32] = {
+	SPH_C32(0x7A325381), SPH_C32(0x28958677),
+	SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF),
+	SPH_C32(0xC4BFE81B), SPH_C32(0x66282193),
+	SPH_C32(0x61D809CC), SPH_C32(0xFB21A991),
+	SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032),
+	SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1),
+	SPH_C32(0xDC262302), SPH_C32(0xEB651B88),
+	SPH_C32(0x23893E81), SPH_C32(0xD396ACC5),
+	SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239),
+	SPH_C32(0x2E0B4482), SPH_C32(0xA4842004),
+	SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E),
+	SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A),
+	SPH_C32(0x670C9C61), SPH_C32(0xABD388F0),
+	SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68),
+	SPH_C32(0x960FA728), SPH_C32(0xAB5133A3),
+	SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4)
+};
+
+static const sph_u32 RK5[32] = {
+	SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98),
+	SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176),
+	SPH_C32(0x66CA593E), SPH_C32(0x82430E88),
+	SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4),
+	SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE),
+	SPH_C32(0xE06F75D8), SPH_C32(0x85C12073),
+	SPH_C32(0x401A449F), SPH_C32(0x56C16AA6),
+	SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706),
+	SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D),
+	SPH_C32(0x37D0D724), SPH_C32(0xD00A1248),
+	SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B),
+	SPH_C32(0x075372C9), SPH_C32(0x80991B7B),
+	SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7),
+	SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B),
+	SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA),
+	SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4)
+};
+
+#define SAVE_STATE \
+   __m128i u0, u1, u2, u3, u4, u5, u6, u7; \
+   do { \
+      u0 = s0; \
+      u1 = s1; \
+      u2 = s2; \
+      u3 = s3; \
+      u4 = s4; \
+      u5 = s5; \
+      u6 = s6; \
+      u7 = s7; \
+   } while (0)
+
+#define UPDATE_STATE \
+do { \
+   s0 = _mm_add_epi32( s0, u0 ); \
+   s1 = _mm_add_epi32( s1, u1 ); \
+   s2 = _mm_add_epi32( s2, u2 ); \
+   s3 = _mm_add_epi32( s3, u3 ); \
+   s4 = _mm_add_epi32( s4, u4 ); \
+   s5 = _mm_add_epi32( s5, u5 ); \
+   s6 = _mm_add_epi32( s6, u6 ); \
+   s7 = _mm_add_epi32( s7, u7 ); \
+} while (0)
+
+/*
+ * COREn(in) performs the core HAVAL computation for "n" passes, using
+ * the one-argument macro "in" to access the input words. Running state
+ * is held in variable "s0" to "s7".
+ */
+/*
+#define CORE3(in)  do { \
+		SAVE_STATE; \
+		PASS1(3, in); \
+		PASS2(3, in); \
+		PASS3(3, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+#define CORE4(in)  do { \
+		SAVE_STATE; \
+		PASS1(4, in); \
+		PASS2(4, in); \
+		PASS3(4, in); \
+		PASS4(4, in); \
+		UPDATE_STATE; \
+	} while (0)
+*/
+#define CORE5(in)  do { \
+		SAVE_STATE; \
+		PASS1(5, in); \
+		PASS2(5, in); \
+		PASS3(5, in); \
+		PASS4(5, in); \
+		PASS5(5, in); \
+		UPDATE_STATE; \
+	} while (0)
+
+/*
+ * DSTATE declares the state variables "s0" to "s7".
+ */
+#define DSTATE   __m128i s0, s1, s2, s3, s4, s5, s6, s7
+
+/*
+ * RSTATE fills the state variables from the context "sc".
+ */
+#define RSTATE \
+do { \
+   s0 = sc->s0; \
+   s1 = sc->s1; \
+   s2 = sc->s2; \
+   s3 = sc->s3; \
+   s4 = sc->s4; \
+   s5 = sc->s5; \
+   s6 = sc->s6; \
+   s7 = sc->s7; \
+} while (0)
+
+/*
+ * WSTATE updates the context "sc" from the state variables.
+ */
+#define WSTATE \
+do { \
+   sc->s0 = s0; \
+   sc->s1 = s1; \
+   sc->s2 = s2; \
+   sc->s3 = s3; \
+   sc->s4 = s4; \
+   sc->s5 = s5; \
+   sc->s6 = s6; \
+   sc->s7 = s7; \
+} while (0)
+
+/*
+ * Initialize a context. "olen" is the output length, in 32-bit words
+ * (between 4 and 8, inclusive). "passes" is the number of passes
+ * (3, 4 or 5); it is only recorded here, the core is chosen by PASSES.
+ */
+static void
+haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes )
+{
+   sc->s0 = _mm_set1_epi32( 0x243F6A88UL );   // same initial value in every 32-bit lane
+   sc->s1 = _mm_set1_epi32( 0x85A308D3UL );
+   sc->s2 = _mm_set1_epi32( 0x13198A2EUL );
+   sc->s3 = _mm_set1_epi32( 0x03707344UL );
+   sc->s4 = _mm_set1_epi32( 0xA4093822UL );
+   sc->s5 = _mm_set1_epi32( 0x299F31D0UL );
+   sc->s6 = _mm_set1_epi32( 0x082EFA98UL );
+   sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL );
+   sc->olen = olen;       // read back by the close function when building the trailer
+   sc->passes = passes;   // stored only; not read by the code in this file
+   sc->count_high = 0;    // byte counter (high/low halves) starts at zero
+   sc->count_low = 0;
+	
+}
+
+#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata)
+
+#define INW(i)   load_ptr[ i ] 
+
+/*
+ * Write out HAVAL output: the eight state vectors s0..s7 are copied to
+ * dst (8 x __m128i) unconditionally; sc->olen is not consulted here.
+ */
+static void
+haval_4way_out( haval_4way_context *sc, void *dst )
+{
+   __m128i *buf = (__m128i*)dst;
+   DSTATE;
+   RSTATE;
+   // NOTE(review): dst is written through an __m128i pointer - presumably must be 16-byte aligned; confirm with callers
+   buf[0] = s0;
+   buf[1] = s1;
+   buf[2] = s2;
+   buf[3] = s3;
+   buf[4] = s4;
+   buf[5] = s5;
+   buf[6] = s6;
+   buf[7] = s7;
+}
+
+/*
+ * The main core functions inline the code with the COREx() macros. We
+ * use a helper file, included once per enabled PASSES value (only 5 here).
+ */
+/*
+#undef PASSES
+#define PASSES   3
+#include "haval-helper.c"
+
+#undef PASSES
+#define PASSES   4
+#include "haval-helper.c"
+*/
+
+#undef PASSES
+#define PASSES   5
+#include "haval-4way-helper.c"
+
+/* ====================================================================== */
+/* API(xxx, y): defines haval<xxx>_<y>_4way_init / _4way / _4way_close; xxx = output bits (passed on as xxx >> 5 words), y = passes. */
+#define API(xxx, y) \
+void \
+haval ## xxx ## _ ## y ## _4way_init(void *cc) \
+{ \
+	haval_4way_init(cc, xxx >> 5, y); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \
+{ \
+	haval ## y ## _4way(cc, data, len); \
+} \
+ \
+void \
+haval ## xxx ## _ ## y ## _4way_close(void *cc, void *dst) \
+{ \
+	haval ## y ## _4way_close(cc, dst); \
+}
+
+API(256, 5)
+/* RVAL/WVAL/INMSG below are not used anywhere in this file; they expect arrays named "val" and "msg" in scope. */
+#define RVAL \
+do { \
+   s0 = val[0]; \
+   s1 = val[1]; \
+   s2 = val[2]; \
+   s3 = val[3]; \
+   s4 = val[4]; \
+   s5 = val[5]; \
+   s6 = val[6]; \
+   s7 = val[7]; \
+} while (0)
+
+#define WVAL \
+do { \
+   val[0] = s0; \
+   val[1] = s1; \
+   val[2] = s2; \
+   val[3] = s3; \
+   val[4] = s4; \
+   val[5] = s5; \
+   val[6] = s6; \
+   val[7] = s7; \
+} while (0)
+
+#define INMSG(i)   msg[i]
+
+#ifdef __cplusplus
+}
+#endif	
+#endif
--- a/algo/haval/haval-hash-4way.h
+++ b/algo/haval/haval-hash-4way.h
@@ -0,0 +1,95 @@
+/* $Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */
+/**
+ * HAVAL interface.
+ *
+ * HAVAL is actually a family of 15 hash functions, depending on whether
+ * the internal computation uses 3, 4 or 5 passes, and on the output
+ * length, which is 128, 160, 192, 224 or 256 bits. This 4-way header
+ * declares only the 256-bit, 5-pass variant (haval256_5_4way_*); the
+ * other 14 variants are not provided here. Note that output
+ * lengths other than 256 bits are not obtained by a simple truncation
+ * of a longer result; the requested length is encoded within the
+ * padding data.
+ *
+ * HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer
+ * Seberry: "HAVAL -- a one-way hashing algorithm with variable length
+ * of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in
+ * Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993.
+ *
+ * This paper, and a reference implementation, are available on the
+ * Calyptix web site: http://labs.calyptix.com/haval.php
+ *
+ * The HAVAL reference paper is quite unclear on the data encoding
+ * details, i.e. endianness (both byte order within a 32-bit word, and
+ * word order within a message block). This implementation has been
+ * made compatible with the reference implementation referenced above.
+ *
+ * @warning   A collision for HAVAL-128/3 (HAVAL with three passes and
+ * 128-bit output) has been published; this function is thus considered
+ * as cryptographically broken. The status for other variants is unclear;
+ * use only with care.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     haval-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef HAVAL_HASH_4WAY_H__
+#define HAVAL_HASH_4WAY_H__
+
+#if defined(__AVX__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_haval256_5   256
+
+typedef struct {
+   __m128i buf[32];                          // input block: 32 vectors, i.e. 128 bytes for each of the 4 lanes
+   __m128i s0, s1, s2, s3, s4, s5, s6, s7;   // running state, one 32-bit word per lane in each vector
+   unsigned olen, passes;                    // output length in 32-bit words; number of passes (set by init)
+   sph_u32 count_high, count_low;            // bytes absorbed so far, split into high and low 32-bit halves
+} haval_4way_context;
+
+typedef haval_4way_context haval256_5_4way_context;
+
+void haval256_5_4way_init( void *cc );
+
+void haval256_5_4way( void *cc, const void *data, size_t len );
+
+void haval256_5_4way_close( void *cc, void *dst );
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+#endif
--- a/algo/heavy/bastion.c
+++ b/algo/heavy/bastion.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdio.h>
@@ -16,7 +15,7 @@
 #include "algo/shabal/sph_shabal.h"
 #include "algo/echo/sph_echo.h"
 #include "algo/hamsi/sph_hamsi.h"
-#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/luffa/luffa_for_sse2.h"
 #include "algo/skein/sse2/skein.c"

 #ifndef NO_AES_NI
--- a/algo/heavy/heavy.c
+++ b/algo/heavy/heavy.c
@@ -2,7 +2,6 @@
 #include <openssl/sha.h>
 #include <stdint.h>

-#include "miner.h"
 #include "algo-gate-api.h"
 #include "sph_hefty1.h"
 #include "algo/keccak/sph_keccak.h"
--- a/algo/hodl/hodl-gate.c
+++ b/algo/hodl/hodl-gate.c
@@ -1,10 +1,7 @@
 #include <memory.h>
 #include <stdlib.h>

-#include "miner.h"
-//#include "algo-gate-api.h"
 #include "hodl-gate.h"
-//#include "hodl.h"
 #include "hodl-wolf.h"

 #define HODL_NSTARTLOC_INDEX 20
@@ -97,21 +94,20 @@ bool hodl_do_this_thread( int thr_id )
 int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce,
                   uint64_t *hashes_done )
 {
-#ifdef NO_AES_NI
-  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
-  return false;
-//  GetPsuedoRandomData( hodl_scratchbuf, work->data, thr_id );
-//  pthread_barrier_wait( &hodl_barrier );
-//  return scanhash_hodl( thr_id, work, max_nonce, hashes_done );
-#else
-  GenRandomGarbage( hodl_scratchbuf, work->data, thr_id );
+#ifndef NO_AES_NI
+  GenRandomGarbage( (CacheEntry*)hodl_scratchbuf, work->data, thr_id );
  pthread_barrier_wait( &hodl_barrier );
  return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done );
 #endif
+  return false;
 }

 bool register_hodl_algo( algo_gate_t* gate )
 {
+#ifdef NO_AES_NI
+  applog( LOG_ERR, "Only CPUs with AES are supported, use legacy version.");
+  return false;
+#endif
  pthread_barrier_init( &hodl_barrier, NULL, opt_n_threads );
  gate->optimizations         = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash              = (void*)&hodl_scanhash;
--- a/algo/hodl/hodl-wolf.c
+++ b/algo/hodl/hodl-wolf.c
@@ -150,6 +150,9 @@ int scanhash_hodl_wolf( int threadNumber, struct work* work, uint32_t max_nonce,
        int searchNumber = COMPARE_SIZE / opt_n_threads;
        int startLoc = threadNumber * searchNumber;

+        if ( opt_debug )
+           applog( LOG_DEBUG,"Hash target= %08lx", ptarget[7] );
+
        for(int32_t k = startLoc; k < startLoc + searchNumber && !work_restart[threadNumber].restart; k++)
        {
           // copy data to first l2 cache
--- a/algo/hodl/sha512_avx.c
+++ b/algo/hodl/sha512_avx.c
@@ -4,6 +4,11 @@
 //Dependencies
 #include <string.h>
 #include <stdlib.h>
+
+#ifdef __FreeBSD__
+#include <sys/endian.h>
+#endif 
+
 #include "tmmintrin.h"
 #include "smmintrin.h"

--- a/algo/hodl/sha512_avx2.c
+++ b/algo/hodl/sha512_avx2.c
@@ -3,6 +3,11 @@
 //Dependencies
 #include <string.h>
 #include <stdlib.h>
+
+#ifdef __FreeBSD__
+#include <sys/endian.h>
+#endif 
+
 #include "tmmintrin.h"
 #include "smmintrin.h"
 #include "immintrin.h"
--- a/algo/jh/jh-hash-4way.c
+++ b/algo/jh/jh-hash-4way.c
@@ -0,0 +1,609 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifdef __AVX2__
+
+#include <stddef.h>
+#include <string.h>
+
+#include "jh-hash-4way.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH   1
+#endif
+
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64   1
+#endif
+
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness; we use some
+ * byte-swapping macros for that.
+ */
+
+#if SPH_LITTLE_ENDIAN
+
+#if SPH_64
+#define C64e(x)     ((SPH_C64(x) >> 56) \
+                    | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \
+                    | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \
+                    | ((SPH_C64(x) >>  8) & SPH_C64(0x00000000FF000000)) \
+                    | ((SPH_C64(x) <<  8) & SPH_C64(0x000000FF00000000)) \
+                    | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \
+                    | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \
+                    | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000)))
+#define dec64e_aligned   sph_dec64le_aligned
+#define enc64e           sph_enc64le
+#endif
+
+#else
+
+#if SPH_64
+#define C64e(x)     SPH_C64(x)
+#define dec64e_aligned   sph_dec64be_aligned
+#define enc64e           sph_enc64be
+#endif
+
+#endif
+/* Sb: substitution step on the vectors x0..x3 (updated in place) with 64-bit constant c. */
+#define Sb(x0, x1, x2, x3, c) \
+do { /* `tmp` is not declared here: a __m256i tmp must be in scope (DECL_STATE has one) */ \
+   __m256i cc = _mm256_set_epi64x( c, c, c, c ); /* c replicated into all four lanes */ \
+    x3 = mm256_not( x3 ); \
+    x0 = _mm256_xor_si256( x0, _mm256_andnot_si256( x2, cc ) ); \
+    tmp = _mm256_xor_si256( cc, _mm256_and_si256( x0, x1 ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_and_si256( x2, x3 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_andnot_si256( x1, x2 ) ); \
+    x1 = _mm256_xor_si256( x1, _mm256_and_si256( x0, x2 ) ); \
+    x2 = _mm256_xor_si256( x2, _mm256_andnot_si256( x3, x0 ) ); \
+    x0 = _mm256_xor_si256( x0, _mm256_or_si256( x1, x3 ) ); \
+    x3 = _mm256_xor_si256( x3, _mm256_and_si256( x1, x2 ) ); \
+    x1 = _mm256_xor_si256( x1, _mm256_and_si256( tmp, x0 ) ); \
+    x2 = _mm256_xor_si256( x2, tmp ); \
+} while (0)
+/* Lb: XOR-only mixing; x4..x7 are updated from x0..x3 first, then x0..x3 from the new x4..x7. */
+#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) \
+do { \
+    x4 = _mm256_xor_si256( x4, x1 ); \
+    x5 = _mm256_xor_si256( x5, x2 ); \
+    x6 = _mm256_xor_si256( x6, _mm256_xor_si256( x3, x0 ) ); \
+    x7 = _mm256_xor_si256( x7, x0 ); \
+    x0 = _mm256_xor_si256( x0, x5 ); \
+    x1 = _mm256_xor_si256( x1, x6 ); \
+    x2 = _mm256_xor_si256( x2, _mm256_xor_si256( x7, x4 ) ); \
+    x3 = _mm256_xor_si256( x3, x4 ); \
+} while (0)
+
+#if SPH_JH_64
+
+static const sph_u64 C[] = {
+	C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557),
+	C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40),
+	C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a),
+	C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231),
+	C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410),
+	C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc),
+	C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0),
+	C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3),
+	C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce),
+	C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23),
+	C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8),
+	C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197),
+	C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95),
+	C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214),
+	C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80),
+	C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4),
+	C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989),
+	C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36),
+	C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7),
+	C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f),
+	C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727),
+	C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b),
+	C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e),
+	C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062),
+	C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984),
+	C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5),
+	C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2),
+	C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f),
+	C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465),
+	C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a),
+	C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1),
+	C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf),
+	C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48),
+	C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0),
+	C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134),
+	C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a),
+	C64e(0x88401d63a06cf615), C64e(0x47c1444b8752afff),
+	C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6),
+	C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae),
+	C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567),
+	C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a),
+	C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518),
+	C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446),
+	C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e),
+	C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee),
+	C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001),
+	C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779),
+	C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83),
+	C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a),
+	C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef),
+	C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d),
+	C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65),
+	C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a),
+	C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c),
+	C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d),
+	C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71),
+	C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc),
+	C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0),
+	C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c),
+	C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f),
+	C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751),
+	C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad),
+	C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56),
+	C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6),
+	C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a),
+	C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163),
+	C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826),
+	C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f),
+	C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30),
+	C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a),
+	C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3),
+	C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505),
+	C64e(0xb17681d913326cce), C64e(0x3c175284f805a262),
+	C64e(0xf42bcbb378471547), C64e(0xff46548223936a48),
+	C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e),
+	C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e),
+	C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd),
+	C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7),
+	C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be),
+	C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de),
+	C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9),
+	C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a),
+	C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b),
+	C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2)
+};
+/* Round r reads four consecutive entries of C[]: indices 4r (even hi) .. 4r+3 (odd lo). */
+#define Ceven_hi(r)   (C[((r) << 2) + 0])
+#define Ceven_lo(r)   (C[((r) << 2) + 1])
+#define Codd_hi(r)    (C[((r) << 2) + 2])
+#define Codd_lo(r)    (C[((r) << 2) + 3])
+/* S: apply Sb to the h halves with constant cb##hi(r) and to the l halves with cb##lo(r). */
+#define S(x0, x1, x2, x3, cb, r)   do { \
+		Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \
+		Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \
+	} while (0)
+/* L: apply Lb to the h halves of the eight named words, then to their l halves. */
+#define L(x0, x1, x2, x3, x4, x5, x6, x7)   do { \
+		Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \
+			x4 ## h, x5 ## h, x6 ## h, x7 ## h); \
+		Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \
+			x4 ## l, x5 ## l, x6 ## l, x7 ## l); \
+	} while (0)
+
+/*
+ * Wz: within every 64-bit element of x##h and x##l, exchange each pair of
+ * adjacent n-bit groups; c is the vector mask selecting the low group of
+ * each pair. Per element (logical shift): v = ((v >> n) & c) | ((v & c) << n)
+ *
+ * Both halves use _mm256_srli_epi64 so the shift does not depend on the
+ * compiler's vector-extension meaning of >> for __m256i operands, and the
+ * mask expression c is evaluated once.
+ */
+#define Wz(x, c, n) \
+do { \
+   const __m256i wz_mask = (c); \
+   __m256i t = _mm256_slli_epi64( _mm256_and_si256(x ## h, wz_mask), (n) ); \
+   x ## h = _mm256_or_si256( _mm256_and_si256( \
+                _mm256_srli_epi64(x ## h, (n)), wz_mask), t ); \
+   t = _mm256_slli_epi64( _mm256_and_si256(x ## l, wz_mask), (n) ); \
+   x ## l = _mm256_or_si256( _mm256_and_si256( \
+                _mm256_srli_epi64(x ## l, (n)), wz_mask), t ); \
+} while (0)
+
+/*
+ * W0..W5: exchange neighbouring groups of 1, 2, 4, 8, 16 and 32 bits inside
+ * every 64-bit element of x##h and x##l, by way of Wz. The constant is the
+ * mask for the low group of each pair, replicated into all four lanes.
+ * Scalar equivalent for one element v with mask c and group width n:
+ *     v = ((v >> n) & c) | ((v & c) << n)
+ */
+#define W0(x) \
+   Wz( x, _mm256_set1_epi64x( 0x5555555555555555 ),  1 )
+
+#define W1(x) \
+   Wz( x, _mm256_set1_epi64x( 0x3333333333333333 ),  2 )
+
+#define W2(x) \
+   Wz( x, _mm256_set1_epi64x( 0x0F0F0F0F0F0F0F0F ),  4 )
+
+#define W3(x) \
+   Wz( x, _mm256_set1_epi64x( 0x00FF00FF00FF00FF ),  8 )
+
+#define W4(x) \
+   Wz( x, _mm256_set1_epi64x( 0x0000FFFF0000FFFF ), 16 )
+
+#define W5(x) \
+   Wz( x, _mm256_set1_epi64x( 0x00000000FFFFFFFF ), 32 )
+
+/* W6: exchange the complete x##h and x##l vectors. */
+#define W6(x) \
+do { \
+   __m256i t = x ## l; \
+   x ## l = x ## h; \
+   x ## h = t; \
+} while (0)
+
+#define DECL_STATE   /* locals: 16 state vectors plus the tmp scratch used by Sb */ \
+	__m256i h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \
+	__m256i h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \
+	__m256i tmp;
+
+#define READ_STATE(state)   do {   /* copy (state)->H[0..15] into the h?h / h?l locals */ \
+		h0h = (state)->H[ 0]; \
+		h0l = (state)->H[ 1]; \
+		h1h = (state)->H[ 2]; \
+		h1l = (state)->H[ 3]; \
+		h2h = (state)->H[ 4]; \
+		h2l = (state)->H[ 5]; \
+		h3h = (state)->H[ 6]; \
+		h3l = (state)->H[ 7]; \
+		h4h = (state)->H[ 8]; \
+		h4l = (state)->H[ 9]; \
+		h5h = (state)->H[10]; \
+		h5l = (state)->H[11]; \
+		h6h = (state)->H[12]; \
+		h6l = (state)->H[13]; \
+		h7h = (state)->H[14]; \
+		h7l = (state)->H[15]; \
+	} while (0)
+
+#define WRITE_STATE(state)   do {   /* store the h?h / h?l locals back to (state)->H[0..15] */ \
+		(state)->H[ 0] = h0h; \
+		(state)->H[ 1] = h0l; \
+		(state)->H[ 2] = h1h; \
+		(state)->H[ 3] = h1l; \
+		(state)->H[ 4] = h2h; \
+		(state)->H[ 5] = h2l; \
+		(state)->H[ 6] = h3h; \
+		(state)->H[ 7] = h3l; \
+		(state)->H[ 8] = h4h; \
+		(state)->H[ 9] = h4l; \
+		(state)->H[10] = h5h; \
+		(state)->H[11] = h5l; \
+		(state)->H[12] = h6h; \
+		(state)->H[13] = h6l; \
+		(state)->H[14] = h7h; \
+		(state)->H[15] = h7l; \
+	} while (0)
+
+/*
+ * INPUT_BUF1: load buf[0..7] into locals m0h..m3l and XOR them into h0..h3.
+ * Not wrapped in do/while on purpose: INPUT_BUF2 reads m0h..m3l after E8.
+ */
+#define INPUT_BUF1 \
+   __m256i m0h = buf[0], m0l = buf[1]; \
+   __m256i m1h = buf[2], m1l = buf[3]; \
+   __m256i m2h = buf[4], m2l = buf[5]; \
+   __m256i m3h = buf[6], m3l = buf[7]; \
+   h0h = _mm256_xor_si256( h0h, m0h ); \
+   h0l = _mm256_xor_si256( h0l, m0l ); \
+   h1h = _mm256_xor_si256( h1h, m1h ); \
+   h1l = _mm256_xor_si256( h1l, m1l ); \
+   h2h = _mm256_xor_si256( h2h, m2h ); \
+   h2l = _mm256_xor_si256( h2l, m2l ); \
+   h3h = _mm256_xor_si256( h3h, m3h ); \
+   h3l = _mm256_xor_si256( h3l, m3l );
+
+#define INPUT_BUF2   /* XOR the m0h..m3l locals declared by INPUT_BUF1 into h4..h7 */ \
+   h4h = _mm256_xor_si256( h4h, m0h ); \
+   h4l = _mm256_xor_si256( h4l, m0l ); \
+   h5h = _mm256_xor_si256( h5h, m1h ); \
+   h5l = _mm256_xor_si256( h5l, m1l ); \
+   h6h = _mm256_xor_si256( h6h, m2h ); \
+   h6l = _mm256_xor_si256( h6l, m2l ); \
+   h7h = _mm256_xor_si256( h7h, m3h ); \
+   h7l = _mm256_xor_si256( h7l, m3l );
+
+static const sph_u64 IV256[] = {
+	C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1),
+	C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03),
+	C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477),
+	C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8),
+	C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262),
+	C64e(0x277695f776248f94), C64e(0x87d5b6574780296c),
+	C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f),
+	C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769)
+};
+
+
+static const sph_u64 IV512[] = {
+	C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543),
+	C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361),
+	C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80),
+	C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7),
+	C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a),
+	C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199),
+	C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156),
+	C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b)
+};
+
+#else
+
+
+#endif
+/* SL(ro): round (r + ro) with swap W##ro; `r` must be a variable at the expansion site. */
+#define SL(ro)   SLu(r + ro, ro)
+/* SLu(r, ro): one round - S on the even and odd words (round-r constants), L, then W##ro on odd words. */
+#define SLu(r, ro)   do { \
+		S(h0, h2, h4, h6, Ceven_, r); \
+		S(h1, h3, h5, h7, Codd_, r); \
+		L(h0, h2, h4, h6, h1, h3, h5, h7); \
+		W ## ro(h1); \
+		W ## ro(h3); \
+		W ## ro(h5); \
+		W ## ro(h7); \
+	} while (0)
+
+#if SPH_SMALL_FOOTPRINT_JH
+
+#if SPH_JH_64
+
+/*
+ * Small-footprint E8: 42 rounds run as 6 loop iterations of 7 rounds each;
+ * round r + k uses the swap macro Wk (k = 0..6).
+ */
+
+#define E8   do { \
+		unsigned r; \
+		for (r = 0; r < 42; r += 7) { \
+			SL(0); \
+			SL(1); \
+			SL(2); \
+			SL(3); \
+			SL(4); \
+			SL(5); \
+			SL(6); \
+		} \
+	} while (0)
+
+#else
+
+
+#endif
+
+#else
+
+#if SPH_JH_64
+
+/*
+ * Fully unrolled E8: rounds 0..41 in order, round r using swap W(r mod 7).
+ */
+
+#define E8   do { \
+		SLu( 0, 0); \
+		SLu( 1, 1); \
+		SLu( 2, 2); \
+		SLu( 3, 3); \
+		SLu( 4, 4); \
+		SLu( 5, 5); \
+		SLu( 6, 6); \
+		SLu( 7, 0); \
+		SLu( 8, 1); \
+		SLu( 9, 2); \
+		SLu(10, 3); \
+		SLu(11, 4); \
+		SLu(12, 5); \
+		SLu(13, 6); \
+		SLu(14, 0); \
+		SLu(15, 1); \
+		SLu(16, 2); \
+		SLu(17, 3); \
+		SLu(18, 4); \
+		SLu(19, 5); \
+		SLu(20, 6); \
+		SLu(21, 0); \
+		SLu(22, 1); \
+		SLu(23, 2); \
+		SLu(24, 3); \
+		SLu(25, 4); \
+		SLu(26, 5); \
+		SLu(27, 6); \
+		SLu(28, 0); \
+		SLu(29, 1); \
+		SLu(30, 2); \
+		SLu(31, 3); \
+		SLu(32, 4); \
+		SLu(33, 5); \
+		SLu(34, 6); \
+		SLu(35, 0); \
+		SLu(36, 1); \
+		SLu(37, 2); \
+		SLu(38, 3); \
+		SLu(39, 4); \
+		SLu(40, 5); \
+		SLu(41, 6); \
+	} while (0)
+
+#else
+
+
+#endif
+
+#endif
+/* Start a new 4-way hash: each of the 16 IV words is copied to all four lanes. */
+static void
+jh_4way_init( jh_4way_context *sc, const void *iv )
+{
+    const uint64_t *words = (const uint64_t*)iv;
+
+    sc->ptr = 0;
+    sc->block_count = 0;
+    for ( int w = 0; w < 16; w++ )
+        sc->H[w] = _mm256_set1_epi64x( words[w] );
+}
+/* Absorb len bytes per lane from data, compressing each completed 64-byte block. */
+static void
+jh_4way_core( jh_4way_context *sc, const void *data, size_t len )
+{
+    __m256i *buf;
+    __m256i *vdata = (__m256i*)data;
+   const int buf_size = 64;   // block size in bytes per lane (sc->buf holds 8 __m256i)
+   size_t ptr;
+   DECL_STATE
+   // ptr/len are byte counts; >>3 gives __m256i counts (assumes multiples of 8 - TODO confirm)
+   buf = sc->buf;
+   ptr = sc->ptr;
+   // Less than a full block available: only buffer the data (presumably memcpy_256 counts __m256i).
+   if ( len < (buf_size - ptr) )
+   {
+       memcpy_256( buf + (ptr>>3), vdata, len>>3 );
+       ptr += len;
+       sc->ptr = ptr;
+       return;
+   }
+   // Fill the buffer chunk by chunk; each time it reaches 64 bytes run one compression.
+   READ_STATE(sc);
+   while ( len > 0 )
+   {
+       size_t clen;
+       clen = buf_size - ptr;
+       if ( clen > len )
+          clen = len;
+
+       memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
+       ptr += clen;
+       vdata += (clen>>3);
+       len -= clen;
+       if ( ptr == buf_size )
+       {
+          INPUT_BUF1;
+          E8;
+          INPUT_BUF2;
+          sc->block_count ++;
+          ptr = 0;
+       }
+   }
+   WRITE_STATE(sc);
+   sc->ptr = ptr;
+}
+
+/*
+ * Append the JH padding, absorb it, and write the digest to dst.
+ * out_size_w32 is the digest length per lane in 32-bit words (8 or 16).
+ * dst receives out_size_w32/2 __m256i, each one 64-bit word for 4 lanes,
+ * taken from the end of the state (JH truncates to the last bits).
+ * ub, n and iv are accepted for interface parity and not used.
+ */
+static void
+jh_4way_close( jh_4way_context *sc, unsigned ub, unsigned n, void *dst,
+               size_t out_size_w32, const void *iv )
+{
+   __m256i buf[16];
+   sph_u64 l0, l1, l0e, l1e;
+   // Padding bytes before the 16-byte length field (0x80 marker included),
+   // chosen so that the padded message ends on a 64-byte block boundary.
+   const size_t numz = ( sc->ptr == 0 ) ? 48 : 112 - sc->ptr;
+   const size_t nvec = out_size_w32 >> 1;
+
+   (void)ub; (void)n; (void)iv;
+   buf[0] = _mm256_set1_epi64x( 0x80 );
+   memset_zero_256( buf+1, (numz>>3) - 1 );
+   // 128-bit message bit length, big-endian, high word first.
+   l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3);
+   l1 = SPH_T64(sc->block_count >> 55);
+   sph_enc64be( &l0e, l0 );
+   sph_enc64be( &l1e, l1 );
+   buf[ numz>>3 ]       = _mm256_set1_epi64x( l1e );
+   buf[ (numz>>3) + 1 ] = _mm256_set1_epi64x( l0e );
+
+   jh_4way_core( sc, buf, numz + 16 );
+   memcpy_256( (__m256i*)dst, sc->H + (16 - nvec), nvec );
+}
+
+/* Reset cc for a new 4-way JH-256 computation (loads IV256). */
+void jh256_4way_init(void *cc)
+{
+	jh_4way_init((jh_4way_context*)cc, IV256);
+}
+
+/* Feed len bytes per lane from data into the JH-256 context cc. */
+void jh256_4way(void *cc, const void *data, size_t len)
+{
+	jh_4way_core((jh_4way_context*)cc, data, len);
+}
+
+/* Finish JH-256: pad, then store the result at dst (out_size_w32 = 8). */
+void jh256_4way_close(void *cc, void *dst)
+{
+	jh_4way_close((jh_4way_context*)cc, 0, 0, dst, 8, IV256);
+}
+
+/* Reset cc for a new 4-way JH-512 computation (loads IV512). */
+void jh512_4way_init(void *cc)
+{
+	jh_4way_init((jh_4way_context*)cc, IV512);
+}
+
+/* Feed len bytes per lane from data into the JH-512 context cc. */
+void jh512_4way(void *cc, const void *data, size_t len)
+{
+	jh_4way_core((jh_4way_context*)cc, data, len);
+}
+
+/* Finish JH-512: pad, then store the result at dst (out_size_w32 = 16). */
+void jh512_4way_close(void *cc, void *dst)
+{
+	jh_4way_close((jh_4way_context*)cc, 0, 0, dst, 16, IV512);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/jh/jh-hash-4way.h
+++ b/algo/jh/jh-hash-4way.h
@@ -0,0 +1,100 @@
+/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * 4-way (AVX2) JH interface. JH is a family of functions which differ by
+ * their output size; this implementation defines JH for output
+ * sizes 256 and 512 bits, hashing four independent messages in parallel.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     jh-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef JH_HASH_4WAY_H__
+#define JH_HASH_4WAY_H__
+
+#ifdef __AVX2__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+#define SPH_SIZE_jh256   256
+
+#define SPH_SIZE_jh512   512
+
+/**
+ * This structure is a context for JH computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a JH computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running JH computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+    __m256i buf[8] __attribute__ ((aligned (64)));
+    __m256i H[16];
+    size_t ptr;
+    uint64_t block_count;
+/*
+ * buf          pending input: 64 bytes per lane, one 64-bit word of each of
+ *              the 4 lanes per __m256i
+ * H            the 16 64-bit state words, 4 lanes per __m256i
+ * ptr          bytes per lane currently held in buf (reset to 0 at 64)
+ * block_count  number of 64-byte blocks compressed so far
+ * (roles as used by jh_4way_init/core/close in jh-hash-4way.c)
+*/
+} jh_4way_context;
+
+typedef jh_4way_context jh256_4way_context;
+
+typedef jh_4way_context jh512_4way_context;
+/* Prepare cc (a jh256_4way_context) for a new JH-256 computation. */
+void jh256_4way_init(void *cc);
+/* Absorb len bytes per lane from data (presumably 4x64-bit interleaved - confirm with callers). */
+void jh256_4way(void *cc, const void *data, size_t len);
+/* Pad, finish and write the JH-256 result to dst. */
+void jh256_4way_close(void *cc, void *dst);
+/* Prepare cc (a jh512_4way_context) for a new JH-512 computation. */
+void jh512_4way_init(void *cc);
+/* Absorb len bytes per lane from data (presumably 4x64-bit interleaved - confirm with callers). */
+void jh512_4way(void *cc, const void *data, size_t len);
+/* Pad, finish and write the JH-512 result to dst. */
+void jh512_4way_close(void *cc, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/jh/jha-4way.c
+++ b/algo/jh/jha-4way.c
@@ -0,0 +1,154 @@
+#include "jha-gate.h"
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+//#include "avxdefs.h"
+
+#if defined(JHA_4WAY)
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+
+/*
+ * 4-way JHA hash: keccak512 of the 80-byte input, then three rounds. In
+ * each round a lane takes groestl512 if the low bit of its hash is set,
+ * else skein512; then blake512 if the low bit of the new hash is set, else
+ * jh512 - the same two tests the scalar jha_hash() makes. Both candidates
+ * are computed for all lanes; vh_mask (all-ones where the bit is clear)
+ * makes the blend pick vhB. out receives 4 x 32 bytes, lane i at out + 32*i.
+ */
+void jha_hash_4way( void *out, const void *input )
+{
+    uint64_t hash0[8] __attribute__ ((aligned (64)));
+    uint64_t hash1[8] __attribute__ ((aligned (64)));
+    uint64_t hash2[8] __attribute__ ((aligned (64)));
+    uint64_t hash3[8] __attribute__ ((aligned (64)));
+    uint64_t vhash[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashA[8*4] __attribute__ ((aligned (64)));
+    uint64_t vhashB[8*4] __attribute__ ((aligned (64)));
+    uint64_t *lane[4] = { hash0, hash1, hash2, hash3 };
+    __m256i* vh  = (__m256i*)vhash;
+    __m256i* vhA = (__m256i*)vhashA;
+    __m256i* vhB = (__m256i*)vhashB;
+    __m256i vh_mask;
+
+    blake512_4way_context  ctx_blake;
+    hashState_groestl      ctx_groestl;
+    jh512_4way_context     ctx_jh;
+    skein512_4way_context  ctx_skein;
+    keccak512_4way_context ctx_keccak;
+
+    keccak512_4way_init( &ctx_keccak );
+    keccak512_4way( &ctx_keccak, input, 80 );
+    keccak512_4way_close( &ctx_keccak, vhash );
+
+    // Heavy & Light Pair Loop
+    for ( int round = 0; round < 3; round++ )
+    {
+       vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
+               vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );
+
+       mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+       for ( int i = 0; i < 4; i++ )
+       {
+          init_groestl( &ctx_groestl, 64 );
+          update_and_final_groestl( &ctx_groestl, (char*)lane[i],
+                                                  (char*)lane[i], 512 );
+       }
+       mm256_interleave_4x64( vhashA, hash0, hash1, hash2, hash3, 512 );
+       skein512_4way_init( &ctx_skein );
+       skein512_4way( &ctx_skein, vhash, 64 );
+       skein512_4way_close( &ctx_skein, vhashB );
+
+       for ( int i = 0; i < 8; i++ )
+          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+
+       // Re-sample the low bit from the hash the first pair just produced.
+       vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256(
+               vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero );
+       blake512_4way_init( &ctx_blake );
+       blake512_4way( &ctx_blake, vhash, 64 );
+       blake512_4way_close( &ctx_blake, vhashA );
+       jh512_4way_init( &ctx_jh );
+       jh512_4way( &ctx_jh, vhash, 64 );
+       jh512_4way_close( &ctx_jh, vhashB );
+
+       for ( int i = 0; i < 8; i++ )
+          vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask );
+    }
+
+    mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
+}
+
+/*
+ * Scan nonces from work->data[19], four consecutive nonces per call to
+ * jha_hash_4way(), until a share is found, n reaches max_nonce or a work
+ * restart is flagged. Winning nonces go to work->nonces; returns how many.
+ *
+ * Pre-filter: the first m with Htarg <= htmax[m] selects masks[m], and a
+ * lane reaches fulltest() only if (hash word 7 & mask) == 0, which is the
+ * test the scalar scanhash_jha() applies.
+ */
+int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   const uint32_t Htarg = ptarget[7];
+   uint32_t n = pdata[19];
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   // The four lanes' nonce words are written at noncep, +2, +4 and +6.
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1
+
+   const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
+   const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                              0xFFFFF000, 0xFFFF0000, 0 };
+
+   for ( int i=0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   for ( int m = 0; m < 6; m++ )
+   {
+      if ( Htarg > htmax[m] )
+         continue;
+
+      const uint32_t mask = masks[m];
+      do {
+           be32enc( noncep,   n   );
+           be32enc( noncep+2, n+1 );
+           be32enc( noncep+4, n+2 );
+           be32enc( noncep+6, n+3 );
+
+           jha_hash_4way( hash, vdata );
+           pdata[19] = n;
+
+           for ( int i = 0; i < 4; i++ )
+           {
+              uint32_t *lane_hash = hash + (i<<3);
+              // Candidate only when every masked bit of word 7 is zero.
+              if ( !( lane_hash[7] & mask ) && fulltest( lane_hash, ptarget ) )
+              {
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, lane_hash );
+              }
+           }
+           n += 4;
+      } while ( ( num_found == 0 ) && ( n < max_nonce )
+                  && !work_restart[thr_id].restart );
+      break;
+   }
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+#endif
--- a/algo/jh/jha-gate.c
+++ b/algo/jh/jha-gate.c
@@ -0,0 +1,18 @@
+#include "jha-gate.h"
+
+/* Install the JHA entry points in gate: the 4-way versions when JHA_4WAY is
+ * defined, the scalar ones otherwise. Always returns true. */
+bool register_jha_algo( algo_gate_t* gate )
+{
+#if defined (JHA_4WAY)
+  four_way_not_tested();
+  gate->scanhash         = (void*)&scanhash_jha_4way;
+  gate->hash             = (void*)&jha_hash_4way;
+#else
+  gate->scanhash         = (void*)&scanhash_jha;
+  gate->hash             = (void*)&jha_hash;
+#endif
+  gate->optimizations    = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->set_target       = (void*)&scrypt_set_target;
+  return true;
+}
--- a/algo/jh/jha-gate.h
+++ b/algo/jh/jha-gate.h
@@ -0,0 +1,25 @@
+#ifndef JHA_GATE_H__
+#define JHA_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define JHA_4WAY
+#endif
+
+#if defined JHA_4WAY
+// 4-way hash of four interleaved inputs; state receives the results.
+void jha_hash_4way( void *state, const void *input );
+int scanhash_jha_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                       uint64_t *hashes_done );
+#endif
+
+// Scalar hash of one input; state receives the result.
+void jha_hash( void *state, const void *input );
+int scanhash_jha( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+
+#endif
+
--- a/algo/jh/jha.c
+++ b/algo/jh/jha.c
@@ -0,0 +1,155 @@
+#include "jha-gate.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "algo/blake/sph_blake.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+
+#ifdef NO_AES_NI
+  #include "algo/groestl/sph_groestl.h"
+#else
+  #include "algo/groestl/aes_ni/hash-groestl.h"
+#endif
+
+static __thread sph_keccak512_context jha_kec_mid __attribute__ ((aligned (64)));
+// Store in thread-local jha_kec_mid the keccak512 state after absorbing the first 64 bytes of input.
+void jha_kec_midstate( const void* input )
+{
+    sph_keccak512_init( &jha_kec_mid );
+    sph_keccak512( &jha_kec_mid, input, 64 );
+}
+
+/*
+ * Scalar JHA hash. Resumes keccak512 from the thread-local midstate
+ * jha_kec_mid (see jha_kec_midstate()), absorbs the 16 bytes at input+64,
+ * then runs three rounds: low bit of hash[0] set -> groestl512, clear ->
+ * skein512; then, on the new hash, set -> blake512, clear -> jh512.
+ * The first 32 bytes of the final 64-byte hash are copied to output.
+ */
+void jha_hash(void *output, const void *input)
+{
+    uint8_t _ALIGN(128) hash[64];
+    sph_keccak512_context ctx_keccak;
+    sph_skein512_context ctx_skein;
+    sph_blake512_context ctx_blake;
+    sph_jh512_context ctx_jh;
+#ifdef NO_AES_NI
+    sph_groestl512_context ctx_groestl;
+#else
+    hashState_groestl ctx_groestl;
+#endif
+
+    memcpy( &ctx_keccak, &jha_kec_mid, sizeof jha_kec_mid );
+    sph_keccak512( &ctx_keccak, input+64, 16 );
+    sph_keccak512_close( &ctx_keccak, hash );
+
+    // Heavy & Light Pair Loop
+    for ( int round = 0; round < 3; round++ ) {
+        if ( !( hash[0] & 0x01 ) ) {
+            sph_skein512_init( &ctx_skein );
+            sph_skein512( &ctx_skein, hash, 64 );
+            sph_skein512_close( &ctx_skein, hash );
+        } else {
+#ifdef NO_AES_NI
+            sph_groestl512_init( &ctx_groestl );
+            sph_groestl512( &ctx_groestl, hash, 64 );
+            sph_groestl512_close( &ctx_groestl, hash );
+#else
+            init_groestl( &ctx_groestl, 64 );
+            update_and_final_groestl( &ctx_groestl, (char*)hash,
+                                      (char*)hash, 512 );
+#endif
+        }
+
+        // The second choice looks at the hash that was just produced.
+        if ( !( hash[0] & 0x01 ) ) {
+            sph_jh512_init( &ctx_jh );
+            sph_jh512( &ctx_jh, hash, 64 );
+            sph_jh512_close( &ctx_jh, hash );
+        } else {
+            sph_blake512_init( &ctx_blake );
+            sph_blake512( &ctx_blake, hash, 64 );
+            sph_blake512_close( &ctx_blake, hash );
+        }
+    }
+
+    memcpy( output, hash, 32 );
+}
+
+/*
+ * Scalar nonce scan for JHA.
+ * Tries nonces upward from work->data[19] until a hash passes fulltest()
+ * against work->target (returns 1), or until n reaches max_nonce or a work
+ * restart is flagged for this thread (returns 0). work->data[19] is left
+ * at the final value of n and *hashes_done is set to n - first_nonce + 1.
+ * The keccak midstate over the first 64 header bytes is computed once; only
+ * the nonce word changes between hashes.
+ */
+int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
+{
+    // Pre-filter tables: the first m with Htarg <= htmax[m] selects masks[m];
+    // a hash is only passed to fulltest() when (hash32[7] & mask) is zero.
+    const uint64_t htmax[] = { 0, 0xF, 0xFF, 0xFFF, 0xFFFF, 0x10000000 };
+    const uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00,
+                               0xFFFFF000, 0xFFFF0000, 0 };
+    uint32_t _ALIGN(128) hash32[8];
+    uint32_t _ALIGN(128) endiandata[20];
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+    const uint32_t first_nonce = pdata[19];
+    const uint32_t Htarg = ptarget[7];
+    uint32_t n = pdata[19] - 1;
+    int m;
+
+    // we need bigendian data...
+    for ( int i = 0; i < 19; i++ )
+        be32enc( &endiandata[i], pdata[i] );
+
+    jha_kec_midstate( endiandata );
+
+#ifdef DEBUG_ALGO
+    printf("[%d] Htarg=%X\n", thr_id, Htarg);
+#endif
+    // Find the first table row that applies; it is possible that none does.
+    for ( m = 0; m < 6 && Htarg > htmax[m]; m++ )
+        ;
+    if ( m < 6 )
+    {
+        const uint32_t mask = masks[m];
+        do {
+            pdata[19] = ++n;
+            be32enc( &endiandata[19], n );
+            jha_hash( hash32, endiandata );
+#ifndef DEBUG_ALGO
+            if ( (hash32[7] & mask) == 0 && fulltest( hash32, ptarget ) )
+            {
+                work_set_target_ratio( work, hash32 );
+                *hashes_done = n - first_nonce + 1;
+                return 1;
+            }
+#else
+            if ( !(n % 0x1000) && !thr_id ) printf(".");
+            if ( !(hash32[7] & mask) )
+            {
+                printf("[%d]",thr_id);
+                if ( fulltest( hash32, ptarget ) )
+                {
+                    work_set_target_ratio( work, hash32 );
+                    *hashes_done = n - first_nonce + 1;
+                    return 1;
+                }
+            }
+#endif
+        } while ( n < max_nonce && !work_restart[thr_id].restart );
+    }
+
+    *hashes_done = n - first_nonce + 1;
+    pdata[19] = n;
+    return 0;
+}
+
--- a/algo/jh/sph_jh.c
+++ b/algo/jh/sph_jh.c
@@ -914,6 +914,7 @@ jh_core(sph_jh_context *sc, const void *data, size_t len)

 	buf = sc->buf;
 	ptr = sc->ptr;
+
 	if (len < (sizeof sc->buf) - ptr) {
 		memcpy(buf + ptr, data, len);
 		ptr += len;
--- a/algo/jh/sse2/jh_sse2_opt64.h
+++ b/algo/jh/sse2/jh_sse2_opt64.h
@@ -22,15 +22,12 @@
 */


-
 #include <emmintrin.h>
 #include <stdint.h>
 #include <string.h>
+#include "algo/sha/sha3-defs.h"

 typedef __m128i  word128;   /*word128 defines a 128-bit SSE2 word*/
-
-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
 typedef enum {jhSUCCESS = 0, jhFAIL = 1, jhBAD_HASHLEN = 2} jhReturn;

 /*define data alignment for different C compilers*/
@@ -342,13 +339,13 @@ do { \
        jhSbuffer[53] = 0x00, \
        jhSbuffer[54] = 0x00, \
        jhSbuffer[55] = 0x00; \
-        jhSbuffer[56] = ((64*8) >> 56) & 0xff, \
-        jhSbuffer[57] = ((64*8) >> 48) & 0xff, \
-        jhSbuffer[58] = ((64*8) >> 40) & 0xff, \
-        jhSbuffer[59] = ((64*8) >> 32) & 0xff, \
-        jhSbuffer[60] = ((64*8) >> 24) & 0xff, \
-        jhSbuffer[61] = ((64*8) >> 16) & 0xff, \
-        jhSbuffer[62] = ((64*8) >> 8) & 0xff, \
+        jhSbuffer[56] = ((char)((uint64_t)(64*8) >> 56)) & 0xff, \
+        jhSbuffer[57] = ((char)((uint64_t)(64*8) >> 48)) & 0xff, \
+        jhSbuffer[58] = ((char)((uint64_t)(64*8) >> 40)) & 0xff, \
+        jhSbuffer[59] = ((char)((uint64_t)(64*8) >> 32)) & 0xff, \
+        jhSbuffer[60] = ((char)((uint64_t)(64*8) >> 24)) & 0xff, \
+        jhSbuffer[61] = ((char)((uint64_t)(64*8) >> 16)) & 0xff, \
+        jhSbuffer[62] = ((char)((uint64_t)(64*8) >> 8)) & 0xff, \
        jhSbuffer[63] = (64*8) & 0xff; \
        b = true; \
    } \
--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -0,0 +1,68 @@
+#include "keccak-gate.h"
+
+#ifdef KECCAK_4WAY
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "sph_keccak.h"
+#include "keccak-hash-4way.h"
+
+/*
+ * Hash four interleaved 80-byte inputs with Keccak-256.
+ *
+ * input  4x64-bit interleaved data, 80 bytes per lane
+ * state  receives four consecutive 32-byte digests (128 bytes in total),
+ *        lane 0 first
+ */
+void keccakhash_4way(void *state, const void *input)
+{
+    uint64_t vhash[4*4] __attribute__ ((aligned (64)));
+    keccak256_4way_context ctx;
+    /* Arithmetic on void* is a GNU extension; step through the output as
+     * bytes instead. */
+    char *out = (char*)state;
+    void *lane0 = out;
+    void *lane1 = out + 32;
+    void *lane2 = out + 64;
+    void *lane3 = out + 96;
+
+    keccak256_4way_init( &ctx );
+    keccak256_4way( &ctx, input, 80 );
+    keccak256_4way_close( &ctx, vhash );
+
+    mm256_deinterleave_4x64( lane0, lane1, lane2, lane3, vhash, 256 );
+}
+
+/*
+ * Scan nonces four at a time with the AVX2 Keccak-256.
+ *
+ * The header is byte-swapped, replicated into four interleaved lanes, and
+ * each pass writes nonces n..n+3 into the lanes before hashing.  Every lane
+ * whose hash passes the pre-filter and fulltest() has its nonce appended to
+ * work->nonces.
+ *
+ * Returns the number of nonces stored in work->nonces (0 if none).
+ * *hashes_done receives the number of nonces hashed.
+ *
+ * NOTE(review): work->data[19] is not advanced by this function; confirm
+ * the caller takes the nonce from work->nonces and restarts the range itself.
+ */
+int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done)
+{
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   // Zero-filled so word 19 is defined when all 80 bytes are interleaved,
+   // and aligned because the buffer is read as 64-bit words below.
+   uint32_t endiandata[20] __attribute__ ((aligned (32))) = { 0 };
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t n = pdata[19];
+   const uint32_t first_nonce = pdata[19];
+   // Guard the subtraction: max_nonce < 4 used to wrap to ~4G.
+   const uint32_t last_nonce = ( max_nonce > 4 ) ? max_nonce - 4 : 0;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 73;   // 9*8 + 1
+
+   for ( int i = 0; i < 19; i++ )
+   {
+      be32enc( &endiandata[i], pdata[i] );
+   }
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do {
+      // One nonce per lane; lanes are two 32-bit words apart.
+      be32enc( noncep,   n   );
+      be32enc( noncep+2, n+1 );
+      be32enc( noncep+4, n+2 );
+      be32enc( noncep+6, n+3 );
+
+      keccakhash_4way( hash, vdata );
+
+      for ( int i = 0; i < 4; i++ )
+      {
+         uint32_t *lane_hash = hash + (i<<3);
+
+         if ( ( ( lane_hash[7] & 0xFFFFFF00 ) == 0 )
+              && fulltest( lane_hash, ptarget ) )
+         {
+            nonces[ num_found++ ] = n+i;
+            work_set_target_ratio( work, lane_hash );
+         }
+      }
+      n += 4;
+
+   } while ( (num_found == 0) && (n < last_nonce)
+                   && !work_restart[thr_id].restart);
+
+   // n has already been advanced past every nonce hashed, so the count is
+   // exact without the former "+ 1".
+   *hashes_done = n - first_nonce;
+   return num_found;
+}
+
+#endif
--- a/algo/keccak/keccak-gate.c
+++ b/algo/keccak/keccak-gate.c
@@ -0,0 +1,46 @@
+#include "keccak-gate.h"
+
+/* Set the work target from the job difficulty, scaled down by
+ * 128 * opt_diff_factor. */
+void keccak_set_target( struct work* work, double job_diff )
+{
+  const double divisor = 128.0 * opt_diff_factor;
+
+  work_set_target( work, job_diff / divisor );
+}
+
+/* Value installed as the gate's get_max64 hook for both Keccak variants. */
+int64_t keccak_get_max64( void ) { return 0x7ffffLL; }
+
+/*
+ * Install the Keccak hooks into the algo gate: SHA256_gen_merkle_root,
+ * keccak_set_target (divisor 128) and keccak_get_max64, plus the 4-way
+ * scanhash/hash pair when KECCAK_4WAY is defined or the scalar pair
+ * otherwise.  Always returns true.
+ */
+bool register_keccak_algo( algo_gate_t* gate )
+{
+  gate->optimizations = AVX2_OPT;
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  gate->set_target      = (void*)&keccak_set_target;
+  gate->get_max64       = (void*)&keccak_get_max64;
+#if defined (KECCAK_4WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_4way;
+  gate->hash      = (void*)&keccakhash_4way;
+#else
+  gate->scanhash        = (void*)&scanhash_keccak;
+  gate->hash            = (void*)&keccakhash;
+#endif
+  return true;
+}
+
+/* Set the work target from the job difficulty, scaled down by
+ * 256 * opt_diff_factor. */
+void keccakc_set_target( struct work* work, double job_diff )
+{
+  const double divisor = 256.0 * opt_diff_factor;
+
+  work_set_target( work, job_diff / divisor );
+}
+
+/*
+ * Install the Keccak-C hooks into the algo gate.  Same hash and scanhash
+ * selection as register_keccak_algo, but with sha256d_gen_merkle_root and
+ * keccakc_set_target (divisor 256).  Always returns true.
+ */
+bool register_keccakc_algo( algo_gate_t* gate )
+{
+  gate->optimizations = AVX2_OPT;
+  gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root;
+  gate->set_target      = (void*)&keccakc_set_target;
+  gate->get_max64       = (void*)&keccak_get_max64;
+#if defined (KECCAK_4WAY)
+  gate->scanhash  = (void*)&scanhash_keccak_4way;
+  gate->hash      = (void*)&keccakhash_4way;
+#else
+  gate->scanhash        = (void*)&scanhash_keccak;
+  gate->hash            = (void*)&keccakhash;
+#endif
+  return true;
+}
+
--- a/algo/keccak/keccak-gate.h
+++ b/algo/keccak/keccak-gate.h
@@ -0,0 +1,23 @@
+/* Declarations shared by the Keccak algo gate and its scalar and 4-way
+ * implementations. */
+#ifndef KECCAK_GATE_H__
+#define KECCAK_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+/* The 4-way path is selected whenever the compiler targets AVX2. */
+#if defined(__AVX2__)
+  #define KECCAK_4WAY
+#endif
+
+#if defined(KECCAK_4WAY)
+
+/* 4-way variants: hash four interleaved inputs / scan four nonces per pass. */
+void keccakhash_4way( void *state, const void *input );
+int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#endif
+
+/* Scalar variants, declared unconditionally. */
+void keccakhash( void *state, const void *input );
+int scanhash_keccak( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+#endif
--- a/algo/keccak/keccak-hash-4way.c
+++ b/algo/keccak/keccak-hash-4way.c
@@ -0,0 +1,503 @@
+#include <stddef.h>
+#include "keccak-hash-4way.h"
+
+#if defined(__AVX2__)
+
+/* Round constants, one per round (24 entries).  KECCAK_F_1600_ broadcasts
+ * RC[j] to all four 64-bit lanes before IOTA XORs it into a00. */
+static const sph_u64 RC[] = {
+        SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082),
+        SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000),
+        SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001),
+        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009),
+        SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088),
+        SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A),
+        SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B),
+        SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003),
+        SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080),
+        SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A),
+        SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080),
+        SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008)
+};
+
+/* Lane aliases: aXY names state word w[X + 5*Y].  They expand to accesses
+ * through a variable literally called `kc`, so every function that uses the
+ * round macros must have a context pointer with that name in scope. */
+#define a00   (kc->w[ 0])
+#define a10   (kc->w[ 1])
+#define a20   (kc->w[ 2])
+#define a30   (kc->w[ 3])
+#define a40   (kc->w[ 4])
+#define a01   (kc->w[ 5])
+#define a11   (kc->w[ 6])
+#define a21   (kc->w[ 7])
+#define a31   (kc->w[ 8])
+#define a41   (kc->w[ 9])
+#define a02   (kc->w[10])
+#define a12   (kc->w[11])
+#define a22   (kc->w[12])
+#define a32   (kc->w[13])
+#define a42   (kc->w[14])
+#define a03   (kc->w[15])
+#define a13   (kc->w[16])
+#define a23   (kc->w[17])
+#define a33   (kc->w[18])
+#define a43   (kc->w[19])
+#define a04   (kc->w[20])
+#define a14   (kc->w[21])
+#define a24   (kc->w[22])
+#define a34   (kc->w[23])
+#define a44   (kc->w[24])
+
+/* The state is used in place through kc->w, so the declare/read/write
+ * hooks expand to nothing. */
+#define DECL_STATE
+#define READ_STATE(sc)
+#define WRITE_STATE(sc)
+
+/* XOR the first size/8 vectors of `buf` into the state.  Captures the
+ * locals `kc` and `buf` of the enclosing function. */
+#define INPUT_BUF(size)   do { \
+    size_t j; \
+    for (j = 0; j < (size>>3); j++ ) \
+        kc->w[j ] = _mm256_xor_si256( kc->w[j], buf[j] ); \
+} while (0)
+
+/* "64-bit" primitives of the round macros, mapped onto __m256i so that each
+ * operation acts on four 64-bit lanes at once.  NOT64 is an XOR with
+ * m256_neg1 (presumably the all-ones vector from avxdefs.h -- confirm);
+ * ROL64 defers to mm256_rotl_64. */
+#define DECL64(x)        __m256i x
+#define MOV64(d, s)      (d = s)
+#define XOR64(d, a, b)   (d = _mm256_xor_si256(a,b))
+#define AND64(d, a, b)   (d = _mm256_and_si256(a,b))
+#define OR64(d, a, b)    (d = _mm256_or_si256(a,b))
+#define NOT64(d, s)      (d = _mm256_xor_si256(s,m256_neg1))
+#define ROL64(d, v, n)   (d = mm256_rotl_64(v, n))
+#define XOR64_IOTA       XOR64
+
+/* One theta term:
+ *    t = (c0 ^ c1 ^ c2 ^ c3 ^ c4) ^ ROL(d0 ^ d1 ^ d2 ^ d3 ^ d4, 1) */
+#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
+                DECL64(tt0); \
+                DECL64(tt1); \
+                DECL64(tt2); \
+                DECL64(tt3); \
+                XOR64(tt0, d0, d1); \
+                XOR64(tt1, d2, d3); \
+                XOR64(tt0, tt0, d4); \
+                XOR64(tt0, tt0, tt1); \
+                ROL64(tt0, tt0, 1); \
+                XOR64(tt2, c0, c1); \
+                XOR64(tt3, c2, c3); \
+                XOR64(tt0, tt0, c4); \
+                XOR64(tt2, tt2, tt3); \
+                XOR64(t, tt0, tt2); \
+        } while (0)
+
+/* Theta step.  All five terms t0..t4 are computed from the unmodified
+ * inputs first; tX is then XORed into the five words bX0..bX4.  The term
+ * for group X combines the plain XOR of group X-1 with the rotated XOR of
+ * group X+1 (indices mod 5). */
+#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(t0); \
+                DECL64(t1); \
+                DECL64(t2); \
+                DECL64(t3); \
+                DECL64(t4); \
+                TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \
+                TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \
+                TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \
+                TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \
+                TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \
+                XOR64(b00, b00, t0); \
+                XOR64(b01, b01, t0); \
+                XOR64(b02, b02, t0); \
+                XOR64(b03, b03, t0); \
+                XOR64(b04, b04, t0); \
+                XOR64(b10, b10, t1); \
+                XOR64(b11, b11, t1); \
+                XOR64(b12, b12, t1); \
+                XOR64(b13, b13, t1); \
+                XOR64(b14, b14, t1); \
+                XOR64(b20, b20, t2); \
+                XOR64(b21, b21, t2); \
+                XOR64(b22, b22, t2); \
+                XOR64(b23, b23, t2); \
+                XOR64(b24, b24, t2); \
+                XOR64(b30, b30, t3); \
+                XOR64(b31, b31, t3); \
+                XOR64(b32, b32, t3); \
+                XOR64(b33, b33, t3); \
+                XOR64(b34, b34, t3); \
+                XOR64(b40, b40, t4); \
+                XOR64(b41, b41, t4); \
+                XOR64(b42, b42, t4); \
+                XOR64(b43, b43, t4); \
+                XOR64(b44, b44, t4); \
+        } while (0)
+
+/* Rho step: rotate each word left by its fixed offset.  b00 has offset 0,
+ * so its rotation is left commented out. */
+#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                /* ROL64(b00, b00,  0); */ \
+                ROL64(b01, b01, 36); \
+                ROL64(b02, b02,  3); \
+                ROL64(b03, b03, 41); \
+                ROL64(b04, b04, 18); \
+                ROL64(b10, b10,  1); \
+                ROL64(b11, b11, 44); \
+                ROL64(b12, b12, 10); \
+                ROL64(b13, b13, 45); \
+                ROL64(b14, b14,  2); \
+                ROL64(b20, b20, 62); \
+                ROL64(b21, b21,  6); \
+                ROL64(b22, b22, 43); \
+                ROL64(b23, b23, 15); \
+                ROL64(b24, b24, 61); \
+                ROL64(b30, b30, 28); \
+                ROL64(b31, b31, 55); \
+                ROL64(b32, b32, 25); \
+                ROL64(b33, b33, 21); \
+                ROL64(b34, b34, 56); \
+                ROL64(b40, b40, 27); \
+                ROL64(b41, b41, 20); \
+                ROL64(b42, b42, 39); \
+                ROL64(b43, b43,  8); \
+                ROL64(b44, b44, 14); \
+        } while (0)
+
+/*
+ * The KHI macro integrates the "lane complement" optimization. On input,
+ * some words are complemented:
+ *    a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43
+ * On output, the following words are complemented:
+ *    a04 a10 a20 a22 a23 a31
+ *
+ * The (implicit) permutation and the theta expansion will bring back
+ * the input mask for the next round.
+ */
+
+/* d = a ^ (b | c) */
+#define KHI_XO(d, a, b, c)   do { \
+                DECL64(kt); \
+                OR64(kt, b, c); \
+                XOR64(d, a, kt); \
+        } while (0)
+
+/* d = a ^ (b & c) */
+#define KHI_XA(d, a, b, c)   do { \
+                DECL64(kt); \
+                AND64(kt, b, c); \
+                XOR64(d, a, kt); \
+        } while (0)
+
+/* Chi step, applied to the five groups b*0, b*1, b*2, b*3 and b*4 in turn.
+ * Within a group all five results c0..c4 are computed from the old values
+ * before any of them is written back.  Each group complements exactly one
+ * input into bnn and mixes OR-form (KHI_XO) and AND-form (KHI_XA) terms. */
+#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \
+        b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \
+        b40, b41, b42, b43, b44) \
+        do { \
+                DECL64(c0); \
+                DECL64(c1); \
+                DECL64(c2); \
+                DECL64(c3); \
+                DECL64(c4); \
+                DECL64(bnn); \
+                NOT64(bnn, b20); \
+                KHI_XO(c0, b00, b10, b20); \
+                KHI_XO(c1, b10, bnn, b30); \
+                KHI_XA(c2, b20, b30, b40); \
+                KHI_XO(c3, b30, b40, b00); \
+                KHI_XA(c4, b40, b00, b10); \
+                MOV64(b00, c0); \
+                MOV64(b10, c1); \
+                MOV64(b20, c2); \
+                MOV64(b30, c3); \
+                MOV64(b40, c4); \
+                NOT64(bnn, b41); \
+                KHI_XO(c0, b01, b11, b21); \
+                KHI_XA(c1, b11, b21, b31); \
+                KHI_XO(c2, b21, b31, bnn); \
+                KHI_XO(c3, b31, b41, b01); \
+                KHI_XA(c4, b41, b01, b11); \
+                MOV64(b01, c0); \
+                MOV64(b11, c1); \
+                MOV64(b21, c2); \
+                MOV64(b31, c3); \
+                MOV64(b41, c4); \
+                NOT64(bnn, b32); \
+                KHI_XO(c0, b02, b12, b22); \
+                KHI_XA(c1, b12, b22, b32); \
+                KHI_XA(c2, b22, bnn, b42); \
+                KHI_XO(c3, bnn, b42, b02); \
+                KHI_XA(c4, b42, b02, b12); \
+                MOV64(b02, c0); \
+                MOV64(b12, c1); \
+                MOV64(b22, c2); \
+                MOV64(b32, c3); \
+                MOV64(b42, c4); \
+                NOT64(bnn, b33); \
+                KHI_XA(c0, b03, b13, b23); \
+                KHI_XO(c1, b13, b23, b33); \
+                KHI_XO(c2, b23, bnn, b43); \
+                KHI_XA(c3, bnn, b43, b03); \
+                KHI_XO(c4, b43, b03, b13); \
+                MOV64(b03, c0); \
+                MOV64(b13, c1); \
+                MOV64(b23, c2); \
+                MOV64(b33, c3); \
+                MOV64(b43, c4); \
+                NOT64(bnn, b14); \
+                KHI_XA(c0, b04, bnn, b24); \
+                KHI_XO(c1, bnn, b24, b34); \
+                KHI_XA(c2, b24, b34, b44); \
+                KHI_XO(c3, b34, b44, b04); \
+                KHI_XA(c4, b44, b04, b14); \
+                MOV64(b04, c0); \
+                MOV64(b14, c1); \
+                MOV64(b24, c2); \
+                MOV64(b34, c3); \
+                MOV64(b44, c4); \
+        } while (0)
+
+/* Iota step: XOR the (already broadcast) round constant r into a00. */
+#define IOTA(r)   XOR64_IOTA(a00, a00, r)
+
+/* Pn is a 25-name argument list for THETA/RHO/KHI: it says which physical
+ * lane alias plays the role of each formal parameter b00..b44 in round n,
+ * so the lane permutation is done by renaming instead of moving data.
+ * KF_ELT(r, s, k) passes Pr to THETA and RHO and Ps to KHI.  In this file
+ * only P0..P8 are referenced (by KECCAK_F_1600_). */
+#define P0    a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \
+              a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44
+#define P1    a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \
+              a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14
+#define P2    a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \
+              a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31
+#define P3    a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \
+              a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13
+#define P4    a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \
+              a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01
+#define P5    a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \
+              a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30
+#define P6    a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \
+              a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33
+#define P7    a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \
+              a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23
+#define P8    a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \
+              a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12
+#define P9    a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \
+              a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21
+#define P10   a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \
+              a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02
+#define P11   a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \
+              a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10
+#define P12   a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \
+              a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11
+#define P13   a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \
+              a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41
+#define P14   a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \
+              a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24
+#define P15   a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \
+              a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42
+#define P16   a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \
+              a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04
+#define P17   a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \
+              a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20
+#define P18   a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \
+              a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22
+#define P19   a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \
+              a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32
+#define P20   a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \
+              a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43
+#define P21   a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \
+              a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34
+#define P22   a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \
+              a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03
+#define P23   a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \
+              a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40
+
+/* Physically move lanes through eight 3-cycles (a00 is untouched).  It runs
+ * after every group of eight renamed rounds in KECCAK_F_1600_, presumably
+ * so that the P0 naming is valid again for the next group -- the cycles
+ * themselves were not re-derived in review. */
+#define P8_TO_P0   do { \
+                DECL64(t); \
+                MOV64(t, a01); \
+                MOV64(a01, a11); \
+                MOV64(a11, a43); \
+                MOV64(a43, t); \
+                MOV64(t, a02); \
+                MOV64(a02, a22); \
+                MOV64(a22, a31); \
+                MOV64(a31, t); \
+                MOV64(t, a03); \
+                MOV64(a03, a33); \
+                MOV64(a33, a24); \
+                MOV64(a24, t); \
+                MOV64(t, a04); \
+                MOV64(a04, a44); \
+                MOV64(a44, a12); \
+                MOV64(a12, t); \
+                MOV64(t, a10); \
+                MOV64(a10, a32); \
+                MOV64(a32, a13); \
+                MOV64(a13, t); \
+                MOV64(t, a14); \
+                MOV64(a14, a21); \
+                MOV64(a21, a20); \
+                MOV64(a20, t); \
+                MOV64(t, a23); \
+                MOV64(a23, a42); \
+                MOV64(a42, a40); \
+                MOV64(a40, t); \
+                MOV64(t, a30); \
+                MOV64(a30, a41); \
+                MOV64(a41, a34); \
+                MOV64(a34, t); \
+        } while (0)
+
+/* LPAR/RPAR stand in for literal parentheses so that "THETA LPAR Pn RPAR"
+ * is not seen as a macro call until Pn has expanded into its 25 arguments. */
+#define LPAR   (
+#define RPAR   )
+
+/* One round: THETA and RHO under naming Pr, KHI under naming Ps, then IOTA
+ * with the broadcast round constant k. */
+#define KF_ELT(r, s, k)   do { \
+                THETA LPAR P ## r RPAR; \
+                RHO LPAR P ## r RPAR; \
+                KHI LPAR P ## s RPAR; \
+                IOTA(k); \
+        } while (0)
+
+/* Passing the body through DO() forces one extra expansion pass, which is
+ * what finally turns the "THETA ( ... )" token sequences into macro calls. */
+#define DO(x)   x
+
+#define KECCAK_F_1600   DO(KECCAK_F_1600_)
+
+/* The 24-round permutation as three groups of eight rounds.  Each round
+ * broadcasts its constant RC[j + i] to all four lanes; P8_TO_P0 closes
+ * every group.  Operates on the state reached through the local `kc`. */
+#define KECCAK_F_1600_   do { \
+    int j; \
+    for (j = 0; j < 24; j += 8) \
+    { \
+       KF_ELT( 0,  1, (_mm256_set_epi64x( RC[j + 0], RC[j + 0], \
+                                       RC[j + 0], RC[j + 0])) ); \
+       KF_ELT( 1,  2, (_mm256_set_epi64x( RC[j + 1], RC[j + 1], \
+                                       RC[j + 1], RC[j + 1])) ); \
+       KF_ELT( 2,  3, (_mm256_set_epi64x( RC[j + 2], RC[j + 2], \
+                                       RC[j + 2], RC[j + 2])) ); \
+       KF_ELT( 3,  4, (_mm256_set_epi64x( RC[j + 3], RC[j + 3], \
+                                       RC[j + 3], RC[j + 3])) ); \
+       KF_ELT( 4,  5, (_mm256_set_epi64x( RC[j + 4], RC[j + 4], \
+                                       RC[j + 4], RC[j + 4])) ); \
+       KF_ELT( 5,  6, (_mm256_set_epi64x( RC[j + 5], RC[j + 5], \
+                                       RC[j + 5], RC[j + 5])) ); \
+       KF_ELT( 6,  7, (_mm256_set_epi64x( RC[j + 6], RC[j + 6], \
+                                       RC[j + 6], RC[j + 6])) ); \
+       KF_ELT( 7,  8, (_mm256_set_epi64x( RC[j + 7], RC[j + 7], \
+                                       RC[j + 7], RC[j + 7])) ); \
+       P8_TO_P0; \
+    } \
+} while (0)
+
+
+/*
+ * Reset a 4-way context for a digest of out_size bits.
+ *
+ * Every state word is zeroed except the six that the "lane complement"
+ * representation keeps inverted, which start as all-ones.  The rate stored
+ * in kc->lim is 200 - out_size/4 bytes per lane.
+ */
+static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size )
+{
+   // State words held complemented between rounds.
+   static const unsigned char inverted[6] = { 1, 2, 8, 12, 17, 20 };
+   size_t k;
+
+   for ( k = 0; k < 25; k++ )
+      kc->w[k] = _mm256_setzero_si256();
+
+   for ( k = 0; k < sizeof inverted; k++ )
+      kc->w[ inverted[k] ] = m256_neg1;
+
+   kc->ptr = 0;
+   kc->lim = 200 - (out_size >> 2);
+}
+
+/*
+ * Absorb input into the sponge.
+ *
+ * kc    context; its buffer and state are updated in place
+ * data  interleaved input, one __m256i for every 8 bytes of each lane
+ * len   bytes per lane to absorb
+ * lim   rate in bytes per lane; a full block of lim bytes triggers one
+ *       permutation
+ *
+ * ptr, len and lim are byte counts, while the buffer is indexed in vectors
+ * (count >> 3).  NOTE(review): a count that is not a multiple of 8 is
+ * truncated by that shift; the callers visible here only pass multiples
+ * of 8.
+ *
+ * The names `kc` and `buf` must not change: INPUT_BUF and KECCAK_F_1600
+ * refer to them directly.
+ */
+static void
+keccak64_core( keccak64_ctx_m256i *kc, const void *data, size_t len,
+               size_t lim )
+{
+    __m256i *buf;
+    __m256i *vdata = (__m256i*)data;
+    size_t ptr;
+    DECL_STATE
+
+    buf = kc->buf;
+    ptr = kc->ptr;
+
+    /* Not enough to complete a block: just buffer it. */
+    if ( len < (lim - ptr) )
+    {
+        memcpy_256( buf + (ptr>>3), vdata, len>>3 );
+        kc->ptr = ptr + len;
+        return;
+    }
+
+    READ_STATE( kc );
+    while ( len > 0 )
+    {
+        size_t clen;
+
+        /* Copy as much as fits in the remainder of the current block. */
+        clen = (lim - ptr);
+        if ( clen > len )
+             clen = len;
+        memcpy_256( buf + (ptr>>3), vdata, clen>>3 );
+        ptr += clen;
+        vdata = vdata + (clen>>3);
+        len -= clen;
+        /* Block complete: XOR it into the state and permute. */
+        if ( ptr == lim )
+        {
+            INPUT_BUF( lim );
+            KECCAK_F_1600;
+            ptr = 0;
+        }
+    }
+    WRITE_STATE( kc );
+    kc->ptr = ptr;
+}
+
+/*
+ * Pad the buffered tail, absorb it, and write the digest.
+ *
+ * kc        context that has been fed through keccak64_core()
+ * dst       receives byte_len >> 3 interleaved __m256i words
+ * byte_len  digest size in bytes per lane
+ * lim       rate in bytes per lane; must not exceed 144
+ *
+ * The padding puts a 1 in the low bit of the first free word and sets the
+ * top bit of the last word of the block.  The context is not
+ * re-initialised afterwards; call the matching init before reusing it.
+ */
+static void keccak64_close( keccak64_ctx_m256i *kc, void *dst, size_t byte_len,
+            size_t lim )
+{
+    /* Fixed-size scratch: at most lim/8 padding vectors are ever built and
+     * lim is bounded by 144.  The former "__m256i tmp[lim + 1]" inside a
+     * union was a variable-length member (a GNU-only extension) and
+     * reserved far more stack than is used. */
+    __m256i pad[144 / 8];
+    const uint64_t first_pad = 0x100 >> 8;
+    const uint64_t last_pad = 0x8000000000000000;
+    const size_t m256_len = byte_len >> 3;
+    size_t j;
+
+    if ( kc->ptr == (lim - 8) )
+    {
+        /* A single word is left in the block: both pad bits share it. */
+        const uint64_t t = first_pad | last_pad;
+        pad[0] = _mm256_set_epi64x( t, t, t, t );
+        j = 8;
+    }
+    else
+    {
+        j = lim - kc->ptr;
+        pad[0] = _mm256_set_epi64x( first_pad, first_pad, first_pad,
+                                    first_pad );
+        memset_zero_256( pad + 1, (j>>3) - 2 );
+        pad[ (j>>3) - 1 ] = _mm256_set_epi64x( last_pad, last_pad, last_pad,
+                                               last_pad );
+    }
+    keccak64_core( kc, pad, j, lim );
+    /* Finalize the "lane complement" */
+    NOT64( kc->w[ 1], kc->w[ 1] );
+    NOT64( kc->w[ 2], kc->w[ 2] );
+    NOT64( kc->w[ 8], kc->w[ 8] );
+    NOT64( kc->w[12], kc->w[12] );
+    NOT64( kc->w[17], kc->w[17] );
+    NOT64( kc->w[20], kc->w[20] );
+    /* The digest is the leading state words; copy them out directly. */
+    memcpy_256( dst, kc->w, m256_len );
+}
+
+/* Keccak-256, 4-way: 256-bit digest, 136-byte rate per lane. */
+
+/* Reset the context pointed to by kc for a new Keccak-256 computation. */
+void keccak256_4way_init( void *kc )
+{
+    keccak64_init( (keccak64_ctx_m256i*)kc, 256 );
+}
+
+/* Absorb len bytes per lane of interleaved data. */
+void keccak256_4way( void *cc, const void *data, size_t len )
+{
+    keccak64_core( (keccak64_ctx_m256i*)cc, data, len, 136 );
+}
+
+/* Pad, finish, and write four interleaved 32-byte digests to dst. */
+void keccak256_4way_close( void *cc, void *dst )
+{
+    keccak64_close( (keccak64_ctx_m256i*)cc, dst, 32, 136 );
+}
+
+/* Keccak-512, 4-way: 512-bit digest, 72-byte rate per lane. */
+
+/* Reset the context pointed to by kc for a new Keccak-512 computation. */
+void keccak512_4way_init( void *kc )
+{
+    keccak64_init( (keccak64_ctx_m256i*)kc, 512 );
+}
+
+/* Absorb len bytes per lane of interleaved data. */
+void keccak512_4way( void *cc, const void *data, size_t len )
+{
+    keccak64_core( (keccak64_ctx_m256i*)cc, data, len, 72 );
+}
+
+/* Pad, finish, and write four interleaved 64-byte digests to dst. */
+void keccak512_4way_close( void *cc, void *dst )
+{
+    keccak64_close( (keccak64_ctx_m256i*)cc, dst, 64, 72 );
+}
+
+#endif
--- a/algo/keccak/keccak-hash-4way.h
+++ b/algo/keccak/keccak-hash-4way.h
@@ -0,0 +1,94 @@
+/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Keccak 4-way interface. This is the AVX2 interface for Keccak that
+ * processes four inputs in parallel, one per 64-bit lane of a __m256i,
+ * with output lengths 256 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     keccak-hash-4way.h (derived from sph_keccak.h)
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef KECCAK_HASH_4WAY_H__
+#define KECCAK_HASH_4WAY_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef  __AVX2__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+#include "avxdefs.h"
+
+/**
+ * Output size (in bits) for Keccak-256.
+ */
+#define SPH_SIZE_keccak256   256
+
+/**
+ * Output size (in bits) for Keccak-512.
+ */
+#define SPH_SIZE_keccak512   512
+
+/**
+ * This structure is a context for 4-way Keccak computations: it contains
+ * the intermediate values and some data from the last entered block, with
+ * each __m256i holding the same 64-bit word of four independent lanes.
+ *
+ * Closing a computation does not reset the context; call the matching
+ * init function again before starting another one.
+ *
+ * The contents of this structure are private. A running Keccak computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ *
+ * buf holds one block of at most 144 bytes per lane, i.e. 144/8 vectors;
+ * the block sizes used by the functions below are 136 and 72 bytes.
+ */
+
+typedef struct {
+        __m256i buf[144/8];    /* first field, for alignment */
+        __m256i w[25];
+        size_t ptr, lim;
+} keccak64_ctx_m256i;
+
+typedef keccak64_ctx_m256i keccak256_4way_context;
+typedef keccak64_ctx_m256i keccak512_4way_context;
+
+/* Each function takes a pointer to the matching context type above.
+ * len counts bytes per lane; data and dst are 4x64-bit interleaved. */
+void keccak256_4way_init(void *cc);
+void keccak256_4way(void *cc, const void *data, size_t len);
+void keccak256_4way_close(void *cc, void *dst);
+
+
+void keccak512_4way_init(void *cc);
+void keccak512_4way(void *cc, const void *data, size_t len);
+void keccak512_4way_close(void *cc, void *dst);
+/* NOTE(review): no definition of this function appears in
+ * keccak-hash-4way.c; confirm it is provided elsewhere before calling it. */
+void keccak512_4way_addbits_and_close(
+        void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/algo/keccak/keccak.c
+++ b/algo/keccak/keccak.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
@@ -51,17 +50,3 @@ int scanhash_keccak(int thr_id, struct work *work,
 	return 0;
 }

-void keccak_set_target( struct work* work, double job_diff )
-{
-  work_set_target( work, job_diff / (128.0 * opt_diff_factor) );
-}
-
-bool register_keccak_algo( algo_gate_t* gate )
-{
-  gate->scanhash        = (void*)&scanhash_keccak;
-  gate->hash            = (void*)&keccakhash;
-  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
-  gate->set_target      = (void*)&keccak_set_target;
-  return true;
-};
-
--- a/algo/keccak/sph_keccak.c
+++ b/algo/keccak/sph_keccak.c
@@ -955,6 +955,7 @@ static const struct {

 #endif

+
 #define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4)   do { \
 		DECL64(tt0); \
 		DECL64(tt1); \
@@ -1643,8 +1644,7 @@ keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim)
 		for (j = 0; j < d; j += 8) \
 			sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \
 		memcpy(dst, u.tmp, d); \
-		keccak_init(kc, (unsigned)d << 3); \
-	} \
+}

 #else

--- a/algo/keccak/sse2/keccak.c
+++ b/algo/keccak/sse2/keccak.c
@@ -775,10 +775,8 @@ static const sph_u64 RC[] = {
 			KF_ELT( 5,  6, RC[j + 5]); \
 			KF_ELT( 6,  7, RC[j + 6]); \
 			KF_ELT( 7,  8, RC[j + 7]); \
-*/
-
-	//kekDECL_STATE \
-        
+	kekDECL_STATE \
+*/        
 #define DECL_KEC  


--- a/algo/lbry.c
+++ b/algo/lbry.c
@@ -1,260 +0,0 @@
-#include "miner.h"
-#include "algo-gate-api.h"
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdio.h>
-#include "ripemd/sph_ripemd.h"
-#include "sha/sph_sha2.h"
-#if defined __SHA__
- #include <openssl/sha.h>
-#endif
-
-#define LBRY_NTIME_INDEX 25
-#define LBRY_NBITS_INDEX 26
-#define LBRY_NONCE_INDEX 27
-#define LBRY_WORK_DATA_SIZE 192
-#define LBRY_WORK_CMP_SIZE 76  // same as default
-
-/* Move init out of loop, so init once externally, and then use one single memcpy with that bigger memory block */
-typedef struct {
-#if defined __SHA__
-   SHA256_CTX             sha256;
-#else
-   sph_sha256_context     sha256;
-#endif
-   sph_sha512_context     sha512;
-   sph_ripemd160_context  ripemd;
-} lbryhash_context_holder;
-
-/* no need to copy, because close reinit the context */
-static  lbryhash_context_holder ctx __attribute__ ((aligned (64)));
-
-void init_lbry_contexts(void *dummy)
-{
-#if defined __SHA__
-   SHA256_Init( &ctx.sha256 );
-#else
-   sph_sha256_init( &ctx.sha256 );
-#endif
-   sph_sha512_init( &ctx.sha512 );
-   sph_ripemd160_init( &ctx.ripemd );
-}
-
-void lbry_hash(void* output, const void* input)
-{
-#if defined __SHA__
-   SHA256_CTX              ctx_sha256 __attribute__ ((aligned (64)));
-#else
-   sph_sha256_context      ctx_sha256 __attribute__ ((aligned (64)));
-#endif
-   sph_sha512_context      ctx_sha512 __attribute__ ((aligned (64)));
-   sph_ripemd160_context   ctx_ripemd __attribute__ ((aligned (64)));
-   uint32_t _ALIGN(64) hashA[16];
-   uint32_t _ALIGN(64) hashB[16];
-   uint32_t _ALIGN(64) hashC[16];
-
-#if defined __SHA__
-   SHA256_Init( &ctx_sha256 );
-   SHA256_Update( &ctx_sha256, input, 112 );
-   SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
-
-   SHA256_Init( &ctx_sha256 );
-   SHA256_Update( &ctx_sha256, hashA, 32 );
-   SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
-#else
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256 ( &ctx_sha256, input, 112 );
-   sph_sha256_close( &ctx_sha256, hashA );
-
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256 ( &ctx_sha256, hashA, 32 );
-   sph_sha256_close( &ctx_sha256, hashA );
-#endif
-
-   sph_sha512_init( &ctx_sha512 );
-   sph_sha512 ( &ctx_sha512, hashA, 32 );
-   sph_sha512_close( &ctx_sha512, hashA );  
-
-   sph_ripemd160_init( &ctx_ripemd );
-   sph_ripemd160 ( &ctx_ripemd, hashA, 32 );
-   sph_ripemd160_close( &ctx_ripemd, hashB );
-
-   sph_ripemd160_init( &ctx_ripemd );
-   sph_ripemd160 ( &ctx_ripemd, hashA+8, 32 );
-   sph_ripemd160_close( &ctx_ripemd, hashC );
-
-#if defined __SHA__
-   SHA256_Init( &ctx_sha256 );
-   SHA256_Update( &ctx_sha256, hashB, 20 );
-   SHA256_Update( &ctx_sha256, hashC, 20 );
-   SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
-
-   SHA256_Init( &ctx_sha256 );
-   SHA256_Update( &ctx_sha256, hashA, 32 );
-   SHA256_Final( (unsigned char*) hashA, &ctx_sha256 );
-#else
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256 ( &ctx_sha256, hashB, 20 );
-   sph_sha256 ( &ctx_sha256, hashC, 20 );
-   sph_sha256_close( &ctx_sha256, hashA );
-
-   sph_sha256_init( &ctx_sha256 );
-   sph_sha256 ( &ctx_sha256, hashA, 32 );
-   sph_sha256_close( &ctx_sha256, hashA );
-#endif
-   memcpy( output, hashA, 32 );
-}
-
-int scanhash_lbry( int thr_id, struct work *work, uint32_t max_nonce,
-                   uint64_t *hashes_done)
-{
-  uint32_t *pdata = work->data;
-  uint32_t *ptarget = work->target;
-	uint32_t n = pdata[27] - 1;
-	const uint32_t first_nonce = pdata[27];
-	const uint32_t Htarg = ptarget[7];
-
-	uint32_t hash64[8] __attribute__((aligned(64)));
-	uint32_t endiandata[32] __attribute__ ((aligned (64)));
-
-	uint64_t htmax[] = {
-		0,
-		0xF,
-		0xFF,
-		0xFFF,
-		0xFFFF,
-		0x10000000
-	};
-	uint32_t masks[] = {
-		0xFFFFFFFF,
-		0xFFFFFFF0,
-		0xFFFFFF00,
-		0xFFFFF000,
-		0xFFFF0000,
-		0
-	};
-
-	// we need bigendian data...
-        swab32_array( endiandata, pdata, 32 );
-
-#ifdef DEBUG_ALGO
-	printf("[%d] Htarg=%X\n", thr_id, Htarg);
-#endif
-	for (int m=0; m < sizeof(masks); m++) {
-		if (Htarg <= htmax[m]) {
-			uint32_t mask = masks[m];
-			do {
-				pdata[27] = ++n;
-				be32enc(&endiandata[27], n);
-				lbry_hash(hash64, &endiandata);
-#ifndef DEBUG_ALGO
-				if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) {
-					*hashes_done = n - first_nonce + 1;
-					return true;
-				}
-#else
-				if (!(n % 0x1000) && !thr_id) printf(".");
-				if (!(hash64[7] & mask)) {
-					printf("[%d]",thr_id);
-					if (fulltest(hash64, ptarget)) {
-						*hashes_done = n - first_nonce + 1;
-						return true;
-					}
-				}
-#endif
-			} while (n < max_nonce && !work_restart[thr_id].restart);
-			// see blake.c if else to understand the loop on htmax => mask
-			break;
-		}
-	}
-
-	*hashes_done = n - first_nonce + 1;
-	pdata[27] = n;
-	return 0;
-}
-
-double lbry_calc_network_diff( struct work *work )
-{
-        // sample for diff 43.281 : 1c05ea29
-        // todo: endian reversed on longpoll could be zr5 specific...
-
-   uint32_t nbits = swab32( work->data[ LBRY_NBITS_INDEX ] );
-   uint32_t bits = (nbits & 0xffffff);
-   int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
-   double d = (double)0x0000ffff / (double)bits;
-
-   for (int m=shift; m < 29; m++) d *= 256.0;
-   for (int m=29; m < shift; m++) d /= 256.0;
-   if (opt_debug_diff)
-      applog(LOG_DEBUG, "net diff: %f -> shift %u, bits %08x", d, shift, bits);
-
-   return d;
-}
-
-// std_le should work but it doesn't
-void lbry_le_build_stratum_request( char *req, struct work *work,
-                                      struct stratum_ctx *sctx )
-{
-   unsigned char *xnonce2str;
-   uint32_t ntime, nonce;
-   char ntimestr[9], noncestr[9];
-
-   le32enc( &ntime, work->data[ LBRY_NTIME_INDEX ] );
-   le32enc( &nonce, work->data[ LBRY_NONCE_INDEX ] );
-   bin2hex( ntimestr, (char*)(&ntime), sizeof(uint32_t) );
-   bin2hex( noncestr, (char*)(&nonce), sizeof(uint32_t) );
-   xnonce2str = abin2hex( work->xnonce2, work->xnonce2_len);
-   snprintf( req, JSON_BUF_LEN,
-        "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
-         rpc_user, work->job_id, xnonce2str, ntimestr, noncestr );
-   free(xnonce2str);
-}
-
-void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
-{
-   unsigned char merkle_root[64] = { 0 };
-   size_t t;
-   int i;
-
-   algo_gate.gen_merkle_root( merkle_root, sctx );
-   // Increment extranonce2 
-   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
-   // Assemble block header 
-   memset( g_work->data, 0, sizeof(g_work->data) );
-   g_work->data[0] = le32dec( sctx->job.version );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[1 + i] = le32dec( (uint32_t *) sctx->job.prevhash + i );
-   for ( i = 0; i < 8; i++ )
-      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );
-   for ( int i = 0; i < 8; i++ )
-        g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
-   g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
-   g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
-   g_work->data[28] = 0x80000000;
-}
-
-void lbry_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-int64_t lbry_get_max64() { return 0x1ffffLL; }
-
-bool register_lbry_algo( algo_gate_t* gate )
-{
-  gate->optimizations = SSE2_OPT | SHA_OPT;
-  gate->scanhash              = (void*)&scanhash_lbry;
-  gate->hash                  = (void*)&lbry_hash;
-  gate->calc_network_diff     = (void*)&lbry_calc_network_diff;
-  gate->get_max64             = (void*)&lbry_get_max64;
-  gate->build_stratum_request = (void*)&lbry_le_build_stratum_request;
-  gate->build_extraheader     = (void*)&lbry_build_extraheader;
-  gate->set_target            = (void*)&lbry_set_target;
-  gate->ntime_index           = LBRY_NTIME_INDEX;
-  gate->nbits_index           = LBRY_NBITS_INDEX;
-  gate->nonce_index           = LBRY_NONCE_INDEX;
-  gate->work_data_size        = LBRY_WORK_DATA_SIZE;
-  return true;
-}
-
--- a/algo/luffa/luffa-hash-2way.c
+++ b/algo/luffa/luffa-hash-2way.c
@@ -0,0 +1,583 @@
+/*
+ * luffa_for_sse2.c
+ * Version 2.0 (Sep 15th 2009)
+ *
+ * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
+ *
+ * Hitachi, Ltd. is the owner of this software and hereby grant
+ * the U.S. Government and any interested party the right to use
+ * this software for the purposes of the SHA-3 evaluation process,
+ * notwithstanding that this software is copyrighted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <string.h>
+#include <immintrin.h>
+#include "luffa-hash-2way.h"
+
+#if defined(__AVX2__)
+
+#include "avxdefs.h"
+// MASK: all-ones in the lowest 32-bit word of each 128-bit half, zero elsewhere.
+#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \
+                               0UL, 0UL, 0UL, 0xffffffffUL )
+// a ^= c0; b ^= c1. Expands to two bare statements, so never use it as an unbraced if/for body.
+#define ADD_CONSTANT(a,b,c0,c1)\
+    a = _mm256_xor_si256(a,c0);\
+    b = _mm256_xor_si256(b,c1);
+
+#define MULT2(a0,a1) /* updates a0 and a1 in place; both arguments are evaluated several times */ \
+do { /* b = a0 ^ (lowest word of a1 copied into words 0, 1 and 3 of each 128-bit half) */ \
+  register __m256i b = _mm256_xor_si256( a0, \
+                   _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \
+  a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \
+  a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) );  \
+} while(0)
+
+// One step on x[0..7]: x, c and t point to 8 state, 2 constant and 2 scratch vectors.
+// Pointer arithmetic confirmed OK. TODO: use array indexes instead of *(p+n).
+#define STEP_PART(x,c,t)\
+    SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\
+    SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\
+    MIXWORD(*x,*(x+4),*t,*(t+1));\
+    MIXWORD(*(x+1),*(x+5),*t,*(t+1));\
+    MIXWORD(*(x+2),*(x+6),*t,*(t+1));\
+    MIXWORD(*(x+3),*(x+7),*t,*(t+1));\
+    ADD_CONSTANT(*x, *(x+4), *c, *(c+1));
+// Bitwise or/xor/and/andnot network that rewrites a0..a3 in place; t is a scratch vector.
+#define SUBCRUMB(a0,a1,a2,a3,t)\
+    t  = _mm256_load_si256(&a0);\
+    a0 = _mm256_or_si256(a0,a1);\
+    a2 = _mm256_xor_si256(a2,a3);\
+    a1 = _mm256_andnot_si256(a1, m256_neg1 );\
+    a0 = _mm256_xor_si256(a0,a3);\
+    a3 = _mm256_and_si256(a3,t);\
+    a1 = _mm256_xor_si256(a1,a3);\
+    a3 = _mm256_xor_si256(a3,a2);\
+    a2 = _mm256_and_si256(a2,a0);\
+    a0 = _mm256_andnot_si256(a0, m256_neg1 );\
+    a2 = _mm256_xor_si256(a2,a1);\
+    a1 = _mm256_or_si256(a1,a3);\
+    t  = _mm256_xor_si256(t,a1);\
+    a3 = _mm256_xor_si256(a3,a2);\
+    a2 = _mm256_and_si256(a2,a1);\
+    a1 = _mm256_xor_si256(a1,a0);\
+    a0 = _mm256_load_si256(&t);
+
+#define MIXWORD(a,b,t1,t2) /* updates a and b in place; t1, t2 are scratch */ \
+    b  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(a,2);\
+    t2 = _mm256_srli_epi32(a,30);\
+     a = _mm256_or_si256(t1,t2); /* a rotated left 2 bits per 32-bit word */ \
+    a  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(b,14);\
+    t2 = _mm256_srli_epi32(b,18);\
+    b  = _mm256_or_si256(t1,t2); /* b rotated left 14 bits */ \
+    b  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(a,10);\
+    t2 = _mm256_srli_epi32(a,22);\
+    a  = _mm256_or_si256(t1,t2); /* a rotated left 10 bits */ \
+    a  = _mm256_xor_si256(a,b);\
+    t1 = _mm256_slli_epi32(b,1);\
+    t2 = _mm256_srli_epi32(b,31);\
+    b  = _mm256_or_si256(t1,t2);
+// Step for one vector pair a0:a1 with constants c0:c1; t0, t1, tmp0, tmp1 are scratch.
+#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\
+    a1 = _mm256_shuffle_epi32(a1,147);\
+    t0 = _mm256_load_si256(&a1);\
+    a1 = _mm256_unpacklo_epi32(a1,a0);\
+    t0 = _mm256_unpackhi_epi32(t0,a0);\
+    t1 = _mm256_shuffle_epi32(t0,78);\
+    a0 = _mm256_shuffle_epi32(a1,78);\
+    SUBCRUMB(t1,t0,a0,a1,tmp0);\
+    t0 = _mm256_unpacklo_epi32(t0,t1);\
+    a1 = _mm256_unpacklo_epi32(a1,a0);\
+    a0 = _mm256_load_si256(&a1);\
+    a0 = _mm256_unpackhi_epi64(a0,t0);\
+    a1 = _mm256_unpacklo_epi64(a1,t0);\
+    a1 = _mm256_shuffle_epi32(a1,57);\
+    MIXWORD(a0,a1,tmp0,tmp1);\
+    ADD_CONSTANT(a0,a1,c0,c1);
+// s0..s3 <- unpack of r0..r2, q0..q3 <- unpack of p0..p2; r0..r2 and p0..p2 are clobbered. Not referenced in this file.
+#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\
+    s2 = _mm256_load_si256(&r1);\
+    q2 = _mm256_load_si256(&p1);\
+    r2 = _mm256_shuffle_epi32(r2,216);\
+    p2 = _mm256_shuffle_epi32(p2,216);\
+    r1 = _mm256_unpacklo_epi32(r1,r0);\
+    p1 = _mm256_unpacklo_epi32(p1,p0);\
+    s2 = _mm256_unpackhi_epi32(s2,r0);\
+    q2 = _mm256_unpackhi_epi32(q2,p0);\
+    s0 = _mm256_load_si256(&r2);\
+    q0 = _mm256_load_si256(&p2);\
+    r2 = _mm256_unpacklo_epi64(r2,r1);\
+    p2 = _mm256_unpacklo_epi64(p2,p1);\
+    s1 = _mm256_load_si256(&s0);\
+    q1 = _mm256_load_si256(&q0);\
+    s0 = _mm256_unpackhi_epi64(s0,r1);\
+    q0 = _mm256_unpackhi_epi64(q0,p1);\
+    r2 = _mm256_shuffle_epi32(r2,225);\
+    p2 = _mm256_shuffle_epi32(p2,225);\
+    r0 = _mm256_load_si256(&s1);\
+    p0 = _mm256_load_si256(&q1);\
+    s0 = _mm256_shuffle_epi32(s0,225);\
+    q0 = _mm256_shuffle_epi32(q0,225);\
+    s1 = _mm256_unpacklo_epi64(s1,s2);\
+    q1 = _mm256_unpacklo_epi64(q1,q2);\
+    r0 = _mm256_unpackhi_epi64(r0,s2);\
+    p0 = _mm256_unpackhi_epi64(p0,q2);\
+    s2 = _mm256_load_si256(&r0);\
+    q2 = _mm256_load_si256(&p0);\
+    s3 = _mm256_load_si256(&r2);\
+    q3 = _mm256_load_si256(&p2);
+
+#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2) /* s0..s2 <- r0..r3, q0..q2 <- p0..p3; r0..r2, p0..p2 clobbered; not referenced in this file */ \
+    s0 = _mm256_load_si256(&r0);\
+    q0 = _mm256_load_si256(&p0);\
+    s1 = _mm256_load_si256(&r2);\
+    q1 = _mm256_load_si256(&p2);\
+    r0 = _mm256_unpackhi_epi32(r0,r1);\
+    p0 = _mm256_unpackhi_epi32(p0,p1);\
+    r2 = _mm256_unpackhi_epi32(r2,r3);\
+    p2 = _mm256_unpackhi_epi32(p2,p3);\
+    s0 = _mm256_unpacklo_epi32(s0,r1);\
+    q0 = _mm256_unpacklo_epi32(q0,p1);\
+    s1 = _mm256_unpacklo_epi32(s1,r3);\
+    q1 = _mm256_unpacklo_epi32(q1,p3);\
+    r1 = _mm256_load_si256(&r0);\
+    p1 = _mm256_load_si256(&p0);\
+    r0 = _mm256_unpackhi_epi64(r0,r2);\
+    p0 = _mm256_unpackhi_epi64(p0,p2);\
+    s0 = _mm256_unpackhi_epi64(s0,s1);\
+    q0 = _mm256_unpackhi_epi64(q0,q1);\
+    r1 = _mm256_unpacklo_epi64(r1,r2);\
+    p1 = _mm256_unpacklo_epi64(p1,p2);\
+    s2 = _mm256_load_si256(&r0);\
+    q2 = _mm256_load_si256(&p0);\
+    s1 = _mm256_load_si256(&r1);\
+    q1 = _mm256_load_si256(&p1);
+
+#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3) /* s0..s3 <- unpack of r0..r3, q0..q3 <- unpack of p0..p3; r1, r3, p1, p3 are clobbered */ \
+    s1 = _mm256_load_si256(&r3);\
+    q1 = _mm256_load_si256(&p3);\
+    s3 = _mm256_load_si256(&r3);\
+    q3 = _mm256_load_si256(&p3);\
+    s1 = _mm256_unpackhi_epi32(s1,r2);\
+    q1 = _mm256_unpackhi_epi32(q1,p2);\
+    s3 = _mm256_unpacklo_epi32(s3,r2);\
+    q3 = _mm256_unpacklo_epi32(q3,p2);\
+    s0 = _mm256_load_si256(&s1);\
+    q0 = _mm256_load_si256(&q1);\
+    s2 = _mm256_load_si256(&s3);\
+    q2 = _mm256_load_si256(&q3);\
+    r3 = _mm256_load_si256(&r1);\
+    p3 = _mm256_load_si256(&p1);\
+    r1 = _mm256_unpacklo_epi32(r1,r0);\
+    p1 = _mm256_unpacklo_epi32(p1,p0);\
+    r3 = _mm256_unpackhi_epi32(r3,r0);\
+    p3 = _mm256_unpackhi_epi32(p3,p0);\
+    s0 = _mm256_unpackhi_epi64(s0,r3);\
+    q0 = _mm256_unpackhi_epi64(q0,p3);\
+    s1 = _mm256_unpacklo_epi64(s1,r3);\
+    q1 = _mm256_unpacklo_epi64(q1,p3);\
+    s2 = _mm256_unpackhi_epi64(s2,r1);\
+    q2 = _mm256_unpackhi_epi64(q2,p1);\
+    s3 = _mm256_unpacklo_epi64(s3,r1);\
+    q3 = _mm256_unpacklo_epi64(q3,p1);
+// Alias of NMLTOM1024 (the identical unpack network); rnd512_2way uses it to move x[] back into chainv[].
+#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\
+    NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3);
+
+/* Initial values of the chaining variables: four words per chainv[] entry */
+static const uint32 IV[40] __attribute((aligned(32))) = {
+    0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69,
+    0xdef610bb,0xee058139,0x90152df4,0x6e292011,
+    0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95,
+    0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557,
+    0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d,
+    0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f,
+    0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5,
+    0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a,
+    0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be,
+    0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999
+};
+
+/* Round constants, four words per CNS[] entry: CNS[0..15] feed STEP_PART, CNS[16..31] feed STEP_PART2 */
+static const uint32 CNS_INIT[128] __attribute((aligned(32))) = {
+    0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6,
+    0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818,
+    0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299,
+    0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d,
+    0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12,
+    0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442,
+    0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e,
+    0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f,
+    0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f,
+    0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6,
+    0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d,
+    0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4,
+    0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882,
+    0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7,
+    0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12,
+    0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d,
+    0x00000000,0x00000000,0x00000000,0xf0d2e9e3,
+    0x00000000,0x00000000,0x00000000,0x5090d577,
+    0x00000000,0x00000000,0x00000000,0xac11d7fa,
+    0x00000000,0x00000000,0x00000000,0x2d1925ab,
+    0x00000000,0x00000000,0x00000000,0x1bcb66f2,
+    0x00000000,0x00000000,0x00000000,0xb46496ac,
+    0x00000000,0x00000000,0x00000000,0x6f2d9bc9,
+    0x00000000,0x00000000,0x00000000,0xd1925ab0,
+    0x00000000,0x00000000,0x00000000,0x78602649,
+    0x00000000,0x00000000,0x00000000,0x29131ab6,
+    0x00000000,0x00000000,0x00000000,0x8edae952,
+    0x00000000,0x00000000,0x00000000,0x0fc053c3,
+    0x00000000,0x00000000,0x00000000,0x3b6ba548,
+    0x00000000,0x00000000,0x00000000,0x3f014f0c,
+    0x00000000,0x00000000,0x00000000,0xedae9520,
+    0x00000000,0x00000000,0x00000000,0xfc053c31
+};
+// Vector form of CNS_INIT; filled by luffa_2way_init and shared by every context (not static).
+__m256i CNS[32];
+
+/***************************************************/
+/* Round function         */
+/* state: hash context    */
+/* msg: two 256-bit message vectors; state->chainv is updated in place */
+void rnd512_2way( luffa_2way_context *state, __m256i *msg )
+{
+    __m256i t0, t1;
+    __m256i *chainv = state->chainv;
+    __m256i msg0, msg1;
+    __m256i tmp[2];
+    __m256i x[8];
+    // t0:t1 = XOR of the five chaining-value pairs chainv[0..9]
+    t0 = chainv[0];
+    t1 = chainv[1];
+
+    t0 = _mm256_xor_si256( t0, chainv[2] );
+    t1 = _mm256_xor_si256( t1, chainv[3] );
+    t0 = _mm256_xor_si256( t0, chainv[4] );
+    t1 = _mm256_xor_si256( t1, chainv[5] );
+    t0 = _mm256_xor_si256( t0, chainv[6] );
+    t1 = _mm256_xor_si256( t1, chainv[7] );
+    t0 = _mm256_xor_si256( t0, chainv[8] );
+    t1 = _mm256_xor_si256( t1, chainv[9] );
+
+    MULT2( t0, t1 );
+    // shuffle immediate 27 reverses the four 32-bit words of each 128-bit half
+    msg0 = _mm256_shuffle_epi32( msg[0], 27 );
+    msg1 = _mm256_shuffle_epi32( msg[1], 27 );
+    // fold the MULT2 result back into every chaining value
+    chainv[0] = _mm256_xor_si256( chainv[0], t0 );
+    chainv[1] = _mm256_xor_si256( chainv[1], t1 );
+    chainv[2] = _mm256_xor_si256( chainv[2], t0 );
+    chainv[3] = _mm256_xor_si256( chainv[3], t1 );
+    chainv[4] = _mm256_xor_si256( chainv[4], t0 );
+    chainv[5] = _mm256_xor_si256( chainv[5], t1 );
+    chainv[6] = _mm256_xor_si256( chainv[6], t0 );
+    chainv[7] = _mm256_xor_si256( chainv[7], t1 );
+    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
+    chainv[9] = _mm256_xor_si256( chainv[9], t1 );
+    // first pass: each pair becomes MULT2(pair) ^ next pair; the last one uses the saved pair 0
+    t0 = chainv[0];
+    t1 = chainv[1];
+
+    MULT2( chainv[0], chainv[1]);
+    chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] );
+    chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] );
+
+    MULT2( chainv[2], chainv[3]);
+    chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]);
+    chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]);
+
+    MULT2( chainv[4], chainv[5]);
+    chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]);
+    chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]);
+
+    MULT2( chainv[6], chainv[7]);
+    chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]);
+    chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]);
+
+    MULT2( chainv[8], chainv[9]);
+    chainv[8] = _mm256_xor_si256( chainv[8], t0 );
+    chainv[9] = _mm256_xor_si256( chainv[9], t1 );
+    // second pass runs the other way: each pair becomes MULT2(pair) ^ previous pair
+    t0 = chainv[8];
+    t1 = chainv[9];
+
+    MULT2( chainv[8], chainv[9]);
+    chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] );
+    chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] );
+
+    MULT2( chainv[6], chainv[7]);
+    chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] );
+    chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] );
+
+    MULT2( chainv[4], chainv[5]);
+    chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] );
+    chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] );
+
+    MULT2( chainv[2], chainv[3] );
+    chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] );
+    chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] );
+
+    MULT2( chainv[0], chainv[1] );
+    chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t0 ), msg0 );
+    chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t1 ), msg1 );
+    // message injection: pair k (k = 1..4) receives the message after k MULT2 applications
+    MULT2( msg0, msg1);
+    chainv[2] = _mm256_xor_si256( chainv[2], msg0 );
+    chainv[3] = _mm256_xor_si256( chainv[3], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[4] = _mm256_xor_si256( chainv[4], msg0 );
+    chainv[5] = _mm256_xor_si256( chainv[5], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[6] = _mm256_xor_si256( chainv[6], msg0 );
+    chainv[7] = _mm256_xor_si256( chainv[7], msg1 );
+
+    MULT2( msg0, msg1);
+    chainv[8] = _mm256_xor_si256( chainv[8], msg0 );
+    chainv[9] = _mm256_xor_si256( chainv[9], msg1 );
+
+    MULT2( msg0, msg1);
+    // rotate each 32-bit word of chainv[3], [5], [7], [9] left by 1, 2, 3, 4 bits
+    chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3],  1 ),
+                                 _mm256_srli_epi32( chainv[3], 31 ) );
+    chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5],  2 ),
+                                 _mm256_srli_epi32( chainv[5], 30 ) );
+    chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7],  3 ),
+                                 _mm256_srli_epi32( chainv[7], 29 ) );
+    chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9],  4 ),
+                                 _mm256_srli_epi32( chainv[9], 28 ) );
+    // unpack chainv[0..7] into x[0..7] for the step rounds
+    NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6],
+                x[0], x[1], x[2], x[3],
+                chainv[1],chainv[3],chainv[5],chainv[7],
+                x[4], x[5], x[6], x[7] );
+    // eight STEP_PART rounds on x[0..7]; each one reads two consecutive CNS entries
+    STEP_PART( &x[0], &CNS[ 0], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 2], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 4], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 6], &tmp[0] );
+    STEP_PART( &x[0], &CNS[ 8], &tmp[0] );
+    STEP_PART( &x[0], &CNS[10], &tmp[0] );
+    STEP_PART( &x[0], &CNS[12], &tmp[0] );
+    STEP_PART( &x[0], &CNS[14], &tmp[0] );
+    // the same unpack network writes x[0..7] back into chainv[0..7]
+    MIXTON1024( x[0], x[1], x[2], x[3],
+                chainv[0], chainv[2], chainv[4],chainv[6],
+                x[4], x[5], x[6], x[7],
+                chainv[1],chainv[3],chainv[5],chainv[7]);
+
+    /* Process last 256-bit block */
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[16], CNS[17],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[18], CNS[19],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[20], CNS[21],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[22], CNS[23],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[24], CNS[25],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[26], CNS[27],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[28], CNS[29],
+                tmp[0], tmp[1] );
+    STEP_PART2( chainv[8], chainv[9], t0, t1, CNS[30], CNS[31],
+                tmp[0], tmp[1] );
+}
+
+
+/***************************************************/
+/* Finalization function                           */
+/* state: hash context                             */
+/* b: output, filled via casti_m256i slots 0..3    */
+
+void finalization512_2way( luffa_2way_context *state, uint32 *b )
+{
+    uint32 hash[8*2] __attribute((aligned(64)));   // staging for both t[0] and t[1]
+    __m256i* chainv = state->chainv;
+    __m256i t[2];
+    __m256i zero[2];
+    zero[0] = zero[1] = _mm256_setzero_si256();
+
+    /*---- blank round with m=0 ----*/
+    rnd512_2way( state, zero );
+    // t[0]:t[1] = XOR of the five chaining-value pairs
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+
+    t[0] = _mm256_xor_si256( t[0], chainv[2] );
+    t[1] = _mm256_xor_si256( t[1], chainv[3] );
+    t[0] = _mm256_xor_si256( t[0], chainv[4] );
+    t[1] = _mm256_xor_si256( t[1], chainv[5] );
+    t[0] = _mm256_xor_si256( t[0], chainv[6] );
+    t[1] = _mm256_xor_si256( t[1], chainv[7] );
+    t[0] = _mm256_xor_si256( t[0], chainv[8] );
+    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+    // immediate 27 reverses the four 32-bit words of each 128-bit half
+    t[0] = _mm256_shuffle_epi32( t[0], 27 );
+    t[1] = _mm256_shuffle_epi32( t[1], 27 );
+    // two 32-byte stores: hash needs 16 words (an 8-word array was overrun at hash[8])
+    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
+    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
+
+    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
+    // second blank round produces output slots 2 and 3
+    rnd512_2way( state, zero );
+
+    t[0] = chainv[0];
+    t[1] = chainv[1];
+    t[0] = _mm256_xor_si256( t[0], chainv[2] );
+    t[1] = _mm256_xor_si256( t[1], chainv[3] );
+    t[0] = _mm256_xor_si256( t[0], chainv[4] );
+    t[1] = _mm256_xor_si256( t[1], chainv[5] );
+    t[0] = _mm256_xor_si256( t[0], chainv[6] );
+    t[1] = _mm256_xor_si256( t[1], chainv[7] );
+    t[0] = _mm256_xor_si256( t[0], chainv[8] );
+    t[1] = _mm256_xor_si256( t[1], chainv[9] );
+
+    t[0] = _mm256_shuffle_epi32( t[0], 27 );
+    t[1] = _mm256_shuffle_epi32( t[1], 27 );
+
+    _mm256_store_si256( (__m256i*)&hash[0], t[0] );
+    _mm256_store_si256( (__m256i*)&hash[8], t[1] );
+
+    casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) );
+}
+// Set up a context: store hashbitlen, (re)build CNS, load the IV, reset the buffer. Returns 0.
+int luffa_2way_init( luffa_2way_context *state, int hashbitlen )
+{
+    int i;
+    state->hashbitlen = hashbitlen;
+    // CNS is file-global and rebuilt on every call; each 4-word constant is duplicated in both 128-bit halves
+    for ( i=0; i<32; i++ ) CNS[i] =
+          _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
+                            CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2)    ],
+                            CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ],
+                            CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2)    ] );
+    // chaining values: IV words duplicated into both 128-bit halves the same way
+    for ( i=0; i<10; i++ ) state->chainv[i] =
+          _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
+                            IV[ (i<<2) +1 ], IV[ (i<<2)    ],
+                            IV[ (i<<2) +3 ], IV[ (i<<2) +2 ],
+                            IV[ (i<<2) +1 ], IV[ (i<<2)    ] );
+    // set both vectors of the partial-block buffer to m256_zero
+    ((__m256i*)state->buffer)[0] = m256_zero;
+    ((__m256i*)state->buffer)[1] = m256_zero;
+
+    return 0;
+}
+
+// Absorb data; each full 32 bytes of len consumes two 256-bit vectors of data. Returns 0.
+// Afterwards call only luffa_2way_update or luffa_2way_close, never luffa_2way_update_close.
+int luffa_2way_update( luffa_2way_context *state, const void *data,
+                       size_t len )
+{
+    const __m256i *vdata  = (const __m256i*)data;
+    __m256i *buffer = (__m256i*)state->buffer;
+    __m256i msg[2];
+    int i;
+    const int blocks = (int)( len >> 5 );
+    state->rembytes = (int)( len & 0x1F );
+
+    // full blocks
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
+       msg[1] = mm256_bswap_32( vdata[ 1 ] );
+       rnd512_2way( state, msg );
+    }
+
+    // 16 byte partial block exists for 80 byte len
+    // store in buffer for transform in final for midstate to work
+    if ( state->rembytes  )
+    {
+      // remaining data bytes (one whole vector is read, then a fixed pad vector follows)
+      buffer[0] = mm256_bswap_32( vdata[0] );
+      buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                   0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+    }
+    return 0;
+}
+// Transform the pending pad block, then write the digest to hashval. Always returns 0.
+int luffa_2way_close( luffa_2way_context *state, void *hashval )
+{
+    __m256i *buffer = (__m256i*)state->buffer;
+    __m256i msg[2];
+
+    // transform pad block
+    if ( state->rembytes )
+      // not empty, data is in buffer
+      rnd512_2way( state, buffer );
+    else
+    {     // empty pad block, constant data
+      msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+      msg[1] = m256_zero;
+      rnd512_2way( state, msg );
+    }
+    finalization512_2way( state, (uint32*)hashval );
+    // NOTE(review): no DIGEST_BIT_LEN_* exceeds 512, so this branch looks unreachable - confirm.
+    if ( state->hashbitlen > 512 )
+        finalization512_2way( state, (uint32*)( (char*)hashval + 32 ) );
+    return 0;
+}
+// One-shot absorb + finalize into output; not to be used after luffa_2way_update. Returns 0.
+int luffa_2way_update_close( luffa_2way_context *state,
+                 void *output, const void *data, size_t inlen )
+{
+// Optimized for integral multiples of 16 bytes, good for 64 and 80 byte len
+    const __m256i *vdata  = (const __m256i*)data;
+    __m256i msg[2];
+    int i;
+    const int blocks = (int)( inlen >> 5 );
+    state->rembytes = (int)( inlen & 0x1F );
+
+    // full blocks
+    for ( i = 0; i < blocks; i++, vdata+=2 )
+    {
+       msg[0] = mm256_bswap_32( vdata[ 0 ] );
+       msg[1] = mm256_bswap_32( vdata[ 1 ] );
+       rnd512_2way( state, msg );
+    }
+
+    // 16 byte partial block exists for 80 byte len
+    if ( state->rembytes  )
+    {
+       // padding of partial block
+       msg[0] = mm256_bswap_32( vdata[0] );
+       msg[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+    }
+    else
+    {
+       // empty pad block
+       msg[0] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0,
+                                 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
+       msg[1] = m256_zero;
+    }
+    // both branches only build the final block; transform it once here
+    rnd512_2way( state, msg );
+
+    finalization512_2way( state, (uint32*)output );
+    // byte offset taken through char*: arithmetic on void* is a GNU extension
+    if ( state->hashbitlen > 512 )
+        finalization512_2way( state, (uint32*)( (char*)output + 32 ) );
+    return 0;
+}
+
+#endif
--- a/algo/luffa/luffa-hash-2way.h
+++ b/algo/luffa/luffa-hash-2way.h
@@ -0,0 +1,69 @@
+#if !defined(LUFFA_HASH_2WAY_H__)
+#define LUFFA_HASH_2WAY_H__ 1
+/*
+ * luffa_for_sse2.h
+ * Version 2.0 (Sep 15th 2009)
+ *
+ * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
+ *
+ * Hitachi, Ltd. is the owner of this software and hereby grant
+ * the U.S. Government and any interested party the right to use
+ * this software for the purposes of the SHA-3 evaluation process,
+ * notwithstanding that this software is copyrighted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#if defined(__AVX2__)
+
+#include <immintrin.h>
+#include "algo/sha/sha3-defs.h"
+#include "avxdefs.h"
+
+/* The length of digests*/
+#define DIGEST_BIT_LEN_224 224
+#define DIGEST_BIT_LEN_256 256
+#define DIGEST_BIT_LEN_384 384
+#define DIGEST_BIT_LEN_512 512
+
+/*********************************/
+/* The parameters of Luffa       */
+#define MSG_BLOCK_BIT_LEN 256  /*The bit length of a message block*/
+#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length
+                                                     * of a message block*/
+
+/* The number of blocks in Luffa */
+#define WIDTH_224 3
+#define WIDTH_256 3
+#define WIDTH_384 4
+#define WIDTH_512 5
+
+/* The limit of the length of message */
+#define LIMIT_224 64
+#define LIMIT_256 64
+#define LIMIT_384 128
+#define LIMIT_512 128
+/*********************************/
+
+typedef struct {   /* state shared by the luffa_2way_* functions */
+    uint32 buffer[8*2] __attribute((aligned(64)));   /* partial block saved by update for close */
+    __m256i chainv[10] __attribute((aligned(32)));   /* Chaining values */
+    int hashbitlen;   /* digest size in bits, as passed to luffa_2way_init */
+    int rembytes;     /* len & 0x1F from the most recent update / update_close */
+} luffa_2way_context;
+/* Call init, then either update (repeatable) followed by close, or one update_close. All return 0. */
+int luffa_2way_init( luffa_2way_context *state, int hashbitlen );
+int luffa_2way_update( luffa_2way_context *state, const void *data,
+                       size_t len );
+int luffa_2way_close( luffa_2way_context *state, void *hashval );
+int luffa_2way_update_close( luffa_2way_context *state, void *output,
+                                   const void *data, size_t inlen );
+
+#endif
+#endif
--- a/algo/luffa/luffa.c
+++ b/algo/luffa/luffa.c
@@ -1,4 +1,3 @@
-#include "miner.h"
 #include "algo-gate-api.h"

 #include <stdlib.h>
--- a/algo/luffa/sse2/luffa_for_sse2.c
+++ b/algo/luffa/sse2/luffa_for_sse2.c
@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
+                      mm_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    if ( state->rembytes  )
    {
      // remaining data bytes
-      casti_m128i( state->buffer, 0 ) = mm_byteswap_epi32( cast_m128i( data ) );
+      casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
      // padding of partial block
      casti_m128i( state->buffer, 1 ) =
            _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_byteswap_epi32( casti_m128i( data, 1 ) ),
-                      mm_byteswap_epi32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
+                      mm_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    {
      // padding of partial block
      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
-                      mm_byteswap_epi32( cast_m128i( data ) ) );
+                      mm_bswap_32( cast_m128i( data ) ) );
    }
    else
    {
@@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 0 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );

    rnd512( state, zero, zero );

@@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b )

    _mm256_store_si256( (__m256i*)hash, t );

-    casti_m256i( b, 1 ) = mm256_byteswap_epi32( casti_m256i( hash, 0 ) );
+    casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) );
 }

 #else
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 0 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 1 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );

    rnd512( state, zero, zero );

@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 2 ) = mm_byteswap_epi32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 3 ) = mm_byteswap_epi32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
 }
 #endif

--- a/algo/luffa/sse2/luffa_for_sse2.h
+++ b/algo/luffa/sse2/luffa_for_sse2.h
--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -0,0 +1,138 @@
+#include "allium-gate.h"
+#include <memory.h>
+#include <mm_malloc.h>
+
+#if defined (ALLIUM_4WAY)	
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/groestl/aes_ni/hash-groestl256.h"
+
+typedef struct {   // one context per hash stage used by allium_4way_hash()
+   blake256_4way_context     blake;    // primed with the first 64 header bytes in scanhash_allium_4way()
+   keccak256_4way_context    keccak;
+   cubehashParam             cube;     // used once per lane, reinitialised between lanes
+   skein256_4way_context     skein;
+   hashState_groestl256      groestl;  // used once per lane, restored from the thread template between lanes
+   // blake is the only member that init_allium_4way_ctx() does not set up.
+} allium_4way_ctx_holder;
+
+static __thread allium_4way_ctx_holder allium_4way_ctx;   // per-thread template; allium_4way_hash() works on a copy of it
+// Prepare this thread's keccak, cubehash, skein and groestl contexts. Always returns true.
+bool init_allium_4way_ctx()   // installed as gate->miner_thread_init by register_allium_algo()
+{
+   keccak256_4way_init( &allium_4way_ctx.keccak );
+   cubehashInit( &allium_4way_ctx.cube, 256, 16, 32 );
+   skein256_4way_init( &allium_4way_ctx.skein );
+   init_groestl256( &allium_4way_ctx.groestl, 32 );
+   return true;   // blake is left alone here; scanhash_allium_4way() initialises it for each scan
+}
+
+void allium_4way_hash( void *state, const void *input )   // in: 4 interleaved headers; out: 4 x 32-byte hashes, lane 0 first
+{
+   uint32_t hash0[8] __attribute__ ((aligned (64)));       // hash0..hash3: one 256-bit working hash per lane
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   uint32_t vhash32[8*4] __attribute__ ((aligned (64)));   // receives the 4-way blake output
+   uint32_t vhash64[8*4] __attribute__ ((aligned (64)));   // buffer for the 4x64 keccak and skein stages
+   allium_4way_ctx_holder ctx __attribute__ ((aligned (64)));
+   // Stage 1, blake: work on a private copy of the thread template, which already holds the blake midstate.
+   memcpy( &ctx, &allium_4way_ctx, sizeof(allium_4way_ctx) );
+   blake256_4way( &ctx.blake, input + (64<<2), 16 );   // 16 more bytes per lane; the first 64 per lane were absorbed in scanhash_allium_4way()
+   blake256_4way_close( &ctx.blake, vhash32 );
+   // Stage 2, keccak: reinterleave vhash32 into vhash64, hash 4-way, then split into one buffer per lane.
+   mm256_reinterleave_4x64( vhash64, vhash32, 256 );
+   keccak256_4way( &ctx.keccak, vhash64, 32 );
+   keccak256_4way_close( &ctx.keccak, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+   // Stage 3, Lyra2: each lane in place (same buffer is output, password and salt); timeCost 1, 8 rows, 8 columns.
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   // Stage 4, cubehash: each lane in place, reinitialising the single context between lanes.
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*)hash0, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*)hash1, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash2, (const byte*)hash2, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*)hash3, (const byte*)hash3, 32 );
+   // Stage 5, Lyra2 again with the same parameters as stage 3.
+   LYRA2RE( hash0, 32, hash0, 32, hash0, 32, 1, 8, 8 );
+   LYRA2RE( hash1, 32, hash1, 32, hash1, 32, 1, 8, 8 );
+   LYRA2RE( hash2, 32, hash2, 32, hash2, 32, 1, 8, 8 );
+   LYRA2RE( hash3, 32, hash3, 32, hash3, 32, 1, 8, 8 );
+   // Stage 6, skein: interleave the lanes into vhash64, hash 4-way, then split again.
+   mm256_interleave_4x64( vhash64, hash0, hash1, hash2, hash3, 256 );
+   skein256_4way( &ctx.skein, vhash64, 32 );
+   skein256_4way_close( &ctx.skein, vhash64 );
+   mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash64, 256 );
+   // Stage 7, groestl: each lane in place; the context is restored from the thread template between lanes.
+   update_and_final_groestl256( &ctx.groestl, hash0, hash0, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash1, hash1, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash2, hash2, 256 );
+   memcpy( &ctx.groestl, &allium_4way_ctx.groestl,
+           sizeof(hashState_groestl256) );
+   update_and_final_groestl256( &ctx.groestl, hash3, hash3, 256 );
+   // Output: the four 32-byte results back to back.
+   memcpy( state,    hash0, 32 );
+   memcpy( state+32, hash1, 32 );
+   memcpy( state+64, hash2, 32 );
+   memcpy( state+96, hash3, 32 );
+}
+
+int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done )   // scans 4 nonces per pass; returns how many were stored in work->nonces
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));    // four 8-word hashes, lane i at hash + 8*i
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));  // four copies of the 20-word header, interleaved
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t Htarg = ptarget[7];
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76; // 19*4: header word 19 (the nonce) of lane 0; lanes 1..3 follow
+   // NOTE(review): Htarg was read above, before this benchmark override of ptarget[7] -- confirm that is intended.
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+   // Build the interleaved header and absorb its first 64 bytes per lane into the thread's blake context.
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   blake256_4way_init( &allium_4way_ctx.blake );
+   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );
+   // Each pass hashes nonces n..n+3, one per lane.
+   do {
+     be32enc( noncep,   n   );
+     be32enc( noncep+1, n+1 );
+     be32enc( noncep+2, n+2 );
+     be32enc( noncep+3, n+3 );
+
+     allium_4way_hash( hash, vdata );
+     pdata[19] = n;   // record scan progress in the work data
+     // Per lane: compare the top hash word with Htarg first, then run fulltest().
+     for ( int i = 0; i < 4; i++ )
+     if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+     {
+         nonces[ num_found++ ] = n+i;
+         work_set_target_ratio( work, hash+(i<<3) );
+     }
+     n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);   // NOTE(review): max_nonce-4 wraps if max_nonce < 4 -- confirm callers never pass that
+   // NOTE(review): n already points past the last hashed nonce, so the +1 looks like an overcount -- confirm.
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/lyra2/allium-gate.c
+++ b/algo/lyra2/allium-gate.c
@@ -0,0 +1,22 @@
+#include "allium-gate.h"
+
+int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }   // constant getter installed as gate->get_max64 by register_allium_algo()
+
+bool register_allium_algo( algo_gate_t* gate )   // fills gate with the allium entry points; always returns true
+{
+#if defined (ALLIUM_4WAY)
+  gate->miner_thread_init = (void*)&init_allium_4way_ctx;   // 4-way build (AVX2 + AES, see allium-gate.h)
+  gate->scanhash  = (void*)&scanhash_allium_4way;
+  gate->hash      = (void*)&allium_4way_hash;
+#else
+  gate->miner_thread_init = (void*)&init_allium_ctx;        // scalar fallback
+  gate->scanhash  = (void*)&scanhash_allium;
+  gate->hash      = (void*)&allium_hash;
+#endif
+  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->set_target        = (void*)&alt_set_target;
+  gate->get_max64         = (void*)&get_max64_0xFFFFLL;
+  return true;
+}
+
+
--- a/algo/lyra2/allium-gate.h
+++ b/algo/lyra2/allium-gate.h
@@ -0,0 +1,29 @@
+#ifndef ALLIUM_GATE_H__
+#define ALLIUM_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+#include "lyra2.h"
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define ALLIUM_4WAY
+#endif
+
+bool register_allium_algo( algo_gate_t* gate );
+
+#if defined(ALLIUM_4WAY)
+
+void allium_4way_hash( void *state, const void *input );
+int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+bool init_allium_4way_ctx();
+
+#endif
+
+void allium_hash( void *state, const void *input );
+int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+bool init_allium_ctx();
+
+#endif
+
--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -0,0 +1,112 @@
+#include "allium-gate.h"
+#include <memory.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/keccak/sph_keccak.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#if defined(__AES__)
+#include "algo/groestl/aes_ni/hash-groestl256.h"
+#else
+#include "algo/groestl/sph_groestl.h"
+#endif
+#include "lyra2.h"
+
+typedef struct {   // one context per hash stage used by allium_hash()
+        sph_blake256_context     blake;    // primed with the first 64 header bytes in scanhash_allium()
+        sph_keccak256_context    keccak;
+        cubehashParam            cube;
+        sph_skein256_context     skein;
+#if defined (__AES__)
+        hashState_groestl256     groestl;  // AES-NI groestl when the build has __AES__
+#else
+        sph_groestl256_context   groestl;  // sph groestl otherwise
+#endif
+} allium_ctx_holder;
+
+static __thread allium_ctx_holder allium_ctx;   // per-thread template; allium_hash() works on a copy of it
+// Prepare this thread's keccak, cubehash, skein and groestl contexts. Always returns true.
+bool init_allium_ctx()   // installed as gate->miner_thread_init by register_allium_algo() in non-4-way builds
+{
+        sph_keccak256_init( &allium_ctx.keccak );
+        cubehashInit( &allium_ctx.cube, 256, 16, 32 );
+        sph_skein256_init( &allium_ctx.skein );
+#if defined (__AES__)
+        init_groestl256( &allium_ctx.groestl, 32 );
+#else
+        sph_groestl256_init( &allium_ctx.groestl );
+#endif
+        return true;   // blake is left alone here; scanhash_allium() initialises it for each scan
+}
+
+void allium_hash(void *state, const void *input)   // in: 80-byte header; out: 32-byte hash written to state
+{
+    uint32_t hash[8] __attribute__ ((aligned (64)));   // 256-bit working hash, updated in place by every stage
+    allium_ctx_holder ctx __attribute__ ((aligned (32)));
+    // Stage 1, blake: start from a private copy of the thread template, which already holds the blake midstate.
+    memcpy( &ctx, &allium_ctx, sizeof(allium_ctx) );
+    sph_blake256( &ctx.blake, input + 64, 16 );   // last 16 bytes; the first 64 were absorbed in scanhash_allium()
+    sph_blake256_close( &ctx.blake, hash );
+    // Stage 2, keccak.
+    sph_keccak256( &ctx.keccak, hash, 32 );
+    sph_keccak256_close( &ctx.keccak, hash );
+    // Stage 3, Lyra2 in place (same buffer is output, password and salt); timeCost 1, 8 rows, 8 columns.
+    LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
+    // Stage 4, cubehash in place.
+    cubehashUpdateDigest( &ctx.cube, (byte*)hash, (const byte*)hash, 32 );
+    // Stage 5, Lyra2 again with the same parameters.
+    LYRA2RE( hash, 32, hash, 32, hash, 32, 1, 8, 8 );
+    // Stage 6, skein.
+    sph_skein256( &ctx.skein, hash, 32 );
+    sph_skein256_close( &ctx.skein, hash );
+    // Stage 7, groestl: AES-NI implementation when available, sph otherwise.
+#if defined (__AES__)
+   update_and_final_groestl256( &ctx.groestl, hash, hash, 256 );
+#else
+   sph_groestl256( &ctx.groestl, hash, 32 );
+   sph_groestl256_close( &ctx.groestl, hash );
+#endif
+
+    memcpy(state, hash, 32);
+}
+
+int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done )   // scans one nonce per pass; returns 1 on a hit, 0 otherwise
+{
+    uint32_t _ALIGN(128) hash[8];
+    uint32_t _ALIGN(128) endiandata[20];   // header passed to allium_hash(); word 19 is the nonce
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[19];
+    uint32_t nonce = first_nonce;
+    // NOTE(review): Htarg was read above, before this benchmark override of ptarget[7] -- confirm that is intended.
+    if ( opt_benchmark )
+        ptarget[7] = 0x3ffff;
+    // Only words 0..18 are converted here; word 19 is written inside the loop.
+    for ( int i = 0; i < 19; i++ )
+        be32enc( &endiandata[i], pdata[i] );
+    // Absorb the first 64 header bytes once; allium_hash() continues from this midstate.
+    sph_blake256_init( &allium_ctx.blake );
+    sph_blake256( &allium_ctx.blake, endiandata, 64 );
+
+    do {
+        be32enc( &endiandata[19], nonce );
+        allium_hash( hash, endiandata );
+
+        if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )   // top-word comparison first, then fulltest()
+        {
+            work_set_target_ratio( work, hash );
+            pdata[19] = nonce;
+            *hashes_done = pdata[19] - first_nonce;   // NOTE(review): no +1 here although this nonce was hashed; the exit path below adds 1 -- confirm
+            return 1;
+        }
+        nonce++;
+
+    } while (nonce < max_nonce && !work_restart[thr_id].restart);
+    // No hit: store the next untested nonce as scan progress.
+    pdata[19] = nonce;
+    *hashes_done = pdata[19] - first_nonce + 1;
+    return 0;
+}
+
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -47,8 +47,9 @@
 */

 int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
-               uint64_t pwdlen, const void *salt, uint64_t saltlen,
-               uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
+               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+               const uint64_t timeCost, const uint64_t nRows,
+               const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -73,6 +74,8 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
   uint64_t *ptrWord = wholeMatrix;

+//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
   //=== Getting the password + salt + basil padded with 10*1 ==========//
   //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
   //but this ensures that the password copied locally will be overwritten as soon as possible
@@ -209,8 +212,9 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 }

 int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
-            uint64_t pwdlen, const void *salt, uint64_t saltlen,
-            uint64_t timeCost, uint64_t nRows, uint64_t nCols )
+            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+            const uint64_t timeCost, const uint64_t nRows,
+            const uint64_t nCols )
 {
    //========================== Basic variables ============================//
    uint64_t _ALIGN(256) state[16];
@@ -230,6 +234,8 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
    const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
 //    const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;

+//    memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
    //==== Getting the password + salt + basil padded with 10*1 ============//
    //OBS.:The memory matrix will temporarily hold the password: not for saving memory,
    //but this ensures that the password copied locally will be overwritten as soon as possible
@@ -347,9 +353,9 @@ int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
 }

 // Lyra2RE doesn't like the new wholeMatrix implementation
-int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
-             uint64_t pwdlen, const void *salt, uint64_t saltlen,
-             uint64_t timeCost, const uint64_t nRows, const uint64_t nCols )
+int LYRA2RE( void *K, uint64_t kLen, const void *pwd, const uint64_t pwdlen,
+             const void *salt, const uint64_t saltlen, const uint64_t timeCost,
+             const uint64_t nRows, const uint64_t nCols )
 {
   //====================== Basic variables ============================//
   uint64_t _ALIGN(256) state[16];
@@ -378,12 +384,12 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
   if (wholeMatrix == NULL)
      return -1;

-#if defined (__AVX2__)
-   memset_zero_m256i( (__m256i*)wholeMatrix, i/32 );
+#if defined(__AVX2__)
+   memset_zero_256( (__m256i*)wholeMatrix, i>>5 );
 #elif defined(__AVX__)
-   memset_zero_m128i( (__m128i*)wholeMatrix, i/16 );
+   memset_zero_128( (__m128i*)wholeMatrix, i>>4 );   
 #else
-   memset(wholeMatrix, 0, i);
+   memset( wholeMatrix, 0, i );
 #endif

   uint64_t *ptrWord = wholeMatrix;
@@ -406,8 +412,8 @@ int LYRA2RE( void *K, uint64_t kLen, const void *pwd,
   memcpy(ptrByte, salt, saltlen);
   ptrByte += saltlen;

-   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
-                       - (saltlen + pwdlen) );
+//   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+//                       - (saltlen + pwdlen) );

   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
   memcpy(ptrByte, &kLen, sizeof(int64_t));
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -21,8 +21,9 @@
 #define LYRA2_H_

 #include <stdint.h>
+#include "algo/sha/sha3-defs.h"

-typedef unsigned char byte;
+//typedef unsigned char byte;

 //Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
 #define BLOCK_LEN_BLAKE2_SAFE_INT64 8                                   //512 bits (=64 bytes, =8 uint64_t)
@@ -53,4 +54,6 @@ int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
            uint64_t pwdlen, const void *salt, uint64_t saltlen,
            uint64_t timeCost, uint64_t nRows, uint64_t nCols );

+int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
+
 #endif /* LYRA2_H_ */
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -0,0 +1,101 @@
+#include "lyra2h-gate.h"
+
+#ifdef LYRA2H_4WAY
+
+#include <memory.h>
+#include <mm_malloc.h>
+#include "lyra2.h"
+#include "algo/blake/sph_blake.h"
+#include "algo/blake/blake-hash-4way.h"
+
+__thread uint64_t* lyra2h_4way_matrix;   // per-thread buffer passed as wholeMatrix to LYRA2Z() by lyra2h_4way_hash()
+// Allocate this thread's matrix: LYRA2H_MATRIX_SIZE bytes, 64-byte aligned. No matching _mm_free appears in this file.
+bool lyra2h_4way_thread_init()
+{
+ return ( lyra2h_4way_matrix = _mm_malloc( LYRA2H_MATRIX_SIZE, 64 ) );   // pointer converts to bool: false when allocation fails
+}
+
+static __thread blake256_4way_context l2h_4way_blake_mid;   // per-thread blake state after the first 64 header bytes
+// Reset the midstate context and absorb the first 64 bytes per lane of the interleaved header.
+void lyra2h_4way_midstate( const void* input )
+{
+       blake256_4way_init( &l2h_4way_blake_mid );
+       blake256_4way( &l2h_4way_blake_mid, input, 64 );
+}
+
+void lyra2h_4way_hash( void *state, const void *input )   // in: 4 interleaved headers; out: 4 x 32-byte hashes, lane 0 first
+{
+     uint32_t hash0[8] __attribute__ ((aligned (64)));   // hash0..hash3: one 256-bit hash per lane
+     uint32_t hash1[8] __attribute__ ((aligned (64)));
+     uint32_t hash2[8] __attribute__ ((aligned (64)));
+     uint32_t hash3[8] __attribute__ ((aligned (64)));
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));   // interleaved output of the 4-way blake stage
+     blake256_4way_context ctx_blake __attribute__ ((aligned (64)));
+     // Resume blake from the midstate saved by lyra2h_4way_midstate() and absorb the last 16 bytes per lane.
+     memcpy( &ctx_blake, &l2h_4way_blake_mid, sizeof l2h_4way_blake_mid );
+     blake256_4way( &ctx_blake, input + (64*4), 16 );
+     blake256_4way_close( &ctx_blake, vhash );
+     // Split the interleaved result into one buffer per lane.
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     // Lyra2 on each lane in place, sharing the thread's matrix; timeCost 16, 16 rows, 16 columns.
+     LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, hash2, 32, hash2, 32, hash2, 32, 16, 16, 16 );
+     LYRA2Z( lyra2h_4way_matrix, hash3, 32, hash3, 32, hash3, 32, 16, 16, 16 );
+     // Output: the four 32-byte results back to back.
+     memcpy( state,    hash0, 32 );
+     memcpy( state+32, hash1, 32 );
+     memcpy( state+64, hash2, 32 );
+     memcpy( state+96, hash3, 32 );
+}
+
+int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )   // scans 4 nonces per pass; returns how many were stored in work->nonces
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));    // four 8-word hashes, lane i at hash + 8*i
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));  // four copies of the 20-word header, interleaved
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep= vdata + 76; // 19*4: header word 19 (the nonce) of lane 0; lanes 1..3 follow
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0000ff;
+   // Words 0..18 only: the nonce slots of vdata are rewritten on every pass below.
+   for ( int i=0; i < 19; i++ )
+      be32enc( &edata[i], pdata[i] );
+
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   // Absorb the first 64 header bytes per lane once; lyra2h_4way_hash() resumes from this midstate.
+   lyra2h_4way_midstate( vdata );
+   // Each pass hashes nonces n..n+3, one per lane.
+   do {
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );
+
+      pdata[19] = n;   // record scan progress in the work data, as scanhash_allium_4way() does (replaces a dead store to edata[19])
+      lyra2h_4way_hash( hash, vdata );
+      // Per lane: compare the top hash word with Htarg first, then run fulltest().
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      {
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
--- a/algo/lyra2/lyra2h-gate.c
+++ b/algo/lyra2/lyra2h-gate.c
@@ -0,0 +1,25 @@
+#include "lyra2h-gate.h"
+#include "lyra2.h"
+
+void lyra2h_set_target( struct work* work, double job_diff )   // installed as gate->set_target by register_lyra2h_algo()
+{
+ work_set_target( work, job_diff / (256.0 * opt_diff_factor) );   // scale the job difficulty down by 256 * opt_diff_factor
+}
+
+bool register_lyra2h_algo( algo_gate_t* gate )   // fills gate with the lyra2h entry points; always returns true
+{
+#ifdef LYRA2H_4WAY
+  gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;   // 4-way build
+  gate->scanhash   = (void*)&scanhash_lyra2h_4way;
+  gate->hash       = (void*)&lyra2h_4way_hash;
+#else
+  gate->miner_thread_init = (void*)&lyra2h_thread_init;        // scalar fallback
+  gate->scanhash   = (void*)&scanhash_lyra2h;
+  gate->hash       = (void*)&lyra2h_hash;
+#endif
+  gate->optimizations = AVX_OPT | AVX2_OPT;
+  gate->get_max64  = (void*)&get_max64_0xffffLL;
+  gate->set_target = (void*)&lyra2h_set_target;
+  return true;
+}
+
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	3c02653dbe	v3.8.3	2018-02-23 12:39:15 -05:00
Jay D Dee	502ed0b1fe	v3.8.2.1	2018-02-17 13:52:24 -05:00
Jay D Dee	d60a268972	v3.8.2	2018-02-15 14:48:50 -05:00
Jay D Dee	e4265a6f11	v3.8.1.1	2018-02-09 23:30:14 -05:00
Jay D Dee	a28daca3ce	v3.8.1	2018-02-07 16:38:45 -05:00
Jay D Dee	54b8fd7362	v3.8.0.1	2018-02-05 22:10:18 -05:00
Jay D Dee	ad2275f74a	v3.8.0	2018-01-23 21:02:16 -05:00
Jay D Dee	a90d75b8f5	v3.7.10	2018-01-16 15:11:44 -05:00
Jay D Dee	bee78eac76	v3.7.9	2018-01-08 22:04:43 -05:00
Jay D Dee	2d2e54f001	v3.7.8	2017-12-30 19:19:46 -05:00
Jay D Dee	79164c24b5	v3.7.7	2017-12-17 12:00:42 -05:00
Jay D Dee	7a1389998b	v3.7.6	2017-12-14 18:28:51 -05:00
Jay D Dee	af1c940919	v3.7.5	2017-12-08 15:39:28 -05:00
Jay D Dee	4b57ac0eb9	v3.7.4	2017-11-28 16:32:04 -05:00
Jay D Dee	6d1361c87f	v3.7.3	2017-11-20 21:19:15 -05:00
Jay D Dee	ab39e88318	v3.7.2	2017-11-01 11:03:23 -04:00
Jay D Dee	8ff52e7ad6	v3.7.1	2017-10-31 00:25:24 -04:00
Jay D Dee	aaa48599ad	v3.7.0	2017-10-17 11:38:59 -04:00
Jay D Dee	c76574b2cd	v3.6.11	2017-10-12 15:14:37 -04:00
Jay D Dee	989fb42d20	v3.6.10	2017-10-12 11:49:40 -04:00
Jay D Dee	710c852f05	v3.6.9	2017-10-09 21:45:27 -04:00
Jay D Dee	39f089d3dc	v3.6.8	2017-07-31 20:02:45 -04:00
Jay D Dee	ec4f6028a2	v3.6.7	2017-07-24 21:38:32 -04:00
Jay D Dee	f8907677f6	v3.6.6	2017-07-01 14:37:11 -04:00
Jay D Dee	7544cb956c	v3.6.5	2017-05-19 16:38:26 -04:00
Jay D Dee	e7dbd27636	v3.6.4	2017-05-02 10:28:19 -04:00