v3.8.7.2

v3.8.7.1
v3.8.7
2025-09-17 23:44:27 +00:00 · 2018-04-11 13:44:26 -04:00 · 2018-04-10 21:49:06 -04:00 · 2018-04-09 19:14:38 -04:00 · 2018-04-06 11:42:01 -04:00 · 2018-03-31 12:50:52 -04:00
305 changed files with 22160 additions and 8038 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -22,7 +22,6 @@ cpuminer_SOURCES = \
  api.c \
  sysinfos.c \
  algo-gate-api.c\
-  crypto/blake2s.c \
  crypto/oaes_lib.c \
  crypto/c_keccak.c \
  crypto/c_groestl.c \
@@ -32,12 +31,19 @@ cpuminer_SOURCES = \
  crypto/hash.c \
  crypto/aesb.c \
  crypto/magimath.cpp \
-  algo/argon2/argon2a.c \
-  algo/argon2/ar2/argon2.c \
-  algo/argon2/ar2/opt.c \
-  algo/argon2/ar2/cores.c \
-  algo/argon2/ar2/ar2-scrypt-jane.c \
-  algo/argon2/ar2/blake2b.c \
+  algo/argon2/argon2a/argon2a.c \
+  algo/argon2/argon2a/ar2/argon2.c \
+  algo/argon2/argon2a/ar2/opt.c \
+  algo/argon2/argon2a/ar2/cores.c \
+  algo/argon2/argon2a/ar2/ar2-scrypt-jane.c \
+  algo/argon2/argon2a/ar2/blake2b.c \
+  algo/argon2/argon2d/argon2d-gate.c \
+  algo/argon2/argon2d/blake2/blake2b.c \
+  algo/argon2/argon2d/argon2d/argon2.c \
+  algo/argon2/argon2d/argon2d/core.c \
+  algo/argon2/argon2d/argon2d/opt.c \
+  algo/argon2/argon2d/argon2d/thread.c \
+  algo/argon2/argon2d/argon2d/encoding.c \
  algo/blake/sph_blake.c \
  algo/blake/blake-hash-4way.c \
  algo/blake/blake-gate.c \
@@ -45,9 +51,15 @@ cpuminer_SOURCES = \
  algo/blake/blake-4way.c \
  algo/blake/sph_blake2b.c \
  algo/blake/blake2b.c \
+  algo/blake/sph-blake2s.c \
+  algo/blake/blake2s-hash-4way.c \
  algo/blake/blake2s.c \
+  algo/blake/blake2s-gate.c \
+  algo/blake/blake2s-4way.c \
+  algo/blake/blakecoin-gate.c \
  algo/blake/mod_blakecoin.c \
  algo/blake/blakecoin.c \
+  algo/blake/blakecoin-4way.c \
  algo/blake/decred-gate.c \
  algo/blake/decred.c \
  algo/blake/decred-4way.c \
@@ -63,17 +75,22 @@ cpuminer_SOURCES = \
  algo/cryptonight/cryptonight.c\
  algo/cubehash/sph_cubehash.c \
  algo/cubehash/sse2/cubehash_sse2.c\
+  algo/cubehash/cube-hash-2way.c \
  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
  algo/gost/sph_gost.c \
  algo/groestl/sph_groestl.c \
  algo/groestl/groestl.c \
+  algo/groestl/myrgr-gate.c \
+  algo/groestl/myrgr-4way.c \
  algo/groestl/myr-groestl.c \
  algo/groestl/aes_ni/hash-groestl.c \
  algo/groestl/aes_ni/hash-groestl256.c \
  algo/fugue/sph_fugue.c \
  algo/hamsi/sph_hamsi.c \
-  algo/haval/haval.c\
+  algo/hamsi/hamsi-hash-4way.c \
+  algo/haval/haval.c \
+  algo/haval/haval-hash-4way.c \
  algo/heavy/sph_hefty1.c \
  algo/heavy/heavy.c \
  algo/heavy/bastion.c \
@@ -93,19 +110,26 @@ cpuminer_SOURCES = \
  algo/keccak/keccak-4way.c\
  algo/keccak/keccak-gate.c \
  algo/keccak/sse2/keccak.c \
-  algo/lbry.c \
  algo/luffa/sph_luffa.c \
  algo/luffa/luffa.c \
-  algo/luffa/sse2/luffa_for_sse2.c \
+  algo/luffa/luffa_for_sse2.c \
+  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
+  algo/lyra2/lyra2rev2-gate.c \
  algo/lyra2/lyra2rev2.c \
+  algo/lyra2/lyra2rev2-4way.c \
  algo/lyra2/lyra2re.c \
  algo/lyra2/lyra2z-gate.c \
  algo/lyra2/lyra2z.c \
  algo/lyra2/lyra2z-4way.c \
  algo/lyra2/lyra2z330.c \
+  algo/lyra2/lyra2h-gate.c \
  algo/lyra2/lyra2h.c \
+  algo/lyra2/lyra2h-4way.c \
+  algo/lyra2/allium-gate.c \
+  algo/lyra2/allium-4way.c \
+  algo/lyra2/allium.c \
  algo/m7m.c \
  algo/neoscrypt/neoscrypt.c \
  algo/nist5/nist5-gate.c \
@@ -113,15 +137,31 @@ cpuminer_SOURCES = \
  algo/nist5/nist5.c \
  algo/nist5/zr5.c \
  algo/pluck.c \
+  algo/quark/quark-gate.c \
  algo/quark/quark.c \
+  algo/quark/quark-4way.c \
+  algo/quark/anime-gate.c \
+  algo/quark/anime.c \
+  algo/quark/anime-4way.c \
+  algo/qubit/qubit-gate.c \
  algo/qubit/qubit.c \
+  algo/qubit/qubit-2way.c \
+  algo/qubit/deep-gate.c \
+  algo/qubit/deep-2way.c \
  algo/qubit/deep.c \
  algo/ripemd/sph_ripemd.c \
+  algo/ripemd/ripemd-hash-4way.c \
+  algo/ripemd/lbry-gate.c \
+  algo/ripemd/lbry.c \
+  algo/ripemd/lbry-4way.c \
  algo/scrypt.c \
  algo/scryptjane/scrypt-jane.c \
  algo/sha/sph_sha2.c \
  algo/sha/sph_sha2big.c \
+  algo/sha/sha2-hash-4way.c \
  algo/sha/sha2.c \
+  algo/sha/sha256t-gate.c \
+  algo/sha/sha256t-4way.c \
  algo/sha/sha256t.c \
  algo/shabal/sph_shabal.c \
  algo/shabal/shabal-hash-4way.c \
@@ -129,8 +169,9 @@ cpuminer_SOURCES = \
  algo/shavite/sph-shavite-aesni.c \
  algo/shavite/shavite.c \
  algo/simd/sph_simd.c \
-  algo/simd/sse2/nist.c \
-  algo/simd/sse2/vector.c \
+  algo/simd/nist.c \
+  algo/simd/vector.c \
+  algo/simd/simd-hash-2way.c \
  algo/skein/sph_skein.c \
  algo/skein/skein-hash-4way.c \
  algo/skein/skein.c \
@@ -140,9 +181,8 @@ cpuminer_SOURCES = \
  algo/skein/skein2-4way.c \
  algo/skein/skein2-gate.c \
  algo/sm3/sm3.c \
+  algo/sm3/sm3-hash-4way.c \
  algo/tiger/sph_tiger.c \
-  algo/timetravel.c \
-  algo/timetravel10.c \
  algo/whirlpool/sph_whirlpool.c \
  algo/whirlpool/whirlpool-hash-4way.c \
  algo/whirlpool/whirlpool-gate.c \
@@ -161,8 +201,19 @@ cpuminer_SOURCES = \
  algo/x11/tribus-gate.c \
  algo/x11/tribus.c \
  algo/x11/tribus-4way.c \
+  algo/x11/timetravel-gate.c \
+  algo/x11/timetravel.c \
+  algo/x11/timetravel-4way.c \
+  algo/x11/timetravel10-gate.c \
+  algo/x11/timetravel10.c \
+  algo/x11/timetravel10-4way.c \
  algo/x11/fresh.c \
  algo/x11/x11evo.c \
+  algo/x11/x11evo-4way.c \
+  algo/x11/x11evo-gate.c \
+  algo/x12/x12-gate.c \
+  algo/x12/x12.c \
+  algo/x12/x12-4way.c \
  algo/x13/x13-gate.c \
  algo/x13/x13.c \
  algo/x13/x13-4way.c \
@@ -195,10 +246,13 @@ cpuminer_SOURCES = \
  algo/x17/xevan-gate.c \
  algo/x17/xevan.c \
  algo/x17/xevan-4way.c \
+  algo/x17/x16r-gate.c \
+  algo/x17/x16r.c \
+  algo/x17/x16r-4way.c \
  algo/x17/hmq1725.c \
  algo/yescrypt/yescrypt.c \
  algo/yescrypt/sha256_Y.c \
-  algo/yescrypt/yescrypt-simd.c
+  algo/yescrypt/yescrypt-best.c

 disable_flags =

--- a/README.md
+++ b/README.md
@@ -7,16 +7,50 @@ All of the code is believed to be open and free. If anyone has a
 claim to any of it post your case in the cpuminer-opt Bitcoin Talk forum
 or by email.

+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive, they are flagged simply because they are cryptocurrency 
+miners. The source code is open for anyone to inspect. If you don't trust 
+the software, don't use it.
+
 https://bitcointalk.org/index.php?topic=1326803.0

 mailto://jayddee246@gmail.com

 See file RELEASE_NOTES for change log and compile instructions.

+Requirements
+------------
+
+1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
+Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
+optimizations a CPU with AES_NI is required. This includes Intel Westmere
+and newer and AMD equivalents. Further optimizations are available on some
+algorithms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
+
+Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
+performance.
+
+ARM CPUs are not supported.
+
+2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
+Centos, are known to work and have all dependencies in their repositories.
+Others may work but may require more effort. Older versions such as Centos 6
+don't work due to missing features. 
+64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
+
+MacOS, OSx and Android are not supported.
+
+3. Stratum pool. Some algos may work with wallet mining using getwork or GBT. YMMV.
+
 Supported Algorithms
 --------------------

-                          argon2
+                          allium       Garlicoin
+                          anime        Animecoin
+                          argon2       Argon2 coin (AR2)
+                          argon2d250   argon2d-crds, Credits (CRDS)
+                          argon2d500   argon2d-dyn,  Dynamic (DYN)
+                          argon2d4096  argon2d-uis,  Unitus (UIS)
                          axiom        Shabal-256 MemoHash
                          bastion
                          blake        Blake-256 (SFR)
@@ -74,43 +108,26 @@ Supported Algorithms
                          x11          Dash
                          x11evo       Revolvercoin
                          x11gost      sib (SibCoin)
+                          x12          Galaxie Cash (GCH)
                          x13          X13
                          x13sm3       hsr (Hshare)
                          x14          X14
                          x15          X15
+                          x16r         Ravencoin (RVN)
+                          x16s         Pigeoncoin (PGN)
                          x17
-                          xevan        Bitsend
+                          xevan        Bitsend (BSD)
                          yescrypt     Globalboost-Y (BSTY)
-                          yescryptr8   BitZeny (ZNY)\n\
+                          yescryptr8   BitZeny (ZNY)
                          yescryptr16  Yenten (YTN)
+                          yescryptr32  WAVI
                          zr5          Ziftr

-Requirements
------------
-
-1. A x86_64 architecture CPU with a minimum of SSE2 support. This includes
-Intel Core2 and newer and AMD equivalents. In order to take advantage of AES_NI
-optimizations a CPU with AES_NI is required. This includes Intel Westbridge
-and newer and AMD equivalents. Further optimizations are available on some
-algoritms for CPUs with AVX and AVX2, Sandybridge and Haswell respectively.
-
-Older CPUs are supported by cpuminer-multi by TPruvot but at reduced
-performance.
-
-ARM CPUs are not supported.
-
-2. 64 bit Linux OS. Ubuntu and Fedora based distributions, including Mint and
-Centos are known to work and have all dependencies in their repositories.
-Others may work but may require more effort.
-64 bit Windows OS is supported with mingw_w64 and msys or pre-built binaries.
-
-MacOS, OSx is not supported.
-
-3. Stratum pool. Some algos may work wallet mining using getwork.
-
 Errata
 ------

+Neoscrypt crashes on Windows, use legacy version.
+
 AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
@@ -136,10 +153,13 @@ output from the miner showing the startup and any errors.
 Donations
 ---------

-I do not do this for money but I have a donation address if users
-are so inclined.
+cpuminer-opt has no fees of any kind but donations are accepted.

-bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations
+ BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
+ ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
+ LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
+ BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
+ BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

 Happy mining!

--- a/README.txt
+++ b/README.txt
@@ -4,6 +4,11 @@ for Linux and Windows can be found in RELEASE_NOTES.
 cpuminer is a console program that is executed from a DOS command prompt.
 There is no GUI and no mouse support.

+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive, they are flagged simply because they are cryptocurrency 
+miners. The source code is open for anyone to inspect. If you don't trust
+the software, don't use it.
+
 Choose the exe that best matches you CPU's features or use trial and
 error to find the fastest one that doesn't crash. Pay attention to
 the features listed at cpuminer startup to ensure you are mining at
@@ -16,22 +21,22 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.
+Changes in v3.8.4 may have improved compatibility with some of these CPUs.

-Exe name                Compile flags              Arch name

-cpuminer-sse2.exe      "-march=core2"              Core2   
-cpuminer-sse42.exe     "-march=corei7"             Nehalem
-cpuminer-aes-sse42.exe "-maes -msse4.2"            Westmere
-cpuminer-avx.exe       "-march=corei7-avx"         Sandybridge, Ivybridge
-cpuminer-avx2.exe      "-march=core-avx2"          Haswell...
-cpuminer-avx-sha       "-march=corei7-avx -msha"   Ryzen...
-cpuminer-4way.exe      "-march=core-avx2 -DFOUR_WAY"       same as avx2
-cpuminer-4way-sha.exe  "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha
+Exe name                Compile flags            Arch name

-4way requires a CPU with AES and AVX2. It is still under development and
-only a few algos are supported. See change log in RELEASE_NOTES in source
-package for supported algos.
+cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem   
+cpuminer-aes-sse42.exe "-march=westmere"         Westmere, Sandy-Ivybridge
+cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx2-sha.exe  "-march=core-avx2 -msha"  Ryzen
+
+If you like this software feel free to donate:
+
+BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT
+ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0
+LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8
+BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ
+BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ

-Ryzen CPus perform better with AVX than AVX2 therefore an avx-sha build
-is provided. Four way still uses AVX2. 

--- a/150
+++ b/150
@@ -1,4 +1,4 @@
-cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
+cpuminer-opt now supports HW SHA acceleration available on AMD Ryzen CPUs.
 This feature requires recent SW including GCC version 5 or higher and
 openssl version 1.1 or higher. It may also require using "-march=znver1"
 compile flag.
@@ -13,11 +13,11 @@ Security warning
 ----------------

 Miner programs are often flagged as malware by antivirus programs. This is
-a false positive, they are flagged simply because they are miners. The source
-code is open for anyone to inspect. If you don't trust the software, don't use
-it.
+a false positive, they are flagged simply because they are cryptocurrency 
+miners. The source code is open for anyone to inspect. If you don't trust 
+the software, don't use it.

-The cryptographic code has been taken from trusted sources but has been
+The cryptographic hashing code has been taken from trusted sources but has been
 modified for speed at the expense of accepted security practices. This
 code should not be imported into applications where secure cryptography is
 required.
@@ -81,7 +81,7 @@ cd cpuminer-opt-x.y.z
 Run ./build.sh to build on Linux or execute the following commands.

 ./autogen.sh
-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make

 Additional optional compile flags, add the following to CFLAGS to activate:
@@ -90,22 +90,17 @@ Additional optional compile flags, add the following to CFLAGS to activate:

 SPH may give slightly better performance on algos that use sha256 when using
 openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
-better than SPH.
+better than SPH. This option is ignored when 4-way is used, even for CPUs
+with SHA.

-DFOUR_WAY
-
-4 way will give much better performance on supported algos with CPUs
-that have AVX2 and should only be used on CPUs with AVX2. 4 way algo
-support will be added incrementally, see change log below for supported algos.
- 
 Start mining.

 ./cpuminer -a algo -o url -u username -p password

 Windows

-The following in how the Windows binary releases are built. It's old and
-not very good but it works, for me anyway.
+Precompiled Windows binaries are built on a Linux host using Mingw
+with a more recent compiler than the following Windows hosted procedure.

 Building on Windows prerequisites:

@@ -137,10 +132,10 @@ or similar Windows program.
 In msys shell cd to miner directory.
 cd /c/path/to/cpuminer-opt

-Run winbuild.sh to build on Windows or execute the following commands.
+Run build.sh to build on Windows or execute the following commands.

 ./autogen.sh
-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make

 Start mining
@@ -149,9 +144,9 @@ cpuminer.exe -a algo -o url -u user -p password

 The following tips may be useful for older AMD CPUs.

-AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
-supported by cpuminer-opt due to an incompatible implementation of SSE2 on
-these CPUs. Some algos may crash the miner with an invalid instruction.
+AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are
+not supported by cpuminer-opt due to an incompatible implementation of SSE2
+on these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.

 Some users with AMD CPUs without AES_NI have reported problems compiling
@@ -165,6 +160,121 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.8.7.2
+
+Fixed argon2d-dyn regression in v3.8.7.1.
+Changed compile options for aes-sse42 Windows build to -march=westmere
+
+v3.8.7.1
+
+Fixed argon2d-uis low difficulty rejects.
+Fixed argon2d aliases.
+
+v3.8.7
+
+Added argon2d4096 (alias argon2d-uis) for Unitus (UIS).
+argon2d-crds and argon2d-dyn renamed to argon2d250 and argon2d500 respectively.
+  The old names are recognized as aliases.
+AVX512 is now supported for argon2d algos, Linux only.
+AVX is no longer a reported feature and an AVX Windows binary is no longer
+  provided. Use AES-SSE42 build instead.
+
+v3.8.6.1
+
+Faster argon2d* AVX2.
+Untested AVX-512 for argon2d*, YMMV.
+
+v3.8.6
+
+Fixed argon2 regression in v3.8.5.
+Added x16s algo for Pigeoncoin.
+Some code cleanup.
+
+v3.8.5
+
+Added argon2d-crds and argon2d-dyn algos.
+sha256t 8 way AVX2 & 4 way SSE4.2 optimized.
+CPUs with SSE4.2 get optimizations previously reserved for AVX.
+
+v3.8.4.1
+
+Fixed sha256t low difficulty rejects.
+Fixed compile error on CPUs with AVX512.
+
+v3.8.4
+
+Added yescryptr32 algo for WAVI coin.
+Added URL to API data.
+Improved detection of __int128 support (linux only)
+Compile support for CPUs without SSSE3 (no binary support)
+
+v3.8.3.3
+
+Integrated getblocktemplate with algo_gate.
+Added support for hodl gbt (untested).
+Reworked some recent quick fixes.
+
+v3.8.3.2
+
+Reverted gbt changes from v3.8.0 that broke getwork.
+Reverted scaled hash rate for API, added HS term in addition to KHS. 
+Added blocks solved to console display and API.
+
+v3.8.3.1
+
+Fixed regression in v3.8.3 that broke several algos.
+
+v3.8.3
+
+More restoration of lost lyra2 hash.
+8 way AVX2 and 4way AVX optimization for blakecoin, vanilla & blake2s.
+8 way AVX2 for lbry.
+Scaled hashrate for API output.
+A couple of GBT fixes.
+
+v3.8.2.1
+
+Fixed low difficulty rejects with allium.
+Fixed qubit AVX2.
+Restored lyra2z lost hash.
+Fixed build.sh
+
+v3.8.2
+
+Fixed and faster myr-gr.
+Added x12 algo (Galaxie Cash), allium algo (Garlicoin).
+Faster lyra2rev2, lbry, skein.
+Large reduction in compiler warnings.
+
+v3.8.1.1
+
+Fixed Windows AVX2 crash.
+
+v3.8.1
+
+Fixes x16r on CPUs with only SSE2.
+More Optimizations for X algos, qubit & deep.
+Corrected algo optimizations for scrypt and yescrypt, no new optimizations.
+
+v3.8.0.1
+
+Fixed x16r AVX2 low hash rate.
+
+v3.8.0
+
+4way no longer a separate feature, included in AVX2.
+Added x16r algo for Ravencoin, anime algo for Animecoin.
+More 4way optimizations for X13 and up.
+Tweaked CPU affinity to better support more than 64 CPUs.
+Fixed compile problem on some old AMD CPUs.
+
+v3.7.10
+
+4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10
+   x11evo, blakecoin.
+Faster x13sm3 (hsr).
+Added share difficulty to accepted message.
+
 v3.7.9

 Partial 4way optimizations for veltor, skunk, polytimos, lyra2z.
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -16,7 +16,7 @@
 #include <memory.h>
 #include <unistd.h>
 #include <openssl/sha.h>
-#include "miner.h"
+//#include "miner.h"
 #include "algo-gate-api.h"

 // Define null and standard functions.
@@ -119,9 +119,11 @@ void init_algo_gate( algo_gate_t* gate )
   gate->gen_merkle_root         = (void*)&sha256d_gen_merkle_root;
   gate->stratum_gen_work        = (void*)&std_stratum_gen_work;
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
+   gate->malloc_txs_request      = (void*)&std_malloc_txs_request;
   gate->set_target              = (void*)&std_set_target;
   gate->work_decode             = (void*)&std_le_work_decode;
   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
+   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
   gate->set_work_data_endian    = (void*)&do_nothing;
   gate->calc_network_diff       = (void*)&std_calc_network_diff;
@@ -155,7 +157,12 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   switch (algo)
   {
+     case ALGO_ALLIUM:       register_allium_algo      ( gate ); break;
+     case ALGO_ANIME:        register_anime_algo       ( gate ); break;
     case ALGO_ARGON2:       register_argon2_algo      ( gate ); break;
+     case ALGO_ARGON2D250:   register_argon2d_crds_algo( gate ); break;
+     case ALGO_ARGON2D500:   register_argon2d_dyn_algo ( gate ); break;
+     case ALGO_ARGON2D4096:  register_argon2d4096_algo ( gate ); break;
     case ALGO_AXIOM:        register_axiom_algo       ( gate ); break;
     case ALGO_BASTION:      register_bastion_algo     ( gate ); break;
     case ALGO_BLAKE:        register_blake_algo       ( gate ); break;
@@ -212,15 +219,19 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
     case ALGO_X11:          register_x11_algo         ( gate ); break;
     case ALGO_X11EVO:       register_x11evo_algo      ( gate ); break;
     case ALGO_X11GOST:      register_x11gost_algo     ( gate ); break;
+     case ALGO_X12:          register_x12_algo         ( gate ); break;
     case ALGO_X13:          register_x13_algo         ( gate ); break;
     case ALGO_X13SM3:       register_x13sm3_algo      ( gate ); break;
     case ALGO_X14:          register_x14_algo         ( gate ); break;
     case ALGO_X15:          register_x15_algo         ( gate ); break;
+     case ALGO_X16R:         register_x16r_algo        ( gate ); break;
+     case ALGO_X16S:         register_x16s_algo        ( gate ); break;
     case ALGO_X17:          register_x17_algo         ( gate ); break;
     case ALGO_XEVAN:        register_xevan_algo       ( gate ); break;
     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
     case ALGO_YESCRYPTR8:   register_yescryptr8_algo  ( gate ); break;
     case ALGO_YESCRYPTR16:  register_yescryptr16_algo ( gate ); break;
+     case ALGO_YESCRYPTR32:  register_yescryptr32_algo ( gate ); break;
     case ALGO_ZR5:          register_zr5_algo         ( gate ); break;
    default:
        applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
@@ -278,6 +289,9 @@ void exec_hash_function( int algo, void *output, const void *pdata )
 const char* const algo_alias_map[][2] =
 {
 //   alias                proper
+  { "argon2d-crds",      "argon2d250"   },
+  { "argon2d-dyn",       "argon2d500"   },
+  { "argon2d-uis",       "argon2d4096"  },
  { "bitcore",           "timetravel10" },
  { "bitzeny",           "yescryptr8"   },
  { "blake256r8",        "blakecoin"    },
@@ -296,6 +310,7 @@ const char* const algo_alias_map[][2] =
  { "lyra2",             "lyra2re"      },
  { "lyra2v2",           "lyra2rev2"    },
  { "lyra2zoin",         "lyra2z330"    },
+  { "myrgr",             "myr-gr"       },
  { "myriad",            "myr-gr"       },
  { "neo",               "neoscrypt"    },
  { "phi",               "phi1612"      },
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -1,8 +1,9 @@
 #include <stdlib.h>
 #include <stdbool.h>
 #include <stdint.h>
-
 #include "miner.h"
+#include "avxdefs.h"
+#include "interleave.h"

 /////////////////////////////
 ////
@@ -88,10 +89,11 @@ typedef  uint32_t set_t;
 #define EMPTY_SET       0
 #define SSE2_OPT        1
 #define AES_OPT         2  
-#define AVX_OPT         4
-#define AVX2_OPT        8
-#define SHA_OPT      0x10
-#define FOUR_WAY_OPT 0x20
+#define SSE42_OPT       4
+#define AVX_OPT         8
+#define AVX2_OPT     0x10
+#define SHA_OPT      0x20
+#define AVX512_OPT   0x40

 // return set containing all elements from sets a & b
 inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -128,7 +130,10 @@ void ( *set_target)              ( struct work*, double );
 bool ( *submit_getwork_result )  ( CURL*, struct work* );
 void ( *gen_merkle_root )        ( char*, struct stratum_ctx* );
 void ( *build_extraheader )      ( struct work*, struct stratum_ctx* );
+void ( *build_block_header )     ( struct work*, uint32_t, uint32_t*,
+                                   uint32_t*, uint32_t, uint32_t );
 void ( *build_stratum_request )  ( char*, struct work*, struct stratum_ctx* );
+char* ( *malloc_txs_request )    ( struct work* );
 void ( *set_work_data_endian )   ( struct work* );
 double ( *calc_network_diff )    ( struct work* );
 bool ( *ready_to_mine )          ( struct work*, struct stratum_ctx*, int );
@@ -213,7 +218,8 @@ int64_t get_max64_0x3fffffLL();
 int64_t get_max64_0x1ffff();
 int64_t get_max64_0xffffLL();

-void std_set_target   ( struct work *work, double job_diff );
+void std_set_target(    struct work *work, double job_diff );
+void alt_set_target(    struct work* work, double job_diff );
 void scrypt_set_target( struct work *work, double job_diff );

 bool std_le_work_decode( const json_t *val, struct work *work );
@@ -228,11 +234,17 @@ void std_le_build_stratum_request( char *req, struct work *work );
 void std_be_build_stratum_request( char *req, struct work *work );
 void jr2_build_stratum_request   ( char *req, struct work *work );

+char* std_malloc_txs_request( struct work *work );
+
 // Default is do_nothing (assumed LE)
 void set_work_data_big_endian( struct work *work );

 double std_calc_network_diff( struct work *work );

+void std_build_block_header( struct work* g_work, uint32_t version,
+                             uint32_t *prevhash, uint32_t *merkle_root,
+                             uint32_t ntime, uint32_t nbits );
+
 void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );

 json_t* std_longpoll_rpc_call( CURL *curl, int *err, char *lp_url );
--- a/algo/argon2/argon2a/ar2/ar2-scrypt-jane.c
+++ b/algo/argon2/argon2a/ar2/ar2-scrypt-jane.c
--- a/algo/argon2/argon2a/ar2/ar2-scrypt-jane.h
+++ b/algo/argon2/argon2a/ar2/ar2-scrypt-jane.h
--- a/algo/argon2/argon2a/ar2/argon2.c
+++ b/algo/argon2/argon2a/ar2/argon2.c
@@ -99,18 +99,18 @@ static const char *Argon2_ErrorMessage[] = {
 {ARGON2_MISSING_ARGS, */ "Missing arguments", /*},*/
 };

-int argon2d(argon2_context *context) { return argon2_core(context, Argon2_d); }
+int argon2d(argon2_context *context) { return ar2_argon2_core(context, Argon2_d); }

-int argon2i(argon2_context *context) { return argon2_core(context, Argon2_i); }
+int argon2i(argon2_context *context) { return ar2_argon2_core(context, Argon2_i); }

-int verify_d(argon2_context *context, const char *hash)
+int ar2_verify_d(argon2_context *context, const char *hash)
 {
 	int result;
 	/*if (0 == context->outlen || NULL == hash) {
 		return ARGON2_OUT_PTR_MISMATCH;
 	}*/

-	result = argon2_core(context, Argon2_d);
+	result = ar2_argon2_core(context, Argon2_d);

 	if (ARGON2_OK != result) {
 		return result;
@@ -223,7 +223,7 @@ static size_t to_base64(char *dst, size_t dst_len, const void *src)
 * The output length is always exactly 32 bytes.
 */

-int encode_string(char *dst, size_t dst_len, argon2_context *ctx)
+int ar2_encode_string(char *dst, size_t dst_len, argon2_context *ctx)
 {
 #define SS(str)                                                                \
 	do {                                                                       \
--- a/algo/argon2/argon2a/ar2/argon2.h
+++ b/algo/argon2/argon2a/ar2/argon2.h
@@ -255,7 +255,7 @@ int argon2id(argon2_context *context);
 * specified by the context outlen member
 * @return  Zero if successful, a non zero error code otherwise
 */
-int verify_d(argon2_context *context, const char *hash);
+int ar2_verify_d(argon2_context *context, const char *hash);

 /*
 * Get the associated error message for given error code
@@ -283,7 +283,7 @@ const char *error_message(int error_code);
 * The output length is always exactly 32 bytes.
 */

-int encode_string(char *dst, size_t dst_len, argon2_context *ctx);
+int ar2_encode_string(char *dst, size_t dst_len, argon2_context *ctx);

 #if defined(__cplusplus)
 }
--- a/algo/argon2/argon2a/ar2/bench.c
+++ b/algo/argon2/argon2a/ar2/bench.c
--- a/algo/argon2/argon2a/ar2/blake2/blake2-impl.h
+++ b/algo/argon2/argon2a/ar2/blake2/blake2-impl.h
--- a/algo/argon2/argon2a/ar2/blake2/blake2.h
+++ b/algo/argon2/argon2a/ar2/blake2/blake2.h
@@ -52,22 +52,22 @@ enum {
 };

 /* Streaming API */
-int blake2b_init(blake2b_state *S, size_t outlen);
-int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
+int ar2_blake2b_init(blake2b_state *S, size_t outlen);
+int ar2_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
 					 size_t keylen);
-int blake2b_init_param(blake2b_state *S, const blake2b_param *P);
-int blake2b_update(blake2b_state *S, const void *in, size_t inlen);
+int ar2_blake2b_init_param(blake2b_state *S, const blake2b_param *P);
+int ar2_blake2b_update(blake2b_state *S, const void *in, size_t inlen);
 void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen);
-int blake2b_final(blake2b_state *S, void *out, size_t outlen);
+int ar2_blake2b_final(blake2b_state *S, void *out, size_t outlen);

 /* Simple API */
-int blake2b(void *out, const void *in, const void *key, size_t keylen);
+int ar2_blake2b(void *out, const void *in, const void *key, size_t keylen);

 /* Argon2 Team - Begin Code */
-int blake2b_long(void *out, const void *in);
+int ar2_blake2b_long(void *out, const void *in);
 /* Argon2 Team - End Code */
 /* Miouyouyou */
-void blake2b_too(void *out, const void *in);
+void ar2_blake2b_too(void *out, const void *in);

 #if defined(__cplusplus)
 }
--- a/algo/argon2/argon2a/ar2/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2a/ar2/blake2/blamka-round-opt.h
--- a/algo/argon2/argon2a/ar2/blake2/blamka-round-ref.h
+++ b/algo/argon2/argon2a/ar2/blake2/blamka-round-ref.h
--- a/algo/argon2/argon2a/ar2/blake2b.c
+++ b/algo/argon2/argon2a/ar2/blake2b.c
@@ -107,7 +107,7 @@ static const blake2b_state miou = {
 };


-int blake2b_init_param(blake2b_state *S, const blake2b_param *P)
+int ar2_blake2b_init_param(blake2b_state *S, const blake2b_param *P)
 {
 	const unsigned char *p = (const unsigned char *)P;
 	unsigned int i;
@@ -133,7 +133,7 @@ void compare_buffs(uint64_t *h, size_t outlen)
 }

 /* Sequential blake2b initialization */
-int blake2b_init(blake2b_state *S, size_t outlen)
+int ar2_blake2b_init(blake2b_state *S, size_t outlen)
 {
 	memcpy(S, &miou, sizeof(*S));
 	S->h[0] += outlen;
@@ -147,7 +147,7 @@ void print64(const char *name, const uint64_t *array, uint16_t size)
 	printf("};\n");
 }

-int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen)
+int ar2_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen)
 {
 	return 0;
 }
@@ -207,7 +207,7 @@ static void blake2b_compress(blake2b_state *S, const uint8_t *block)
 #undef ROUND
 }

-int blake2b_update(blake2b_state *S, const void *in, size_t inlen)
+int ar2_blake2b_update(blake2b_state *S, const void *in, size_t inlen)
 {
 	const uint8_t *pin = (const uint8_t *)in;
 	/* Complete current block */
@@ -235,7 +235,7 @@ void my_blake2b_update(blake2b_state *S, const void *in, size_t inlen)
 	S->buflen += (unsigned int)inlen;
 }

-int blake2b_final(blake2b_state *S, void *out, size_t outlen)
+int ar2_blake2b_final(blake2b_state *S, void *out, size_t outlen)
 {
 	uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
 	unsigned int i;
@@ -257,48 +257,48 @@ int blake2b_final(blake2b_state *S, void *out, size_t outlen)
 	return 0;
 }

-int blake2b(void *out, const void *in, const void *key, size_t keylen)
+int ar2_blake2b(void *out, const void *in, const void *key, size_t keylen)
 {
 	blake2b_state S;

-	blake2b_init(&S, 64);
+	ar2_blake2b_init(&S, 64);
 	my_blake2b_update(&S, in, 64);
-	blake2b_final(&S, out, 64);
+	ar2_blake2b_final(&S, out, 64);
 	burn(&S, sizeof(S));
 	return 0;
 }

-void blake2b_too(void *pout, const void *in)
+void ar2_blake2b_too(void *pout, const void *in)
 {
 	uint8_t *out = (uint8_t *)pout;
 	uint8_t out_buffer[64];
 	uint8_t in_buffer[64];

 	blake2b_state blake_state;
-	blake2b_init(&blake_state, 64);
+	ar2_blake2b_init(&blake_state, 64);
 	blake_state.buflen = blake_state.buf[1] = 4;
 	my_blake2b_update(&blake_state, in, 72);
-	blake2b_final(&blake_state, out_buffer, 64);
+	ar2_blake2b_final(&blake_state, out_buffer, 64);
 	memcpy(out, out_buffer, 32);
 	out += 32;

 	register uint8_t i = 29;
 	while (i--) {
 		memcpy(in_buffer, out_buffer, 64);
-		blake2b(out_buffer, in_buffer, NULL, 0);
+		ar2_blake2b(out_buffer, in_buffer, NULL, 0);
 		memcpy(out, out_buffer, 32);
 		out += 32;
 	}

 	memcpy(in_buffer, out_buffer, 64);
-	blake2b(out_buffer, in_buffer, NULL, 0);
+	ar2_blake2b(out_buffer, in_buffer, NULL, 0);
 	memcpy(out, out_buffer, 64);

 	burn(&blake_state, sizeof(blake_state));
 }

 /* Argon2 Team - Begin Code */
-int blake2b_long(void *pout, const void *in)
+int ar2_blake2b_long(void *pout, const void *in)
 {
 	uint8_t *out = (uint8_t *)pout;
 	blake2b_state blake_state;
@@ -306,10 +306,10 @@ int blake2b_long(void *pout, const void *in)

 	store32(outlen_bytes, 32);

-	blake2b_init(&blake_state, 32);
+	ar2_blake2b_init(&blake_state, 32);
 	my_blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes));
-	blake2b_update(&blake_state, in, 1024);
-	blake2b_final(&blake_state, out, 32);
+	ar2_blake2b_update(&blake_state, in, 1024);
+	ar2_blake2b_final(&blake_state, out, 32);
 	burn(&blake_state, sizeof(blake_state));
 	return 0;
 }
--- a/algo/argon2/argon2a/ar2/cores.c
+++ b/algo/argon2/argon2a/ar2/cores.c
@@ -51,15 +51,15 @@
 #endif

 /***************Instance and Position constructors**********/
-void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
+void ar2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }
 //inline void init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); }

-void copy_block(block *dst, const block *src) {
+void ar2_copy_block(block *dst, const block *src) {
 //inline void copy_block(block *dst, const block *src) {
    memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_WORDS_IN_BLOCK);
 }

-void xor_block(block *dst, const block *src) {
+void ar2_xor_block(block *dst, const block *src) {
 //inline void xor_block(block *dst, const block *src) {
    int i;
    for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
@@ -67,7 +67,7 @@ void xor_block(block *dst, const block *src) {
    }
 }

-static void load_block(block *dst, const void *input) {
+static void ar2_load_block(block *dst, const void *input) {
 //static inline void load_block(block *dst, const void *input) {
    unsigned i;
    for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
@@ -75,7 +75,7 @@ static void load_block(block *dst, const void *input) {
    }
 }

-static void store_block(void *output, const block *src) {
+static void ar2_store_block(void *output, const block *src) {
 //static inline void store_block(void *output, const block *src) {
    unsigned i;
    for (i = 0; i < ARGON2_WORDS_IN_BLOCK; ++i) {
@@ -84,7 +84,7 @@ static void store_block(void *output, const block *src) {
 }

 /***************Memory allocators*****************/
-int allocate_memory(block **memory, uint32_t m_cost) {
+int ar2_allocate_memory(block **memory, uint32_t m_cost) {
    if (memory != NULL) {
        size_t memory_size = sizeof(block) * m_cost;
        if (m_cost != 0 &&
@@ -105,34 +105,34 @@ int allocate_memory(block **memory, uint32_t m_cost) {
    }
 }

-void secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); }
+void ar2_secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); }
 //inline void secure_wipe_memory(void *v, size_t n) { memset(v, 0, n); }

 /*********Memory functions*/

-void clear_memory(argon2_instance_t *instance, int clear) {
+void ar2_clear_memory(argon2_instance_t *instance, int clear) {
 //inline void clear_memory(argon2_instance_t *instance, int clear) {
    if (instance->memory != NULL && clear) {
-        secure_wipe_memory(instance->memory,
+        ar2_secure_wipe_memory(instance->memory,
                           sizeof(block) * /*instance->memory_blocks*/16);
    }
 }

-void free_memory(block *memory) { free(memory); }
+void ar2_free_memory(block *memory) { free(memory); }
 //inline void free_memory(block *memory) { free(memory); }

-void finalize(const argon2_context *context, argon2_instance_t *instance) {
+void ar2_finalize(const argon2_context *context, argon2_instance_t *instance) {
    if (context != NULL && instance != NULL) {
        block blockhash;
-        copy_block(&blockhash, instance->memory + 15);
+        ar2_copy_block(&blockhash, instance->memory + 15);

        /* Hash the result */
        {
            uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
-            store_block(blockhash_bytes, &blockhash);
-            blake2b_long(context->out, blockhash_bytes);
-            secure_wipe_memory(blockhash.v, ARGON2_BLOCK_SIZE);
-            secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); /* clear blockhash_bytes */
+            ar2_store_block(blockhash_bytes, &blockhash);
+            ar2_blake2b_long(context->out, blockhash_bytes);
+            ar2_secure_wipe_memory(blockhash.v, ARGON2_BLOCK_SIZE);
+            ar2_secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); /* clear blockhash_bytes */
        }

 #ifdef GENKAT
@@ -142,11 +142,11 @@ void finalize(const argon2_context *context, argon2_instance_t *instance) {
        /* Clear memory */
        // clear_memory(instance, 1);

-        free_memory(instance->memory);
+        ar2_free_memory(instance->memory);
    }
 }

-uint32_t index_alpha(const argon2_instance_t *instance,
+uint32_t ar2_index_alpha(const argon2_instance_t *instance,
                     const argon2_position_t *position, uint32_t pseudo_rand,
                     int same_lane) {
    /*
@@ -207,7 +207,7 @@ uint32_t index_alpha(const argon2_instance_t *instance,
    return absolute_position;
 }

-void fill_memory_blocks(argon2_instance_t *instance) {
+void ar2_fill_memory_blocks(argon2_instance_t *instance) {
    uint32_t r, s;

    for (r = 0; r < 2; ++r) {
@@ -218,7 +218,7 @@ void fill_memory_blocks(argon2_instance_t *instance) {
            position.lane = 0;
            position.slice = (uint8_t)s;
            position.index = 0;
-            fill_segment(instance, position);
+            ar2_fill_segment(instance, position);
        }

 #ifdef GENKAT
@@ -227,19 +227,19 @@ void fill_memory_blocks(argon2_instance_t *instance) {
    }
 }

-void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
+void ar2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
    /* Make the first and second block in each lane as G(H0||i||0) or
       G(H0||i||1) */
    uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
    store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0);
    store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, 0);
-    blake2b_too(blockhash_bytes, blockhash);
-    load_block(&instance->memory[0], blockhash_bytes);
+    ar2_blake2b_too(blockhash_bytes, blockhash);
+    ar2_load_block(&instance->memory[0], blockhash_bytes);

    store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1);
-    blake2b_too(blockhash_bytes, blockhash);
-    load_block(&instance->memory[1], blockhash_bytes);
-    secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
+    ar2_blake2b_too(blockhash_bytes, blockhash);
+    ar2_load_block(&instance->memory[1], blockhash_bytes);
+    ar2_secure_wipe_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
 }


@@ -268,7 +268,7 @@ static const blake2b_state base_hash = {
 #define SALTLEN 32
 #define SECRETLEN 0
 #define ADLEN 0
-void initial_hash(uint8_t *blockhash, argon2_context *context,
+void ar2_initial_hash(uint8_t *blockhash, argon2_context *context,
                  argon2_type type) {

    uint8_t value[sizeof(uint32_t)];
@@ -280,7 +280,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
                   PWDLEN);


-    secure_wipe_memory(context->pwd, PWDLEN);
+    ar2_secure_wipe_memory(context->pwd, PWDLEN);
    context->pwdlen = 0;

    store32(&value, SALTLEN);
@@ -295,22 +295,22 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
    store32(&value, ADLEN);
    my_blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

-    blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+    ar2_blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
 }

-int initialize(argon2_instance_t *instance, argon2_context *context) {
+int ar2_initialize(argon2_instance_t *instance, argon2_context *context) {
    /* 1. Memory allocation */


-    allocate_memory(&(instance->memory), 16);
+    ar2_allocate_memory(&(instance->memory), 16);

    /* 2. Initial hashing */
    /* H_0 + 8 extra bytes to produce the first blocks */
    /* Hashing all inputs */
    uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH];
-    initial_hash(blockhash, context, instance->type);
+    ar2_initial_hash(blockhash, context, instance->type);
    /* Zeroing 8 extra bytes */
-    secure_wipe_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
+    ar2_secure_wipe_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
                       ARGON2_PREHASH_SEED_LENGTH -
                           ARGON2_PREHASH_DIGEST_LENGTH);

@@ -320,14 +320,14 @@ int initialize(argon2_instance_t *instance, argon2_context *context) {

    /* 3. Creating first blocks, we always have at least two blocks in a slice
     */
-    fill_first_blocks(blockhash, instance);
+    ar2_fill_first_blocks(blockhash, instance);
    /* Clearing the hash */
-    secure_wipe_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);
+    ar2_secure_wipe_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH);

    return ARGON2_OK;
 }

-int argon2_core(argon2_context *context, argon2_type type) {
+int ar2_argon2_core(argon2_context *context, argon2_type type) {
    argon2_instance_t instance;
    instance.memory = NULL;
    instance.type = type;
@@ -336,14 +336,14 @@ int argon2_core(argon2_context *context, argon2_type type) {
     * blocks
     */

-    int result = initialize(&instance, context);
+    int result = ar2_initialize(&instance, context);
    if (ARGON2_OK != result) return result;

    /* 4. Filling memory */
-    fill_memory_blocks(&instance);
+    ar2_fill_memory_blocks(&instance);

    /* 5. Finalization */
-    finalize(context, &instance);
+    ar2_finalize(context, &instance);

    return ARGON2_OK;
 }
--- a/algo/argon2/argon2a/ar2/cores.h
+++ b/algo/argon2/argon2a/ar2/cores.h
@@ -62,13 +62,13 @@ typedef struct _block { uint64_t v[ARGON2_WORDS_IN_BLOCK]; } ALIGN(16) block;
 /*****************Functions that work with the block******************/

 /* Initialize each byte of the block with @in */
-void init_block_value(block *b, uint8_t in);
+void ar2_init_block_value(block *b, uint8_t in);

 /* Copy block @src to block @dst */
-void copy_block(block *dst, const block *src);
+void ar2_copy_block(block *dst, const block *src);

 /* XOR @src onto @dst bytewise */
-void xor_block(block *dst, const block *src);
+void ar2_xor_block(block *dst, const block *src);

 /*
 * Argon2 instance: memory pointer, number of passes, amount of memory, type,
@@ -101,24 +101,24 @@ typedef struct Argon2_position_t {
 * @param m_cost number of blocks to allocate in the memory
 * @return ARGON2_OK if @memory is a valid pointer and memory is allocated
 */
-int allocate_memory(block **memory, uint32_t m_cost);
+int ar2_allocate_memory(block **memory, uint32_t m_cost);

 /* Function that securely cleans the memory
 * @param mem Pointer to the memory
 * @param s Memory size in bytes
 */
-void secure_wipe_memory(void *v, size_t n);
+void ar2_secure_wipe_memory(void *v, size_t n);

 /* Clears memory
 * @param instance pointer to the current instance
 * @param clear_memory indicates if we clear the memory with zeros.
 */
-void clear_memory(argon2_instance_t *instance, int clear);
+void ar2_clear_memory(argon2_instance_t *instance, int clear);

 /* Deallocates memory
 * @param memory pointer to the blocks
 */
-void free_memory(block *memory);
+void ar2_free_memory(block *memory);

 /*
 * Computes absolute position of reference block in the lane following a skewed
@@ -130,7 +130,7 @@ void free_memory(block *memory);
 * If so we can reference the current segment
 * @pre All pointers must be valid
 */
-uint32_t index_alpha(const argon2_instance_t *instance,
+uint32_t ar2_index_alpha(const argon2_instance_t *instance,
                     const argon2_position_t *position, uint32_t pseudo_rand,
                     int same_lane);

@@ -141,7 +141,7 @@ uint32_t index_alpha(const argon2_instance_t *instance,
 * @return ARGON2_OK if everything is all right, otherwise one of error codes
 * (all defined in <argon2.h>
 */
-int validate_inputs(const argon2_context *context);
+int ar2_validate_inputs(const argon2_context *context);

 /*
 * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
@@ -153,7 +153,7 @@ int validate_inputs(const argon2_context *context);
 * @pre    @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
 * allocated
 */
-void initial_hash(uint8_t *blockhash, argon2_context *context,
+void ar2_initial_hash(uint8_t *blockhash, argon2_context *context,
                  argon2_type type);

 /*
@@ -162,7 +162,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
 * @param blockhash Pointer to the pre-hashing digest
 * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
 */
-void fill_firsts_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
+void ar2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);

 /*
 * Function allocates memory, hashes the inputs with Blake,  and creates first
@@ -174,7 +174,7 @@ void fill_firsts_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
 * @return Zero if successful, -1 if memory failed to allocate. @context->state
 * will be modified if successful.
 */
-int initialize(argon2_instance_t *instance, argon2_context *context);
+int ar2_initialize(argon2_instance_t *instance, argon2_context *context);

 /*
 * XORing the last block of each lane, hashing it, making the tag. Deallocates
@@ -187,7 +187,7 @@ int initialize(argon2_instance_t *instance, argon2_context *context);
 * @pre if context->free_cbk is not NULL, it should point to a function that
 * deallocates memory
 */
-void finalize(const argon2_context *context, argon2_instance_t *instance);
+void ar2_finalize(const argon2_context *context, argon2_instance_t *instance);

 /*
 * Function that fills the segment using previous segments also from other
@@ -196,7 +196,7 @@ void finalize(const argon2_context *context, argon2_instance_t *instance);
 * @param position Current position
 * @pre all block pointers must be valid
 */
-void fill_segment(const argon2_instance_t *instance,
+void ar2_fill_segment(const argon2_instance_t *instance,
                  argon2_position_t position);

 /*
@@ -204,13 +204,13 @@ void fill_segment(const argon2_instance_t *instance,
 * blocks in each lane
 * @param instance Pointer to the current instance
 */
-void fill_memory_blocks(argon2_instance_t *instance);
+void ar2_fill_memory_blocks(argon2_instance_t *instance);

 /*
 * Function that performs memory-hard hashing with certain degree of parallelism
 * @param  context  Pointer to the Argon2 internal structure
 * @return Error code if smth is wrong, ARGON2_OK otherwise
 */
-int argon2_core(argon2_context *context, argon2_type type);
+int ar2_argon2_core(argon2_context *context, argon2_type type);

 #endif
--- a/algo/argon2/argon2a/ar2/genkat.c.hide
+++ b/algo/argon2/argon2a/ar2/genkat.c.hide
--- a/algo/argon2/argon2a/ar2/genkat.h.hide
+++ b/algo/argon2/argon2a/ar2/genkat.h.hide
--- a/algo/argon2/argon2a/ar2/opt.c
+++ b/algo/argon2/argon2a/ar2/opt.c
@@ -26,7 +26,7 @@
 #include "blake2/blake2.h"
 #include "blake2/blamka-round-opt.h"

-void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block)
+void ar2_fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block)
 {
    __m128i ALIGN(16) block_XY[ARGON2_QWORDS_IN_BLOCK];
    uint32_t i;
@@ -95,7 +95,7 @@ static const uint64_t bad_rands[32] = {
    UINT64_C(8548260058287621283),  UINT64_C(8641748798041936364)
 };

-void generate_addresses(const argon2_instance_t *instance,
+void ar2_generate_addresses(const argon2_instance_t *instance,
                        const argon2_position_t *position,
                        uint64_t *pseudo_rands)
 {
@@ -113,7 +113,7 @@ void generate_addresses(const argon2_instance_t *instance,
 #define LANE_LENGTH 16
 #define POS_LANE 0

-void fill_segment(const argon2_instance_t *instance,
+void ar2_fill_segment(const argon2_instance_t *instance,
                  argon2_position_t position)
 {
    block *ref_block = NULL, *curr_block = NULL;
@@ -129,7 +129,7 @@ void fill_segment(const argon2_instance_t *instance,
    pseudo_rands = (uint64_t *)malloc(/*sizeof(uint64_t) * 4*/32);

    if (data_independent_addressing) {
-        generate_addresses(instance, &position, pseudo_rands);
+        ar2_generate_addresses(instance, &position, pseudo_rands);
    }

    i = 0;
@@ -173,12 +173,12 @@ void fill_segment(const argon2_instance_t *instance,
         * lane.
         */
        position.index = i;
-        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,1);
+        ref_index = ar2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,1);

        /* 2 Creating a new block */
        ref_block = instance->memory + ref_index;
        curr_block = instance->memory + curr_offset;
-        fill_block(state, (__m128i const *)ref_block->v, (__m128i *)curr_block->v);
+        ar2_fill_block(state, (__m128i const *)ref_block->v, (__m128i *)curr_block->v);
    }

    free(pseudo_rands);
--- a/algo/argon2/argon2a/ar2/opt.h
+++ b/algo/argon2/argon2a/ar2/opt.h
@@ -21,7 +21,7 @@
 * @param next_block Pointer to the block to be constructed
 * @pre all block pointers must be valid
 */
-void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block);
+void ar2_fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block);

 /*
 * Generate pseudo-random values to reference blocks in the segment and puts
@@ -31,7 +31,7 @@ void fill_block(__m128i *state, __m128i const *ref_block, __m128i *next_block);
 * @param pseudo_rands Pointer to the array of 64-bit values
 * @pre pseudo_rands must point to @a instance->segment_length allocated values
 */
-void generate_addresses(const argon2_instance_t *instance,
+void ar2_generate_addresses(const argon2_instance_t *instance,
                        const argon2_position_t *position,
                        uint64_t *pseudo_rands);

@@ -43,7 +43,7 @@ void generate_addresses(const argon2_instance_t *instance,
 * @param position Current position
 * @pre all block pointers must be valid
 */
-void fill_segment(const argon2_instance_t *instance,
+void ar2_fill_segment(const argon2_instance_t *instance,
                  argon2_position_t position);

 #endif /* ARGON2_OPT_H */
--- a/algo/argon2/argon2a/ar2/ref.c.hide
+++ b/algo/argon2/argon2a/ar2/ref.c.hide
--- a/algo/argon2/argon2a/ar2/ref.h.hide
+++ b/algo/argon2/argon2a/ar2/ref.h.hide
--- a/algo/argon2/argon2a/ar2/run.c.hide
+++ b/algo/argon2/argon2a/ar2/run.c.hide
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash_skein512.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-hash_skein512.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx2.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-avx2.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-sse2.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-sse2.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-ssse3.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-xop.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64-xop.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-mix_salsa64.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-pbkdf2.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-pbkdf2.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable-x86.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-portable.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-basic.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-template.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix-template.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-romix.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-salsa64.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-salsa64.h
--- a/algo/argon2/argon2a/ar2/sj/scrypt-jane-test-vectors.h
+++ b/algo/argon2/argon2a/ar2/sj/scrypt-jane-test-vectors.h
--- a/algo/argon2/argon2a/argon2a.c
+++ b/algo/argon2/argon2a/argon2a.c
@@ -24,7 +24,7 @@ inline void argon_call(void *out, void *in, void *salt, int type)
 	context.allocate_cbk = NULL;
 	context.free_cbk = NULL;

-	argon2_core(&context, type);
+	ar2_argon2_core(&context, type);
 }

 void argon2hash(void *output, const void *input)
@@ -79,7 +79,7 @@ int64_t argon2_get_max64 ()

 bool register_argon2_algo( algo_gate_t* gate )
 {
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
  gate->scanhash        = (void*)&scanhash_argon2;
  gate->hash            = (void*)&argon2hash;
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -0,0 +1,198 @@
+#include "argon2d-gate.h"
+#include "argon2d/argon2.h"
+
+static const size_t INPUT_BYTES = 80;  // Length of a block header in bytes. Input Length = Salt Length (salt = input)
+static const size_t OUTPUT_BYTES = 32; // Length of output needed for a 256-bit hash
+static const unsigned int DEFAULT_ARGON2_FLAG = 2; //Same as ARGON2_DEFAULT_FLAGS
+
+// Credits: Argon2d over the 80-byte header, used as both password and salt; 32-byte digest goes to output.
+// The status code returned by argon2_ctx() is not checked.
+void argon2d_crds_hash( void *output, const void *input )
+{
+	argon2_context context;
+	context.out = (uint8_t *)output;
+	context.outlen = (uint32_t)OUTPUT_BYTES;
+	context.pwd = (uint8_t *)input;
+	context.pwdlen = (uint32_t)INPUT_BYTES;
+	context.salt = (uint8_t *)input; //salt = input
+	context.saltlen = (uint32_t)INPUT_BYTES;
+	context.secret = NULL;
+	context.secretlen = 0;
+	context.ad = NULL;
+	context.adlen = 0;
+	context.allocate_cbk = NULL;
+	context.free_cbk = NULL;
+	context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
+	// main configurable Argon2 hash parameters
+	context.m_cost = 250; // Memory in KiB (~256KB)
+	context.lanes = 4;    // Degree of Parallelism
+	context.threads = 1;  // Threads
+	context.t_cost = 1;   // Iterations
+        context.version = ARGON2_VERSION_10;
+
+	argon2_ctx( &context, Argon2_d );
+}
+// Credits nonce scan. Returns 1 with pdata[19] set to the winning nonce, else 0; *hashes_done is set on both paths.
+int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+        uint32_t _ALIGN(64) endiandata[20];
+        uint32_t _ALIGN(64) hash[8];
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+
+        const uint32_t first_nonce = pdata[19];
+        const uint32_t Htarg = ptarget[7];
+
+        uint32_t nonce = first_nonce;
+
+        swab32_array( endiandata, pdata, 20 );
+        // Each pass writes the candidate nonce into word 19 and hashes all 20 words.
+        do {
+                be32enc(&endiandata[19], nonce);
+                argon2d_crds_hash( hash, endiandata );
+                if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+                {
+                        pdata[19] = nonce;
+                        *hashes_done = pdata[19] - first_nonce;
+                        work_set_target_ratio(work, hash);
+                        return 1;
+                }
+                nonce++;
+        } while (nonce < max_nonce && !work_restart[thr_id].restart);
+        // Loop ended on max_nonce or a restart request without a hit.
+        pdata[19] = nonce;
+        *hashes_done = pdata[19] - first_nonce + 1;
+        return 0;
+}
+// Populates gate with the Credits callbacks and optimization flags; always returns true.
+bool register_argon2d_crds_algo( algo_gate_t* gate )
+{
+        gate->optimizations = AVX512_OPT | AVX2_OPT | SSE2_OPT;
+        gate->set_target    = (void*)scrypt_set_target;
+        gate->hash          = (void*)argon2d_crds_hash;
+        gate->scanhash      = (void*)scanhash_argon2d_crds;
+        return true;
+}
+
+// Dynamic: Argon2d over the 80-byte header, used as both password and salt; 32-byte digest goes to output.
+// The status code returned by argon2_ctx() is not checked.
+void argon2d_dyn_hash( void *output, const void *input )
+{
+    argon2_context context;
+    context.out = (uint8_t *)output;
+    context.outlen = (uint32_t)OUTPUT_BYTES;
+    context.pwd = (uint8_t *)input;
+    context.pwdlen = (uint32_t)INPUT_BYTES;
+    context.salt = (uint8_t *)input; //salt = input
+    context.saltlen = (uint32_t)INPUT_BYTES;
+    context.secret = NULL;
+    context.secretlen = 0;
+    context.ad = NULL;
+    context.adlen = 0;
+    context.allocate_cbk = NULL;
+    context.free_cbk = NULL;
+    context.flags = DEFAULT_ARGON2_FLAG; // = ARGON2_DEFAULT_FLAGS
+    // main configurable Argon2 hash parameters
+    context.m_cost = 500;  // Memory in KiB (512KB)
+    context.lanes = 8;     // Degree of Parallelism
+    context.threads = 1;   // Threads
+    context.t_cost = 2;    // Iterations
+    context.version = ARGON2_VERSION_10;
+
+    argon2_ctx( &context, Argon2_d );
+}
+// Dynamic nonce scan. Returns 1 with pdata[19] set to the winning nonce, else 0; *hashes_done is set on both paths.
+int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )
+{
+        uint32_t _ALIGN(64) endiandata[20];
+        uint32_t _ALIGN(64) hash[8];
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+
+        const uint32_t first_nonce = pdata[19];
+        const uint32_t Htarg = ptarget[7];
+
+        uint32_t nonce = first_nonce;
+
+        swab32_array( endiandata, pdata, 20 );
+        // Each pass writes the candidate nonce into word 19 and hashes all 20 words.
+        do {
+                be32enc(&endiandata[19], nonce);
+                argon2d_dyn_hash( hash, endiandata );
+                if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+                {
+                        pdata[19] = nonce;
+                        *hashes_done = pdata[19] - first_nonce;
+                        work_set_target_ratio(work, hash);
+                        return 1;
+                }
+                nonce++;
+        } while (nonce < max_nonce && !work_restart[thr_id].restart);
+        // Loop ended on max_nonce or a restart request without a hit.
+        pdata[19] = nonce;
+        *hashes_done = pdata[19] - first_nonce + 1;
+        return 0;
+}
+// Populates gate with the Dynamic callbacks and optimization flags; always returns true.
+bool register_argon2d_dyn_algo( algo_gate_t* gate )
+{
+        gate->optimizations = AVX512_OPT | AVX2_OPT | SSE2_OPT;
+        gate->set_target    = (void*)scrypt_set_target;
+        gate->hash          = (void*)argon2d_dyn_hash;
+        gate->scanhash      = (void*)scanhash_argon2d_dyn;
+        return true;
+}
+
+// Unitus: Argon2d nonce scan (t_cost 1, m_cost 4096, ARGON2_VERSION_13); the 80-byte header is both password and salt.
+// Returns 1 with pdata[19] set to the winning nonce, otherwise 0; *hashes_done is updated on both paths.
+int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done)
+{
+   uint32_t _ALIGN(64) vhash[8];
+   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+
+   uint32_t t_cost = 1; // 1 iteration
+   uint32_t m_cost = 4096; // use 4MB
+   uint32_t parallelism = 1; // 1 (original note claimed "1 thread, 2 lanes" -- TODO confirm lane count)
+
+   for ( int i = 0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+
+   do {
+      be32enc( &endiandata[19], n );
+      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
+                 (char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
+      // <= (not <) so a top word equal to the target's still reaches fulltest(), as in the sibling scanners.
+      if ( vhash[7] <= Htarg && fulltest( vhash, ptarget ) )
+      {
+         *hashes_done = n - first_nonce + 1;
+         pdata[19] = n;
+         return 1;
+      }
+      n++;
+   } while (n < max_nonce && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   pdata[19] = n;
+
+   return 0;
+}
+// Returns the constant 0x1ff; register_argon2d4096_algo installs it as gate->get_max64.
+int64_t get_max64_0x1ff(void) { return 0x1ff; }
+// Populates gate for Unitus (argon2d4096); no hash callback is assigned. Always returns true.
+bool register_argon2d4096_algo( algo_gate_t* gate )
+{
+        gate->optimizations = AVX512_OPT | AVX2_OPT | SSE2_OPT;
+        gate->get_max64     = (void*)get_max64_0x1ff;
+        gate->set_target    = (void*)scrypt_set_target;
+        gate->scanhash      = (void*)scanhash_argon2d4096;
+        return true;
+}
+
--- a/algo/argon2/argon2d/argon2d-gate.h
+++ b/algo/argon2/argon2d/argon2d-gate.h
@@ -0,0 +1,31 @@
+#ifndef ARGON2D_GATE_H__
+#define ARGON2D_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// Credits: version = 0x10, m_cost = 250.
+bool register_argon2d_crds_algo( algo_gate_t* gate );
+// Hashes the 80-byte header at input into a 32-byte digest written to output.
+void argon2d_crds_hash( void *output, const void *input );
+
+int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+// Dynamic: version = 0x10, m_cost = 500.
+bool register_argon2d_dyn_algo( algo_gate_t* gate );
+// Hashes the 80-byte header at input into a 32-byte digest written to output.
+void argon2d_dyn_hash( void *output, const void *input );
+
+int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+
+// Unitus: version = 0x13, m_cost = 4096.
+bool register_argon2d4096_algo( algo_gate_t* gate );
+
+int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
+#endif
+
--- a/algo/argon2/argon2d/argon2d/argon2.c
+++ b/algo/argon2/argon2d/argon2d/argon2.c
@@ -0,0 +1,458 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "argon2.h"
+#include "encoding.h"
+#include "core.h"
+
+/*
+ * Map an argon2_type to its printable name. A non-zero `uppercase` selects
+ * the spelling with a capital first letter ("Argon2d"), zero selects the
+ * all-lowercase one ("argon2d"). Returns NULL when `type` is not one of the
+ * three known variants.
+ */
+const char *argon2_type2string(argon2_type type, int uppercase) {
+    const char *name = NULL;
+
+    if (type == Argon2_d) {
+        name = uppercase ? "Argon2d" : "argon2d";
+    } else if (type == Argon2_i) {
+        name = uppercase ? "Argon2i" : "argon2i";
+    } else if (type == Argon2_id) {
+        name = uppercase ? "Argon2id" : "argon2id";
+    }
+
+    return name;
+}
+
+/*
+ * Run one complete Argon2 computation described by `context`, using the
+ * variant selected by `type`.
+ *
+ * Returns ARGON2_OK on success. Otherwise returns the error reported by
+ * validate_inputs(), initialize() or fill_memory_blocks(), or
+ * ARGON2_INCORRECT_TYPE when `type` is not Argon2_d, Argon2_i or Argon2_id.
+ */
+int argon2_ctx(argon2_context *context, argon2_type type) {
+    argon2_instance_t instance;
+    uint32_t min_blocks, total_segments, segment_length, memory_blocks;
+    int result;
+
+    /* Step 1: parameter validation comes before anything else. */
+    result = validate_inputs(context);
+    if (result != ARGON2_OK) {
+        return result;
+    }
+
+    switch (type) {
+    case Argon2_d:
+    case Argon2_i:
+    case Argon2_id:
+        break;
+    default:
+        return ARGON2_INCORRECT_TYPE;
+    }
+
+    /* Step 2: size the memory. At least 8 blocks per lane are used, and the
+     * total is rounded down so that every segment has the same length. */
+    min_blocks = 2 * ARGON2_SYNC_POINTS * context->lanes;
+    total_segments = context->lanes * ARGON2_SYNC_POINTS;
+
+    memory_blocks = context->m_cost;
+    if (memory_blocks < min_blocks) {
+        memory_blocks = min_blocks;
+    }
+    segment_length = memory_blocks / total_segments;
+    memory_blocks = segment_length * total_segments;
+
+    /* Describe the run; the block memory itself is not allocated yet. */
+    instance.type = type;
+    instance.version = context->version;
+    instance.passes = context->t_cost;
+    instance.memory = NULL;
+    instance.memory_blocks = memory_blocks;
+    instance.segment_length = segment_length;
+    instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
+    instance.lanes = context->lanes;
+    instance.threads = context->threads;
+
+    /* Never use more threads than there are lanes. */
+    if (instance.threads > instance.lanes) {
+        instance.threads = instance.lanes;
+    }
+
+    /* Step 3: hash the inputs, allocate memory, fill the first blocks.
+     * Step 4: fill the remaining memory.
+     * NOTE(review): when step 4 fails this function returns without an
+     * explicit release of whatever step 3 allocated - confirm whether
+     * fill_memory_blocks() cleans up on its own failure path. */
+    result = initialize(&instance, context);
+    if (result == ARGON2_OK) {
+        result = fill_memory_blocks(&instance);
+    }
+    if (result != ARGON2_OK) {
+        return result;
+    }
+
+    /* Step 5: produce the output. */
+    finalize(context, &instance);
+
+    return ARGON2_OK;
+}
+
+/*
+ * Generic hashing front end behind the argon2{i,d,id}_hash_* wrappers.
+ *
+ * Checks the password, salt and output lengths, computes the hash into a
+ * temporary heap buffer via argon2_ctx(), then copies it to `hash` (when
+ * non-NULL) and/or writes the encoded string to `encoded` (when non-NULL and
+ * `encodedlen` is non-zero). The temporary buffer is wiped with
+ * clear_internal_memory() and freed on every path that allocated it.
+ *
+ * Returns ARGON2_OK on success, one of the ARGON2_*_TOO_LONG/SHORT codes for
+ * a bad length, ARGON2_MEMORY_ALLOCATION_ERROR when the temporary buffer
+ * cannot be allocated, the argon2_ctx() error if hashing fails, or
+ * ARGON2_ENCODING_FAIL if encode_string() fails (in which case `encoded` is
+ * wiped too).
+ */
+int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
+                const uint32_t parallelism, const void *pwd,
+                const size_t pwdlen, const void *salt, const size_t saltlen,
+                void *hash, const size_t hashlen, char *encoded,
+                const size_t encodedlen, argon2_type type,
+                const uint32_t version){
+    argon2_context context;
+    uint8_t *digest;
+    int status;
+
+    /* All length limits are enforced before anything is allocated. */
+    if (pwdlen > ARGON2_MAX_PWD_LENGTH) {
+        return ARGON2_PWD_TOO_LONG;
+    }
+    if (saltlen > ARGON2_MAX_SALT_LENGTH) {
+        return ARGON2_SALT_TOO_LONG;
+    }
+    if (hashlen > ARGON2_MAX_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_LONG;
+    }
+    if (hashlen < ARGON2_MIN_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_SHORT;
+    }
+
+    digest = malloc(hashlen);
+    if (digest == NULL) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    /* Inputs and output */
+    context.out = digest;
+    context.outlen = (uint32_t)hashlen;
+    context.pwd = CONST_CAST(uint8_t *)pwd;
+    context.pwdlen = (uint32_t)pwdlen;
+    context.salt = CONST_CAST(uint8_t *)salt;
+    context.saltlen = (uint32_t)saltlen;
+
+    /* No secret key and no associated data */
+    context.secret = NULL;
+    context.secretlen = 0;
+    context.ad = NULL;
+    context.adlen = 0;
+
+    /* Cost parameters: lanes and threads both follow `parallelism` */
+    context.t_cost = t_cost;
+    context.m_cost = m_cost;
+    context.lanes = parallelism;
+    context.threads = parallelism;
+    context.version = version;
+
+    /* Default allocator, default flags */
+    context.allocate_cbk = NULL;
+    context.free_cbk = NULL;
+    context.flags = ARGON2_DEFAULT_FLAGS;
+
+    status = argon2_ctx(&context, type);
+
+    if (status == ARGON2_OK) {
+        /* Raw hash, if the caller asked for it. */
+        if (hash) {
+            memcpy(hash, digest, hashlen);
+        }
+
+        /* Encoded string, if the caller asked for it. On failure the
+         * partially written encoding is wiped as well. */
+        if (encoded && encodedlen &&
+            encode_string(encoded, encodedlen, &context, type) != ARGON2_OK) {
+            clear_internal_memory(encoded, encodedlen);
+            status = ARGON2_ENCODING_FAIL;
+        }
+    }
+
+    /* Single exit: the temporary digest never outlives this call. */
+    clear_internal_memory(digest, hashlen);
+    free(digest);
+
+    return status;
+}
+
+/* Argon2i, encoded output: forwards to argon2_hash() with a NULL raw-hash
+ * pointer and the variant fixed to Argon2_i. */
+int argon2i_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
+                         const uint32_t parallelism, const void *pwd,
+                         const size_t pwdlen, const void *salt,
+                         const size_t saltlen, const size_t hashlen,
+                         char *encoded, const size_t encodedlen,
+                         const uint32_t version) {
+    void *const no_raw_hash = NULL;
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       no_raw_hash, hashlen, encoded, encodedlen, Argon2_i,
+                       version);
+}
+
+/* Argon2i, raw output: forwards to argon2_hash() with no encoding buffer
+ * (NULL, length 0) and the variant fixed to Argon2_i. */
+int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                     const uint32_t parallelism, const void *pwd,
+                     const size_t pwdlen, const void *salt,
+                     const size_t saltlen, void *hash, const size_t hashlen,
+                     const uint32_t version ) {
+    char *const no_encoding = NULL;
+    const size_t no_encoding_len = 0;
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       hash, hashlen, no_encoding, no_encoding_len, Argon2_i,
+                       version);
+}
+
+/* Argon2d, encoded output: forwards to argon2_hash() with a NULL raw-hash
+ * pointer and the variant fixed to Argon2_d. */
+int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
+                         const uint32_t parallelism, const void *pwd,
+                         const size_t pwdlen, const void *salt,
+                         const size_t saltlen, const size_t hashlen,
+                         char *encoded, const size_t encodedlen,
+                         const uint32_t version ) {
+    void *const no_raw_hash = NULL;
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       no_raw_hash, hashlen, encoded, encodedlen, Argon2_d,
+                       version);
+}
+
+/* Argon2d, raw output: forwards to argon2_hash() with no encoding buffer
+ * (NULL, length 0) and the variant fixed to Argon2_d. */
+int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                     const uint32_t parallelism, const void *pwd,
+                     const size_t pwdlen, const void *salt,
+                     const size_t saltlen, void *hash, const size_t hashlen,
+                     const uint32_t version ) {
+    char *const no_encoding = NULL;
+    const size_t no_encoding_len = 0;
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       hash, hashlen, no_encoding, no_encoding_len, Argon2_d,
+                       version);
+}
+
+/* Argon2id, encoded output: forwards to argon2_hash() with a NULL raw-hash
+ * pointer and the variant fixed to Argon2_id. */
+int argon2id_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
+                          const uint32_t parallelism, const void *pwd,
+                          const size_t pwdlen, const void *salt,
+                          const size_t saltlen, const size_t hashlen,
+                          char *encoded, const size_t encodedlen,
+                          const uint32_t version ) {
+    void *const no_raw_hash = NULL;
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       no_raw_hash, hashlen, encoded, encodedlen, Argon2_id,
+                       version);
+}
+
+/* Argon2id, raw output: forwards to argon2_hash() with no encoding buffer
+ * (NULL, length 0) and the variant fixed to Argon2_id. */
+int argon2id_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                      const uint32_t parallelism, const void *pwd,
+                      const size_t pwdlen, const void *salt,
+                      const size_t saltlen, void *hash, const size_t hashlen,
+                      const uint32_t version ) {
+    char *const no_encoding = NULL;
+    const size_t no_encoding_len = 0;
+
+    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
+                       hash, hashlen, no_encoding, no_encoding_len, Argon2_id,
+                       version);
+}
+
+/*
+ * Compare two byte buffers of `len` bytes. Every byte is visited (there is
+ * no early exit), and the differences are OR-ed into a single accumulator.
+ *
+ * Returns 0 when the buffers are equal (including len == 0) and -1 when
+ * they differ in at least one byte.
+ */
+static int argon2_compare(const uint8_t *b1, const uint8_t *b2, size_t len) {
+    size_t i;
+    uint8_t d = 0U;
+
+    for (i = 0U; i < len; i++) {
+        d |= b1[i] ^ b2[i];
+    }
+
+    /* Branch-free mapping of d to the result. In unsigned arithmetic
+     * d == 0 wraps to UINT_MAX, which has bit 8 set; any d in 1..255 gives
+     * a value below 256, which has bit 8 clear. Doing the subtraction on an
+     * unsigned value avoids right-shifting a negative int, whose result is
+     * implementation-defined. */
+    return (int)(1U & (((unsigned)d - 1U) >> 8)) - 1;
+}
+
+/*
+ * Verify a password against an encoded hash string.
+ *
+ * Decodes `encoded` with decode_string() into a temporary context, recomputes
+ * the hash for `pwd` with the decoded parameters, and compares the result
+ * with the decoded hash through argon2_verify_ctx().
+ *
+ * Returns ARGON2_OK when the password matches. Returns ARGON2_PWD_TOO_LONG
+ * for an over-long password, ARGON2_DECODING_FAIL for a NULL, empty or
+ * over-long encoded string, ARGON2_MEMORY_ALLOCATION_ERROR when a temporary
+ * buffer cannot be allocated, and otherwise whatever decode_string() or
+ * argon2_verify_ctx() report (e.g. ARGON2_VERIFY_MISMATCH).
+ *
+ * All temporary buffers are released before returning.
+ */
+int argon2_verify(const char *encoded, const void *pwd, const size_t pwdlen,
+                  argon2_type type) {
+
+    argon2_context ctx;
+    uint8_t *desired_result = NULL;
+
+    int ret = ARGON2_OK;
+
+    size_t encoded_len;
+    uint32_t max_field_len;
+
+    if (pwdlen > ARGON2_MAX_PWD_LENGTH) {
+        return ARGON2_PWD_TOO_LONG;
+    }
+
+    if (encoded == NULL) {
+        return ARGON2_DECODING_FAIL;
+    }
+
+    encoded_len = strlen(encoded);
+    /* An empty string cannot carry an encoded hash. Reject it here rather
+     * than below: malloc(0) is allowed to return NULL, which would be
+     * misreported as a memory allocation error. */
+    if (encoded_len == 0 || encoded_len > UINT32_MAX) {
+        return ARGON2_DECODING_FAIL;
+    }
+
+    /* No field can be longer than the encoded length */
+    max_field_len = (uint32_t)encoded_len;
+
+    ctx.saltlen = max_field_len;
+    ctx.outlen = max_field_len;
+
+    ctx.salt = malloc(ctx.saltlen);
+    ctx.out = malloc(ctx.outlen);
+    if (!ctx.salt || !ctx.out) {
+        ret = ARGON2_MEMORY_ALLOCATION_ERROR;
+        goto done;
+    }
+
+    ctx.pwd = (uint8_t *)pwd;
+    ctx.pwdlen = (uint32_t)pwdlen;
+
+    ret = decode_string(&ctx, encoded, type);
+    if (ret != ARGON2_OK) {
+        goto done;
+    }
+
+    /* Set aside the desired result, and get a new buffer. */
+    desired_result = ctx.out;
+    ctx.out = malloc(ctx.outlen);
+    if (!ctx.out) {
+        ret = ARGON2_MEMORY_ALLOCATION_ERROR;
+        goto done;
+    }
+
+    ret = argon2_verify_ctx(&ctx, (char *)desired_result, type);
+
+done:
+    /* Shared exit for success and failure; free(NULL) is a no-op, so
+     * buffers that were never allocated need no special casing. */
+    free(ctx.salt);
+    free(ctx.out);
+    free(desired_result);
+
+    return ret;
+}
+
+/* Argon2i front end for argon2_verify(). */
+int argon2i_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
+    const argon2_type variant = Argon2_i;
+
+    return argon2_verify(encoded, pwd, pwdlen, variant);
+}
+
+/* Argon2d front end for argon2_verify(). */
+int argon2d_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
+    const argon2_type variant = Argon2_d;
+
+    return argon2_verify(encoded, pwd, pwdlen, variant);
+}
+
+/* Argon2id front end for argon2_verify(). */
+int argon2id_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
+    const argon2_type variant = Argon2_id;
+
+    return argon2_verify(encoded, pwd, pwdlen, variant);
+}
+
+/* Argon2d entry point: argon2_ctx() with the variant fixed to Argon2_d. */
+int argon2d_ctx(argon2_context *context) {
+    const argon2_type variant = Argon2_d;
+
+    return argon2_ctx(context, variant);
+}
+
+/* Argon2i entry point: argon2_ctx() with the variant fixed to Argon2_i. */
+int argon2i_ctx(argon2_context *context) {
+    const argon2_type variant = Argon2_i;
+
+    return argon2_ctx(context, variant);
+}
+
+/* Argon2id entry point: argon2_ctx() with the variant fixed to Argon2_id. */
+int argon2id_ctx(argon2_context *context) {
+    const argon2_type variant = Argon2_id;
+
+    return argon2_ctx(context, variant);
+}
+
+/*
+ * Recompute the hash described by `context` with argon2_ctx() and compare
+ * the first context->outlen bytes of `hash` with the freshly computed
+ * context->out.
+ *
+ * Returns the argon2_ctx() error if hashing fails, ARGON2_VERIFY_MISMATCH
+ * if the two hashes differ, and ARGON2_OK if they are equal.
+ */
+int argon2_verify_ctx(argon2_context *context, const char *hash,
+                      argon2_type type) {
+    const int status = argon2_ctx(context, type);
+
+    if (status != ARGON2_OK) {
+        return status;
+    }
+
+    return argon2_compare((uint8_t *)hash, context->out, context->outlen) == 0
+               ? ARGON2_OK
+               : ARGON2_VERIFY_MISMATCH;
+}
+
+/* Argon2d front end for argon2_verify_ctx(). */
+int argon2d_verify_ctx(argon2_context *context, const char *hash) {
+    const argon2_type variant = Argon2_d;
+
+    return argon2_verify_ctx(context, hash, variant);
+}
+
+/* Argon2i front end for argon2_verify_ctx(). */
+int argon2i_verify_ctx(argon2_context *context, const char *hash) {
+    const argon2_type variant = Argon2_i;
+
+    return argon2_verify_ctx(context, hash, variant);
+}
+
+/* Argon2id front end for argon2_verify_ctx(). */
+int argon2id_verify_ctx(argon2_context *context, const char *hash) {
+    const argon2_type variant = Argon2_id;
+
+    return argon2_verify_ctx(context, hash, variant);
+}
+
+/*
+ * Translate an Argon2 error code into a human-readable message.
+ * Codes that are not listed in the table yield "Unknown error code".
+ * The returned string is a literal and must not be modified or freed.
+ */
+const char *argon2_error_message(int error_code) {
+    static const struct {
+        int code;
+        const char *text;
+    } messages[] = {
+        {ARGON2_OK, "OK"},
+        {ARGON2_OUTPUT_PTR_NULL, "Output pointer is NULL"},
+        {ARGON2_OUTPUT_TOO_SHORT, "Output is too short"},
+        {ARGON2_OUTPUT_TOO_LONG, "Output is too long"},
+        {ARGON2_PWD_TOO_SHORT, "Password is too short"},
+        {ARGON2_PWD_TOO_LONG, "Password is too long"},
+        {ARGON2_SALT_TOO_SHORT, "Salt is too short"},
+        {ARGON2_SALT_TOO_LONG, "Salt is too long"},
+        {ARGON2_AD_TOO_SHORT, "Associated data is too short"},
+        {ARGON2_AD_TOO_LONG, "Associated data is too long"},
+        {ARGON2_SECRET_TOO_SHORT, "Secret is too short"},
+        {ARGON2_SECRET_TOO_LONG, "Secret is too long"},
+        {ARGON2_TIME_TOO_SMALL, "Time cost is too small"},
+        {ARGON2_TIME_TOO_LARGE, "Time cost is too large"},
+        {ARGON2_MEMORY_TOO_LITTLE, "Memory cost is too small"},
+        {ARGON2_MEMORY_TOO_MUCH, "Memory cost is too large"},
+        {ARGON2_LANES_TOO_FEW, "Too few lanes"},
+        {ARGON2_LANES_TOO_MANY, "Too many lanes"},
+        {ARGON2_PWD_PTR_MISMATCH,
+         "Password pointer is NULL, but password length is not 0"},
+        {ARGON2_SALT_PTR_MISMATCH,
+         "Salt pointer is NULL, but salt length is not 0"},
+        {ARGON2_SECRET_PTR_MISMATCH,
+         "Secret pointer is NULL, but secret length is not 0"},
+        {ARGON2_AD_PTR_MISMATCH,
+         "Associated data pointer is NULL, but ad length is not 0"},
+        {ARGON2_MEMORY_ALLOCATION_ERROR, "Memory allocation error"},
+        {ARGON2_FREE_MEMORY_CBK_NULL, "The free memory callback is NULL"},
+        {ARGON2_ALLOCATE_MEMORY_CBK_NULL,
+         "The allocate memory callback is NULL"},
+        {ARGON2_INCORRECT_PARAMETER, "Argon2_Context context is NULL"},
+        {ARGON2_INCORRECT_TYPE, "There is no such version of Argon2"},
+        {ARGON2_OUT_PTR_MISMATCH, "Output pointer mismatch"},
+        {ARGON2_THREADS_TOO_FEW, "Not enough threads"},
+        {ARGON2_THREADS_TOO_MANY, "Too many threads"},
+        {ARGON2_MISSING_ARGS, "Missing arguments"},
+        {ARGON2_ENCODING_FAIL, "Encoding failed"},
+        {ARGON2_DECODING_FAIL, "Decoding failed"},
+        {ARGON2_THREAD_FAIL, "Threading failure"},
+        {ARGON2_DECODING_LENGTH_FAIL,
+         "Some of encoded parameters are too long or too short"},
+        {ARGON2_VERIFY_MISMATCH,
+         "The password does not match the supplied hash"},
+    };
+    size_t idx;
+
+    for (idx = 0; idx < sizeof(messages) / sizeof(messages[0]); ++idx) {
+        if (messages[idx].code == error_code) {
+            return messages[idx].text;
+        }
+    }
+
+    return "Unknown error code";
+}
+/*
+size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, uint32_t parallelism,
+                         uint32_t saltlen, uint32_t hashlen, argon2_type type) {
+  return strlen("$$v=$m=,t=,p=$$") + strlen(argon2_type2string(type, 0)) +
+         numlen(t_cost) + numlen(m_cost) + numlen(parallelism) +
+         b64len(saltlen) + b64len(hashlen) + numlen(ARGON2_VERSION_NUMBER) + 1;
+}
+*/
--- a/algo/argon2/argon2d/argon2d/argon2.h
+++ b/algo/argon2/argon2d/argon2d/argon2.h
@@ -0,0 +1,440 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ARGON2_H
+#define ARGON2_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <limits.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * Symbols visibility control.
+ * ARGON2_PUBLIC marks exported API symbols and ARGON2_LOCAL marks internal
+ * ones. With A2_VISCTL defined they map to visibility attributes; under MSVC
+ * ARGON2_PUBLIC maps to dllexport; otherwise both expand to nothing.
+ * _MSC_VER is tested with defined() so that compilers which do not define it
+ * are not asked to evaluate an undefined macro in an #elif expression.
+ */
+#ifdef A2_VISCTL
+#define ARGON2_PUBLIC __attribute__((visibility("default")))
+#define ARGON2_LOCAL __attribute__ ((visibility ("hidden")))
+#elif defined(_MSC_VER)
+#define ARGON2_PUBLIC __declspec(dllexport)
+#define ARGON2_LOCAL
+#else
+#define ARGON2_PUBLIC
+#define ARGON2_LOCAL
+#endif
+
+/*
+ * Argon2 input parameter restrictions
+ */
+
+/* Minimum and maximum number of lanes (degree of parallelism) */
+#define ARGON2_MIN_LANES UINT32_C(1)
+#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF)
+
+/* Minimum and maximum number of threads */
+#define ARGON2_MIN_THREADS UINT32_C(1)
+#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF)
+
+/* Number of synchronization points between lanes per pass.
+ * argon2_ctx() splits every lane into this many equal-length segments. */
+#define ARGON2_SYNC_POINTS UINT32_C(4)
+
+/* Minimum and maximum digest size in bytes */
+#define ARGON2_MIN_OUTLEN UINT32_C(4)
+#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */
+#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */
+
+/* Function-like minimum; note that each argument is evaluated twice, so it
+ * must only be used with side-effect-free expressions. */
+#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b))
+/* Max memory size is addressing-space/2, topping at 2^32 blocks (4 TB) */
+/* NOTE(review): the "- 10" presumably accounts for 1 KiB blocks and the
+ * "- 1" for the halving of the address space - confirm against BLOCK_SIZE. */
+#define ARGON2_MAX_MEMORY_BITS                                                 \
+    ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1))
+#define ARGON2_MAX_MEMORY                                                      \
+    ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS)
+
+/* Minimum and maximum number of passes */
+#define ARGON2_MIN_TIME UINT32_C(1)
+#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum password length in bytes */
+#define ARGON2_MIN_PWD_LENGTH UINT32_C(0)
+#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum associated data length in bytes */
+#define ARGON2_MIN_AD_LENGTH UINT32_C(0)
+#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum salt length in bytes */
+#define ARGON2_MIN_SALT_LENGTH UINT32_C(8)
+#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF)
+
+/* Minimum and maximum key length in bytes */
+#define ARGON2_MIN_SECRET UINT32_C(0)
+#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF)
+
+/* Flags to determine which fields are securely wiped (default = no wipe).
+ * These are bit values meant to be OR-ed into argon2_context.flags. */
+#define ARGON2_DEFAULT_FLAGS UINT32_C(0)
+#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0)
+#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1)
+
+/* Global flag to determine if we are wiping internal memory buffers. This flag
+ * is defined in core.c and defaults to 1 (wipe internal memory). */
+extern int FLAG_clear_internal_memory;
+
+/* Error codes.
+ * ARGON2_OK is 0 and every failure code is a distinct negative value.
+ * argon2_error_message() maps each code to a descriptive string. */
+typedef enum Argon2_ErrorCodes {
+    ARGON2_OK = 0,
+
+    ARGON2_OUTPUT_PTR_NULL = -1,
+
+    /* Length and range violations of the individual inputs */
+    ARGON2_OUTPUT_TOO_SHORT = -2,
+    ARGON2_OUTPUT_TOO_LONG = -3,
+
+    ARGON2_PWD_TOO_SHORT = -4,
+    ARGON2_PWD_TOO_LONG = -5,
+
+    ARGON2_SALT_TOO_SHORT = -6,
+    ARGON2_SALT_TOO_LONG = -7,
+
+    ARGON2_AD_TOO_SHORT = -8,
+    ARGON2_AD_TOO_LONG = -9,
+
+    ARGON2_SECRET_TOO_SHORT = -10,
+    ARGON2_SECRET_TOO_LONG = -11,
+
+    ARGON2_TIME_TOO_SMALL = -12,
+    ARGON2_TIME_TOO_LARGE = -13,
+
+    ARGON2_MEMORY_TOO_LITTLE = -14,
+    ARGON2_MEMORY_TOO_MUCH = -15,
+
+    ARGON2_LANES_TOO_FEW = -16,
+    ARGON2_LANES_TOO_MANY = -17,
+
+    ARGON2_PWD_PTR_MISMATCH = -18,    /* NULL ptr with non-zero length */
+    ARGON2_SALT_PTR_MISMATCH = -19,   /* NULL ptr with non-zero length */
+    ARGON2_SECRET_PTR_MISMATCH = -20, /* NULL ptr with non-zero length */
+    ARGON2_AD_PTR_MISMATCH = -21,     /* NULL ptr with non-zero length */
+
+    ARGON2_MEMORY_ALLOCATION_ERROR = -22,
+
+    /* A required allocator callback is NULL */
+    ARGON2_FREE_MEMORY_CBK_NULL = -23,
+    ARGON2_ALLOCATE_MEMORY_CBK_NULL = -24,
+
+    ARGON2_INCORRECT_PARAMETER = -25, /* reported as "context is NULL" */
+    ARGON2_INCORRECT_TYPE = -26,      /* type is not Argon2_d/_i/_id */
+
+    ARGON2_OUT_PTR_MISMATCH = -27,
+
+    ARGON2_THREADS_TOO_FEW = -28,
+    ARGON2_THREADS_TOO_MANY = -29,
+
+    ARGON2_MISSING_ARGS = -30,
+
+    ARGON2_ENCODING_FAIL = -31,
+
+    ARGON2_DECODING_FAIL = -32,
+
+    ARGON2_THREAD_FAIL = -33,
+
+    ARGON2_DECODING_LENGTH_FAIL = -34,
+
+    /* Computed hash differs from the supplied one (see argon2_verify_ctx) */
+    ARGON2_VERIFY_MISMATCH = -35
+} argon2_error_codes;
+
+/* Memory allocator types --- for external allocation.
+ * allocate_fptr receives the address of the pointer to fill in and the
+ * number of bytes wanted; deallocate_fptr receives the memory and the same
+ * byte count. Both are optional members of argon2_context.
+ * NOTE(review): the meaning of allocate_fptr's int return value is not
+ * visible here - presumably 0 on success; confirm in core.c. */
+typedef int (*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate);
+typedef void (*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate);
+
+/* Argon2 external data structures */
+
+/*
+ *****
+ * Context: structure to hold Argon2 inputs:
+ *  output array and its length,
+ *  password and its length,
+ *  salt and its length,
+ *  secret and its length,
+ *  associated data and its length,
+ *  number of passes, amount of used memory (in KBytes, can be adjusted a bit
+ *  so that all segments have equal length),
+ *  number of lanes and number of parallel threads that will be run,
+ *  version number of the algorithm.
+ * All the parameters above affect the output hash value.
+ * Additionally, two function pointers can be provided to allocate and
+ * deallocate the memory (if NULL, memory will be allocated internally).
+ * Also, the flags field indicates whether to erase the password and/or the
+ * secret as soon as they are pre-hashed (and thus not needed anymore); wiping
+ * of the entire internal memory is controlled separately by the global
+ * FLAG_clear_internal_memory.
+ *****
+ * Simplest situation: you have output array out[8], password is stored in
+ * pwd[32], salt is stored in salt[16], you do not have keys nor associated
+ * data. You need to spend 1 GB of RAM and you run 5 passes of Argon2d with
+ * 4 parallel lanes.
+ * You want to erase the password, but you're OK with last pass not being
+ * erased. You want to use the default memory allocator.
+ * Then you initialize (fields in declaration order, here with version 0x13):
+ argon2_context ctx = {out, 8, pwd, 32, salt, 16, NULL, 0, NULL, 0, 5, 1 << 20, 4, 4, ARGON2_VERSION_13, NULL, NULL, ARGON2_FLAG_CLEAR_PASSWORD};
+ */
+typedef struct Argon2_Context {
+    uint8_t *out;    /* output array: receives the digest */
+    uint32_t outlen; /* digest length in bytes */
+
+    uint8_t *pwd;    /* password array */
+    uint32_t pwdlen; /* password length in bytes */
+
+    uint8_t *salt;    /* salt array */
+    uint32_t saltlen; /* salt length in bytes */
+
+    uint8_t *secret;    /* key array (may be NULL with secretlen 0) */
+    uint32_t secretlen; /* key length in bytes */
+
+    uint8_t *ad;    /* associated data array (may be NULL with adlen 0) */
+    uint32_t adlen; /* associated data length in bytes */
+
+    uint32_t t_cost;  /* number of passes */
+    uint32_t m_cost;  /* amount of memory requested (KB); argon2_ctx() uses
+                         this value as the number of memory blocks */
+    uint32_t lanes;   /* number of lanes */
+    uint32_t threads; /* maximum number of threads; argon2_ctx() never uses
+                         more threads than lanes */
+
+    uint32_t version; /* version number, e.g. ARGON2_VERSION_10 or
+                         ARGON2_VERSION_13 */
+
+    allocate_fptr allocate_cbk; /* pointer to memory allocator */
+    deallocate_fptr free_cbk;   /* pointer to memory deallocator */
+
+    uint32_t flags; /* bitwise OR of ARGON2_FLAG_* options */
+} argon2_context;
+
+/* Argon2 primitive type: selects which variant of the algorithm is run.
+ * argon2_type2string() gives the printable name of each value. */
+typedef enum Argon2_type {
+  Argon2_d = 0,
+  Argon2_i = 1,
+  Argon2_id = 2
+} argon2_type;
+
+/* Version of the algorithm.
+ * Passed as the `version` argument of the hashing functions and stored in
+ * argon2_context.version. */
+#define ARGON2_VERSION_10 0x10
+#define ARGON2_VERSION_13 0x13
+
+/*
+ * Function that gives the string representation of an argon2_type.
+ * @param type The argon2_type that we want the string for
+ * @param uppercase Whether the string should have the first letter uppercase
+ * @return NULL if invalid type, otherwise the string representation.
+ */
+ARGON2_PUBLIC const char *argon2_type2string(argon2_type type, int uppercase);
+
+/*
+ * Function that performs memory-hard hashing with certain degree of parallelism
+ * @param  context  Pointer to the Argon2 internal structure
+ * @param  type     Variant to run (Argon2_d, Argon2_i or Argon2_id); any
+ *                  other value yields ARGON2_INCORRECT_TYPE
+ * @return Error code if something is wrong, ARGON2_OK otherwise
+ */
+ARGON2_PUBLIC int argon2_ctx(argon2_context *context, argon2_type type);
+
+/**
+ * Hashes a password with Argon2i, producing an encoded hash
+ * @param t_cost Number of iterations
+ * @param m_cost Sets memory usage to m_cost kibibytes
+ * @param parallelism Number of threads and compute lanes
+ * @param pwd Pointer to password
+ * @param pwdlen Password size in bytes
+ * @param salt Pointer to salt
+ * @param saltlen Salt size in bytes
+ * @param hashlen Desired length of the hash in bytes
+ * @param encoded Buffer where to write the encoded hash
+ * @param encodedlen Size of the buffer (thus max size of the encoded hash)
+ * @param version Algorithm version number (ARGON2_VERSION_10 or
+ *                ARGON2_VERSION_13)
+ * @pre   Different parallelism levels will give different results
+ * @return ARGON2_OK if successful, an error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost,
+                                       const uint32_t m_cost,
+                                       const uint32_t parallelism,
+                                       const void *pwd, const size_t pwdlen,
+                                       const void *salt, const size_t saltlen,
+                                       const size_t hashlen, char *encoded,
+                                       const size_t encodedlen,
+                                       const uint32_t version );
+
+/**
+ * Hashes a password with Argon2i, producing a raw hash at @hash
+ * @param t_cost Number of iterations
+ * @param m_cost Sets memory usage to m_cost kibibytes
+ * @param parallelism Number of threads and compute lanes
+ * @param pwd Pointer to password
+ * @param pwdlen Password size in bytes
+ * @param salt Pointer to salt
+ * @param saltlen Salt size in bytes
+ * @param hash Buffer where to write the raw hash - updated by the function
+ * @param hashlen Desired length of the hash in bytes
+ * @param version Algorithm version number (ARGON2_VERSION_10 or
+ *                ARGON2_VERSION_13)
+ * @pre   Different parallelism levels will give different results
+ * @return ARGON2_OK if successful, an error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                                   const uint32_t parallelism, const void *pwd,
+                                   const size_t pwdlen, const void *salt,
+                                   const size_t saltlen, void *hash,
+                                   const size_t hashlen,
+                                   const uint32_t version );
+
+/* Argon2d counterpart of argon2i_hash_encoded(): same parameters. */
+ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
+                                       const uint32_t m_cost,
+                                       const uint32_t parallelism,
+                                       const void *pwd, const size_t pwdlen,
+                                       const void *salt, const size_t saltlen,
+                                       const size_t hashlen, char *encoded,
+                                       const size_t encodedlen,
+                                       const uint32_t version );
+
+/* Argon2d counterpart of argon2i_hash_raw(): same parameters. */
+ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
+                                   const uint32_t parallelism, const void *pwd,
+                                   const size_t pwdlen, const void *salt,
+                                   const size_t saltlen, void *hash,
+                                   const size_t hashlen,
+                                   const uint32_t version );
+
+/* Argon2id counterpart of argon2i_hash_encoded(): same parameters. */
+ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
+                                        const uint32_t m_cost,
+                                        const uint32_t parallelism,
+                                        const void *pwd, const size_t pwdlen,
+                                        const void *salt, const size_t saltlen,
+                                        const size_t hashlen, char *encoded,
+                                        const size_t encodedlen,
+                                        const uint32_t version );
+
+/* Argon2id counterpart of argon2i_hash_raw(): same parameters. */
+ARGON2_PUBLIC int argon2id_hash_raw(const uint32_t t_cost,
+                                    const uint32_t m_cost,
+                                    const uint32_t parallelism, const void *pwd,
+                                    const size_t pwdlen, const void *salt,
+                                    const size_t saltlen, void *hash,
+                                    const size_t hashlen,
+                                    const uint32_t version );
+
+/* generic function underlying the above ones.
+ * `hash` may be NULL when only the encoded form is wanted; `encoded` may be
+ * NULL (or `encodedlen` 0) when only the raw hash is wanted. `type` selects
+ * the variant. */
+ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
+                              const uint32_t parallelism, const void *pwd,
+                              const size_t pwdlen, const void *salt,
+                              const size_t saltlen, void *hash,
+                              const size_t hashlen, char *encoded,
+                              const size_t encodedlen, argon2_type type,
+                              const uint32_t version );
+
+/**
+ * Verifies a password against an encoded string
+ * Encoded string is restricted as in validate_inputs()
+ * @param encoded String encoding parameters, salt, hash
+ * @param pwd Pointer to password
+ * @param pwdlen Password size in bytes
+ * @return ARGON2_OK if successful, an error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_verify(const char *encoded, const void *pwd,
+                                 const size_t pwdlen);
+
+/* Argon2d counterpart of argon2i_verify(). */
+ARGON2_PUBLIC int argon2d_verify(const char *encoded, const void *pwd,
+                                 const size_t pwdlen);
+
+/* Argon2id counterpart of argon2i_verify(). */
+ARGON2_PUBLIC int argon2id_verify(const char *encoded, const void *pwd,
+                                  const size_t pwdlen);
+
+/* generic function underlying the above ones; `type` selects the variant */
+ARGON2_PUBLIC int argon2_verify(const char *encoded, const void *pwd,
+                                const size_t pwdlen, argon2_type type);
+
+/**
+ * Argon2d: Version of Argon2 that picks memory blocks depending
+ * on the password and salt. Only for side-channel-free
+ * environment!!
+ *****
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2d_ctx(argon2_context *context);
+
+/**
+ * Argon2i: Version of Argon2 that picks memory blocks
+ * independent of the password and salt. Good for side-channels,
+ * but worse w.r.t. tradeoff attacks if only one pass is used.
+ *****
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_ctx(argon2_context *context);
+
+/**
+ * Argon2id: Version of Argon2 where the first half-pass over memory is
+ * password-independent, the rest are password-dependent (on the password and
+ * salt). OK against side channels (they reduce to 1/2-pass Argon2i), and
+ * better w.r.t. tradeoff attacks (similar to Argon2d).
+ *****
+ * @param  context  Pointer to current Argon2 context
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2id_ctx(argon2_context *context);
+
+/**
+ * Verify if a given password is correct for Argon2d hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify. The length of the hash is
+ * specified by the context outlen member
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2d_verify_ctx(argon2_context *context, const char *hash);
+
+/**
+ * Verify if a given password is correct for Argon2i hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify. The length of the hash is
+ * specified by the context outlen member
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2i_verify_ctx(argon2_context *context, const char *hash);
+
+/**
+ * Verify if a given password is correct for Argon2id hashing
+ * @param  context  Pointer to current Argon2 context
+ * @param  hash  The password hash to verify. The length of the hash is
+ * specified by the context outlen member
+ * @return  Zero if successful, a non zero error code otherwise
+ */
+ARGON2_PUBLIC int argon2id_verify_ctx(argon2_context *context,
+                                      const char *hash);
+
+/* generic function underlying the above ones; `type` selects the variant.
+ * Returns ARGON2_VERIFY_MISMATCH when the recomputed hash differs. */
+ARGON2_PUBLIC int argon2_verify_ctx(argon2_context *context, const char *hash,
+                                    argon2_type type);
+
+/**
+ * Get the associated error message for given error code
+ * @param error_code  One of the argon2_error_codes values
+ * @return  The error message associated with the given error code, or
+ * "Unknown error code" for a value that is not a known code
+ */
+ARGON2_PUBLIC const char *argon2_error_message(int error_code);
+
+/**
+ * Returns the encoded hash length for the given input parameters
+ * @param t_cost  Number of iterations
+ * @param m_cost  Memory usage in kibibytes
+ * @param parallelism  Number of threads; used to compute lanes
+ * @param saltlen  Salt size in bytes
+ * @param hashlen  Hash size in bytes
+ * @param type The argon2_type that we want the encoded length for
+ * @return  The encoded hash length in bytes
+ *
+ * NOTE(review): the definition of this function in argon2.c is commented
+ * out, so a call will not link unless another definition exists elsewhere.
+ */
+ARGON2_PUBLIC size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost,
+                                       uint32_t parallelism, uint32_t saltlen,
+                                       uint32_t hashlen, argon2_type type);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -0,0 +1,635 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+/*For memory wiping*/
+#ifdef _MSC_VER
+#include <windows.h>
+#include <winbase.h> /* For SecureZeroMemory */
+#endif
+#if defined __STDC_LIB_EXT1__
+#define __STDC_WANT_LIB_EXT1__ 1
+#endif
+#define VC_GE_2005(version) (version >= 1400)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "core.h"
+#include "thread.h"
+#include "../blake2/blake2.h"
+#include "../blake2/blake2-impl.h"
+
+#ifdef GENKAT
+#include "genkat.h"
+#endif
+
+#if defined(__clang__)
+#if __has_attribute(optnone)
+#define NOT_OPTIMIZED __attribute__((optnone))
+#endif
+#elif defined(__GNUC__)
+#define GCC_VERSION                                                            \
+    (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#if GCC_VERSION >= 40400
+#define NOT_OPTIMIZED __attribute__((optimize("O0")))
+#endif
+#endif
+#ifndef NOT_OPTIMIZED
+#define NOT_OPTIMIZED
+#endif
+
+/***************Instance and Position constructors**********/
+/* Set every byte of block @b to the value @in. */
+void init_block_value(block *b, uint8_t in) {
+    memset(b->v, in, sizeof b->v);
+}
+
+/* Copy all ARGON2_QWORDS_IN_BLOCK 64-bit words of @src into @dst. */
+void copy_block(block *dst, const block *src) {
+    memcpy(dst->v, src->v, sizeof dst->v);
+}
+
+/* XOR block @src into block @dst, one 64-bit word at a time. */
+void xor_block(block *dst, const block *src) {
+    uint64_t *out = dst->v;
+    const uint64_t *in = src->v;
+    const uint64_t *const end = in + ARGON2_QWORDS_IN_BLOCK;
+
+    while (in != end) {
+        *out++ ^= *in++;
+    }
+}
+
+/*
+ * Fill @dst by decoding ARGON2_QWORDS_IN_BLOCK consecutive 64-bit words from
+ * the byte buffer @input with load64().
+ */
+static void load_block(block *dst, const void *input) {
+    const uint8_t *bytes = (const uint8_t *)input;
+    unsigned word;
+
+    for (word = 0; word < ARGON2_QWORDS_IN_BLOCK; ++word) {
+        dst->v[word] = load64(bytes + word * sizeof(uint64_t));
+    }
+}
+
+/*
+ * Serialise block @src into the byte buffer @output by encoding each of its
+ * ARGON2_QWORDS_IN_BLOCK 64-bit words with store64().
+ */
+static void store_block(void *output, const block *src) {
+    uint8_t *bytes = (uint8_t *)output;
+    unsigned word;
+
+    for (word = 0; word < ARGON2_QWORDS_IN_BLOCK; ++word) {
+        store64(bytes + word * sizeof(uint64_t), src->v[word]);
+    }
+}
+
+/***************Memory functions*****************/
+
+/*
+ * Allocate num * size bytes and store the resulting pointer in *memory.
+ * The context's allocate_cbk is used when it is set, malloc() otherwise.
+ * Returns ARGON2_OK on success and ARGON2_MEMORY_ALLOCATION_ERROR when
+ * @memory is NULL, when num * size overflows size_t, or when *memory is NULL
+ * after the allocation attempt.
+ */
+int allocate_memory(const argon2_context *context, uint8_t **memory,
+                    size_t num, size_t size) {
+    const size_t total_bytes = num * size;
+
+    if (!memory) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    /* Dividing back detects a product that wrapped around size_t. */
+    if (size != 0 && total_bytes / size != num) {
+        return ARGON2_MEMORY_ALLOCATION_ERROR;
+    }
+
+    /* A caller-supplied allocator takes precedence over malloc(). */
+    if (!context->allocate_cbk) {
+        *memory = malloc(total_bytes);
+    } else {
+        (context->allocate_cbk)(memory, total_bytes);
+    }
+
+    return (*memory != NULL) ? ARGON2_OK : ARGON2_MEMORY_ALLOCATION_ERROR;
+}
+
+/*
+ * Release a buffer of num * size bytes obtained from allocate_memory().
+ * The buffer is first passed to clear_internal_memory(), then handed to the
+ * context's free_cbk when one is set, or to free() otherwise.
+ */
+void free_memory(const argon2_context *context, uint8_t *memory,
+                 size_t num, size_t size) {
+    const size_t total_bytes = num * size;
+
+    clear_internal_memory(memory, total_bytes);
+
+    if (!context->free_cbk) {
+        free(memory);
+        return;
+    }
+    (context->free_cbk)(memory, total_bytes);
+}
+
+/*
+ * Overwrite @n bytes at @v with zeros, unconditionally (no flag is consulted
+ * here, unlike clear_internal_memory()).  The primitive is picked at compile
+ * time: SecureZeroMemory on MSVC 2005 or later, memset_s when that name is
+ * defined as a macro, explicit_bzero on OpenBSD, and otherwise memset called
+ * through a volatile function pointer.  NOT_OPTIMIZED additionally asks
+ * Clang/GCC not to optimize this function where the attribute is available.
+ * NOTE(review): "#elif defined memset_s" is only true if memset_s is a
+ * preprocessor macro -- confirm this branch is ever taken on target platforms.
+ */
+void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
+#if defined(_MSC_VER) && VC_GE_2005(_MSC_VER)
+    SecureZeroMemory(v, n);
+#elif defined memset_s
+    memset_s(v, n, 0, n);
+#elif defined(__OpenBSD__)
+    explicit_bzero(v, n);
+#else
+    /* Calling through a volatile pointer keeps the call from being elided. */
+    static void *(*const volatile memset_sec)(void *, int, size_t) = &memset;
+    memset_sec(v, 0, n);
+#endif
+}
+
+/*
+ * Global switch for wiping internal buffers.  It is initialised to 0 here, so
+ * clear_internal_memory() does nothing unless other code sets the flag to a
+ * non-zero value.
+ * NOTE(review): this comment previously read "defaults to true", which does
+ * not match the initialiser below -- confirm that 0 is intentional.
+ */
+int FLAG_clear_internal_memory = 0;
+/* Wipe @n bytes at @v, but only when the flag is set and @v is non-NULL. */
+void clear_internal_memory(void *v, size_t n) {
+  if (FLAG_clear_internal_memory && v) {
+    secure_wipe_memory(v, n);
+  }
+}
+
+/*
+ * Produce the output tag and release the working memory.
+ * The last block of every lane is XORed together, the result is serialised
+ * and hashed into context->out (context->outlen bytes) with blake2b_long(),
+ * and instance->memory is then released through free_memory().
+ * Does nothing when either pointer is NULL.
+ */
+void finalize(const argon2_context *context, argon2_instance_t *instance) {
+    block accumulator;
+    uint8_t accumulator_bytes[ARGON2_BLOCK_SIZE];
+    uint32_t lane;
+
+    if (context == NULL || instance == NULL) {
+        return;
+    }
+
+    /* Start from the last block of lane 0, then fold in the other lanes. */
+    copy_block(&accumulator, instance->memory + instance->lane_length - 1);
+    for (lane = 1; lane < instance->lanes; ++lane) {
+        const uint32_t last_idx =
+            lane * instance->lane_length + (instance->lane_length - 1);
+        xor_block(&accumulator, instance->memory + last_idx);
+    }
+
+    /* Serialise the combined block and hash it into the caller's buffer. */
+    store_block(accumulator_bytes, &accumulator);
+    blake2b_long(context->out, context->outlen, accumulator_bytes,
+                 ARGON2_BLOCK_SIZE);
+
+    /* Scrub the temporaries (subject to FLAG_clear_internal_memory). */
+    clear_internal_memory(accumulator.v, ARGON2_BLOCK_SIZE);
+    clear_internal_memory(accumulator_bytes, ARGON2_BLOCK_SIZE);
+
+#ifdef GENKAT
+    print_tag(context->out, context->outlen);
+#endif
+
+    free_memory(context, (uint8_t *)instance->memory,
+                instance->memory_blocks, sizeof(block));
+}
+
+/*
+ * Map the 32-bit value @pseudo_rand to the index, within a lane, of the block
+ * to use as reference.
+ *
+ * Steps: (1) derive reference_area_size, the number of candidate blocks, from
+ * the pass/slice/index in @position and from @same_lane; (2) square
+ * @pseudo_rand and scale it into that area so that results are skewed towards
+ * the end of the area (reference_area_size - 1); (3) add a start offset and
+ * reduce modulo instance->lane_length.
+ *
+ * Returns the resulting block index relative to the start of the lane.
+ */
+uint32_t index_alpha(const argon2_instance_t *instance,
+                     const argon2_position_t *position, uint32_t pseudo_rand,
+                     int same_lane) {
+    /*
+     * Size of the reference area, in blocks:
+     * Pass 0:
+     *      This lane : all already finished segments plus already constructed
+     * blocks in this segment
+     *      Other lanes : all already finished segments
+     * Pass 1+:
+     *      This lane : (SYNC_POINTS - 1) last segments plus already constructed
+     * blocks in this segment
+     *      Other lanes : (SYNC_POINTS - 1) last segments
+     */
+    uint32_t reference_area_size;
+    uint64_t relative_position;
+    uint32_t start_position, absolute_position;
+
+    if (0 == position->pass) {
+        /* First pass */
+        if (0 == position->slice) {
+            /* First slice */
+            reference_area_size =
+                position->index - 1; /* all but the previous */
+        } else {
+            if (same_lane) {
+                /* The same lane => add current segment */
+                reference_area_size =
+                    position->slice * instance->segment_length +
+                    position->index - 1;
+            } else {
+                /* Other lane: finished segments only.  The (-1) is converted
+                 * to unsigned in this expression, so it subtracts one block
+                 * when index is 0. */
+                reference_area_size =
+                    position->slice * instance->segment_length +
+                    ((position->index == 0) ? (-1) : 0);
+            }
+        }
+    } else {
+        /* Second and later passes (pass != 0) */
+        if (same_lane) {
+            reference_area_size = instance->lane_length -
+                                  instance->segment_length + position->index -
+                                  1;
+        } else {
+            /* Same unsigned (-1) adjustment as in the first pass. */
+            reference_area_size = instance->lane_length -
+                                  instance->segment_length +
+                                  ((position->index == 0) ? (-1) : 0);
+        }
+    }
+
+    /* 1.2.4. Mapping pseudo_rand to 0..<reference_area_size-1> and produce
+     * relative position.  The arithmetic is done in 64 bits:
+     * (pseudo_rand^2) >> 32 first, then scaled by reference_area_size. */
+    relative_position = pseudo_rand;
+    relative_position = relative_position * relative_position >> 32;
+    relative_position = reference_area_size - 1 -
+                        (reference_area_size * relative_position >> 32);
+
+    /* 1.2.5 Computing starting position: 0 in the first pass; in later passes
+     * the start of the segment after the current slice, wrapping to 0 after
+     * the last slice. */
+    start_position = 0;
+
+    if (0 != position->pass) {
+        start_position = (position->slice == ARGON2_SYNC_POINTS - 1)
+                             ? 0
+                             : (position->slice + 1) * instance->segment_length;
+    }
+
+    /* 1.2.6. Computing absolute position */
+    absolute_position = (start_position + relative_position) %
+                        instance->lane_length; /* absolute position */
+    return absolute_position;
+}
+
+/*
+ * Single-threaded fill: for every pass and every slice, fill the segment of
+ * each lane in turn on the calling thread.  Always returns ARGON2_OK.
+ */
+static int fill_memory_blocks_st(argon2_instance_t *instance) {
+    uint32_t pass;
+
+    for (pass = 0; pass < instance->passes; ++pass) {
+        uint32_t slice;
+
+        for (slice = 0; slice < ARGON2_SYNC_POINTS; ++slice) {
+            uint32_t lane;
+
+            for (lane = 0; lane < instance->lanes; ++lane) {
+                argon2_position_t where;
+
+                where.pass = pass;
+                where.lane = lane;
+                where.slice = (uint8_t)slice;
+                where.index = 0;
+                fill_segment(instance, where);
+            }
+        }
+#ifdef GENKAT
+        internal_kat(instance, pass); /* Print all memory blocks */
+#endif
+    }
+    return ARGON2_OK;
+}
+
+#if !defined(ARGON2_NO_THREADS)
+
+/*
+ * Thread entry point: unpacks the argon2_thread_data passed as @thread_data
+ * and fills the segment it describes.  The signature differs between Windows
+ * and other platforms; the body is shared.
+ */
+#ifdef _WIN32
+static unsigned __stdcall fill_segment_thr(void *thread_data)
+#else
+static void *fill_segment_thr(void *thread_data)
+#endif
+{
+    argon2_thread_data *my_data = thread_data;
+    fill_segment(my_data->instance_ptr, my_data->pos);
+    argon2_thread_exit();
+    /* NOTE(review): presumably only reached if argon2_thread_exit() returns
+     * -- its definition is not visible here. */
+    return 0;
+}
+
+/*
+ * Multi-threaded fill, used when more than one thread is requested.
+ *
+ * For every pass and every slice, one worker thread is created per lane,
+ * with at most `max_threads` workers outstanding at a time; all workers of a
+ * slice are joined before the next slice starts.
+ *
+ * Returns ARGON2_OK on success, ARGON2_MEMORY_ALLOCATION_ERROR if the
+ * bookkeeping arrays cannot be allocated, or ARGON2_THREAD_FAIL if creating
+ * or joining a thread fails.
+ *
+ * On any failure the workers that were started but not yet joined are joined
+ * before the bookkeeping arrays are freed: those workers read thr_data[], so
+ * freeing it while they run would be a use-after-free.
+ */
+static int fill_memory_blocks_mt(argon2_instance_t *instance) {
+    uint32_t r, s;
+    uint32_t max_threads;
+    uint32_t started = 0; /* workers created in the current slice */
+    uint32_t joined = 0;  /* workers of the current slice already joined */
+    argon2_thread_handle_t *thread = NULL;
+    argon2_thread_data *thr_data = NULL;
+    int rc = ARGON2_OK;
+
+    /*
+     * Defensive clamp to 1..lanes.  Callers are expected to have done this
+     * already; without it "lanes - threads" would wrap around when
+     * threads > lanes and no worker would ever be joined.
+     */
+    max_threads = instance->threads;
+    if (max_threads > instance->lanes) {
+        max_threads = instance->lanes;
+    }
+    if (max_threads == 0) {
+        max_threads = 1;
+    }
+
+    /* 1. Allocating space for threads */
+    thread = calloc(instance->lanes, sizeof(argon2_thread_handle_t));
+    if (thread == NULL) {
+        rc = ARGON2_MEMORY_ALLOCATION_ERROR;
+        goto fail;
+    }
+
+    thr_data = calloc(instance->lanes, sizeof(argon2_thread_data));
+    if (thr_data == NULL) {
+        rc = ARGON2_MEMORY_ALLOCATION_ERROR;
+        goto fail;
+    }
+
+    for (r = 0; r < instance->passes; ++r) {
+        for (s = 0; s < ARGON2_SYNC_POINTS; ++s) {
+            started = 0;
+            joined = 0;
+
+            /* 2. Calling threads (one per lane) */
+            while (started < instance->lanes) {
+                const uint32_t l = started;
+
+                /* 2.1 Join the oldest outstanding thread if the limit is
+                 * reached.  `joined` is advanced even on failure so the
+                 * failed handle is not joined again during cleanup. */
+                if (l >= max_threads) {
+                    if (argon2_thread_join(thread[joined++])) {
+                        rc = ARGON2_THREAD_FAIL;
+                        goto fail;
+                    }
+                }
+
+                /* 2.2 Create thread */
+                thr_data[l].instance_ptr =
+                    instance; /* preparing the thread input */
+                thr_data[l].pos.pass = r;
+                thr_data[l].pos.lane = l;
+                thr_data[l].pos.slice = (uint8_t)s;
+                thr_data[l].pos.index = 0;
+                if (argon2_thread_create(&thread[l], &fill_segment_thr,
+                                         (void *)&thr_data[l])) {
+                    rc = ARGON2_THREAD_FAIL;
+                    goto fail;
+                }
+                ++started;
+            }
+
+            /* 3. Joining remaining threads */
+            while (joined < started) {
+                if (argon2_thread_join(thread[joined++])) {
+                    rc = ARGON2_THREAD_FAIL;
+                    goto fail;
+                }
+            }
+        }
+
+#ifdef GENKAT
+        internal_kat(instance, r); /* Print all memory blocks */
+#endif
+    }
+
+fail:
+    /* Wait for workers that are still outstanding before releasing the
+     * arrays they use.  On the success path joined == started, so this loop
+     * does nothing.  Join errors are ignored here: rc already holds the
+     * first failure. */
+    while (joined < started) {
+        (void)argon2_thread_join(thread[joined++]);
+    }
+    free(thread);
+    free(thr_data);
+    return rc;
+}
+
+#endif /* ARGON2_NO_THREADS */
+
+/*
+ * Fill the whole memory area for @instance.
+ * Returns ARGON2_INCORRECT_PARAMETER when @instance is NULL or has zero
+ * lanes.  Otherwise the single-threaded filler is used when threading is
+ * compiled out (ARGON2_NO_THREADS) or exactly one thread is requested, and
+ * the multi-threaded filler in every other case; its result is returned.
+ */
+int fill_memory_blocks(argon2_instance_t *instance) {
+    if (instance == NULL || instance->lanes == 0) {
+        return ARGON2_INCORRECT_PARAMETER;
+    }
+#if defined(ARGON2_NO_THREADS)
+    return fill_memory_blocks_st(instance);
+#else
+    if (instance->threads == 1) {
+        return fill_memory_blocks_st(instance);
+    }
+    return fill_memory_blocks_mt(instance);
+#endif
+}
+
+/*
+ * Check every field of @context against the ARGON2_MIN_* / ARGON2_MAX_*
+ * limits and the pointer/length consistency rules.
+ * Returns ARGON2_OK when all checks pass, otherwise the error code of the
+ * first check that fails.  Checks run in this order: context, output,
+ * password, salt, secret, associated data, memory cost, time cost, lanes,
+ * threads, allocator callbacks.
+ */
+int validate_inputs(const argon2_context *context) {
+    if (context == NULL) {
+        return ARGON2_INCORRECT_PARAMETER;
+    }
+
+    /* Output buffer and its length */
+    if (context->out == NULL) {
+        return ARGON2_OUTPUT_PTR_NULL;
+    }
+    if (context->outlen < ARGON2_MIN_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_SHORT;
+    }
+    if (context->outlen > ARGON2_MAX_OUTLEN) {
+        return ARGON2_OUTPUT_TOO_LONG;
+    }
+
+    /* Password (required): a NULL pointer is only valid with length 0 */
+    if (context->pwd == NULL && context->pwdlen != 0) {
+        return ARGON2_PWD_PTR_MISMATCH;
+    }
+    if (context->pwdlen < ARGON2_MIN_PWD_LENGTH) {
+        return ARGON2_PWD_TOO_SHORT;
+    }
+    if (context->pwdlen > ARGON2_MAX_PWD_LENGTH) {
+        return ARGON2_PWD_TOO_LONG;
+    }
+
+    /* Salt (required): a NULL pointer is only valid with length 0 */
+    if (context->salt == NULL && context->saltlen != 0) {
+        return ARGON2_SALT_PTR_MISMATCH;
+    }
+    if (context->saltlen < ARGON2_MIN_SALT_LENGTH) {
+        return ARGON2_SALT_TOO_SHORT;
+    }
+    if (context->saltlen > ARGON2_MAX_SALT_LENGTH) {
+        return ARGON2_SALT_TOO_LONG;
+    }
+
+    /* Secret (optional): length limits apply only when a pointer is given */
+    if (context->secret == NULL) {
+        if (context->secretlen != 0) {
+            return ARGON2_SECRET_PTR_MISMATCH;
+        }
+    } else if (context->secretlen < ARGON2_MIN_SECRET) {
+        return ARGON2_SECRET_TOO_SHORT;
+    } else if (context->secretlen > ARGON2_MAX_SECRET) {
+        return ARGON2_SECRET_TOO_LONG;
+    }
+
+    /* Associated data (optional): same rule as for the secret */
+    if (context->ad == NULL) {
+        if (context->adlen != 0) {
+            return ARGON2_AD_PTR_MISMATCH;
+        }
+    } else if (context->adlen < ARGON2_MIN_AD_LENGTH) {
+        return ARGON2_AD_TOO_SHORT;
+    } else if (context->adlen > ARGON2_MAX_AD_LENGTH) {
+        return ARGON2_AD_TOO_LONG;
+    }
+
+    /* Memory cost, including the minimum of 8 blocks per lane */
+    if (context->m_cost < ARGON2_MIN_MEMORY) {
+        return ARGON2_MEMORY_TOO_LITTLE;
+    }
+    if (context->m_cost > ARGON2_MAX_MEMORY) {
+        return ARGON2_MEMORY_TOO_MUCH;
+    }
+    if (context->m_cost < 8 * context->lanes) {
+        return ARGON2_MEMORY_TOO_LITTLE;
+    }
+
+    /* Time cost */
+    if (context->t_cost < ARGON2_MIN_TIME) {
+        return ARGON2_TIME_TOO_SMALL;
+    }
+    if (context->t_cost > ARGON2_MAX_TIME) {
+        return ARGON2_TIME_TOO_LARGE;
+    }
+
+    /* Lanes */
+    if (context->lanes < ARGON2_MIN_LANES) {
+        return ARGON2_LANES_TOO_FEW;
+    }
+    if (context->lanes > ARGON2_MAX_LANES) {
+        return ARGON2_LANES_TOO_MANY;
+    }
+
+    /* Threads */
+    if (context->threads < ARGON2_MIN_THREADS) {
+        return ARGON2_THREADS_TOO_FEW;
+    }
+    if (context->threads > ARGON2_MAX_THREADS) {
+        return ARGON2_THREADS_TOO_MANY;
+    }
+
+    /* Custom allocator and deallocator must be supplied together */
+    if (context->allocate_cbk != NULL && context->free_cbk == NULL) {
+        return ARGON2_FREE_MEMORY_CBK_NULL;
+    }
+    if (context->allocate_cbk == NULL && context->free_cbk != NULL) {
+        return ARGON2_ALLOCATE_MEMORY_CBK_NULL;
+    }
+
+    return ARGON2_OK;
+}
+
+/*
+ * Create blocks 0 and 1 of every lane.
+ * For lane l and block number j in {0, 1}, the 8 bytes following the digest
+ * in @blockhash are set to j (first 4 bytes) and l (next 4 bytes) with
+ * store32(); the ARGON2_PREHASH_SEED_LENGTH-byte buffer is then expanded
+ * with blake2b_long() into one block, which is loaded into
+ * memory[l * lane_length + j].
+ */
+void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) {
+    uint8_t expanded[ARGON2_BLOCK_SIZE];
+    uint8_t *const counter = blockhash + ARGON2_PREHASH_DIGEST_LENGTH;
+    uint32_t lane;
+
+    for (lane = 0; lane < instance->lanes; ++lane) {
+        uint32_t j;
+
+        /* The lane number stays the same for both blocks of this lane. */
+        store32(counter + 4, lane);
+
+        for (j = 0; j < 2; ++j) {
+            store32(counter, j);
+            blake2b_long(expanded, ARGON2_BLOCK_SIZE, blockhash,
+                         ARGON2_PREHASH_SEED_LENGTH);
+            load_block(&instance->memory[lane * instance->lane_length + j],
+                       expanded);
+        }
+    }
+    clear_internal_memory(expanded, ARGON2_BLOCK_SIZE);
+}
+
+/* Encode @word with store32() and feed the resulting bytes to @state. */
+static void initial_hash_absorb_u32(blake2b_state *state, uint32_t word) {
+    uint8_t encoded[sizeof(uint32_t)];
+
+    store32(encoded, word);
+    blake2b_update(state, encoded, sizeof(encoded));
+}
+
+/*
+ * Compute the pre-hashing digest: an ARGON2_PREHASH_DIGEST_LENGTH-byte
+ * BLAKE2b hash written to @blockhash.
+ * Hashed, in order: lanes, outlen, m_cost, t_cost, version, type, then the
+ * length followed by the bytes (when the pointer is non-NULL) of the
+ * password, salt, secret and associated data.
+ * When the corresponding ARGON2_FLAG_CLEAR_* flag is set, the password and
+ * secret are wiped and their lengths set to 0 right after being hashed.
+ * Returns without doing anything when @context or @blockhash is NULL.
+ */
+void initial_hash(uint8_t *blockhash, argon2_context *context,
+                  argon2_type type) {
+    blake2b_state hasher;
+
+    if (context == NULL || blockhash == NULL) {
+        return;
+    }
+
+    blake2b_init(&hasher, ARGON2_PREHASH_DIGEST_LENGTH);
+
+    /* Fixed-size parameters */
+    initial_hash_absorb_u32(&hasher, context->lanes);
+    initial_hash_absorb_u32(&hasher, context->outlen);
+    initial_hash_absorb_u32(&hasher, context->m_cost);
+    initial_hash_absorb_u32(&hasher, context->t_cost);
+    initial_hash_absorb_u32(&hasher, context->version);
+    initial_hash_absorb_u32(&hasher, (uint32_t)type);
+
+    /* Password: the length is hashed before any wipe resets it to 0 */
+    initial_hash_absorb_u32(&hasher, context->pwdlen);
+    if (context->pwd != NULL) {
+        blake2b_update(&hasher, (const uint8_t *)context->pwd,
+                       context->pwdlen);
+
+        if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) {
+            secure_wipe_memory(context->pwd, context->pwdlen);
+            context->pwdlen = 0;
+        }
+    }
+
+    /* Salt */
+    initial_hash_absorb_u32(&hasher, context->saltlen);
+    if (context->salt != NULL) {
+        blake2b_update(&hasher, (const uint8_t *)context->salt,
+                       context->saltlen);
+    }
+
+    /* Secret: same length-before-wipe ordering as the password */
+    initial_hash_absorb_u32(&hasher, context->secretlen);
+    if (context->secret != NULL) {
+        blake2b_update(&hasher, (const uint8_t *)context->secret,
+                       context->secretlen);
+
+        if (context->flags & ARGON2_FLAG_CLEAR_SECRET) {
+            secure_wipe_memory(context->secret, context->secretlen);
+            context->secretlen = 0;
+        }
+    }
+
+    /* Associated data */
+    initial_hash_absorb_u32(&hasher, context->adlen);
+    if (context->ad != NULL) {
+        blake2b_update(&hasher, (const uint8_t *)context->ad,
+                       context->adlen);
+    }
+
+    blake2b_final(&hasher, blockhash, ARGON2_PREHASH_DIGEST_LENGTH);
+}
+
+/*
+ * Prepare @instance for filling: record @context in instance->context_ptr,
+ * allocate instance->memory_blocks blocks, compute the pre-hashing digest
+ * and create the first two blocks of every lane.
+ * Returns ARGON2_INCORRECT_PARAMETER when either pointer is NULL, the error
+ * from allocate_memory() when allocation fails, and ARGON2_OK otherwise.
+ */
+int initialize(argon2_instance_t *instance, argon2_context *context) {
+    /* H_0 plus 8 extra bytes used to derive the first blocks */
+    uint8_t seed[ARGON2_PREHASH_SEED_LENGTH];
+    int status;
+
+    if (!instance || !context) {
+        return ARGON2_INCORRECT_PARAMETER;
+    }
+    instance->context_ptr = context;
+
+    /* 1. Memory allocation */
+    status = allocate_memory(context, (uint8_t **)&(instance->memory),
+                             instance->memory_blocks, sizeof(block));
+    if (ARGON2_OK != status) {
+        return status;
+    }
+
+    /* 2. Initial hashing of all inputs, then clearing of the 8 extra bytes
+     * (clear_internal_memory only acts when its global flag is set) */
+    initial_hash(seed, context, instance->type);
+    clear_internal_memory(seed + ARGON2_PREHASH_DIGEST_LENGTH,
+                          ARGON2_PREHASH_SEED_LENGTH -
+                              ARGON2_PREHASH_DIGEST_LENGTH);
+
+#ifdef GENKAT
+    initial_kat(seed, context, instance->type);
+#endif
+
+    /* 3. Creating first blocks, we always have at least two blocks in a slice
+     */
+    fill_first_blocks(seed, instance);
+
+    /* Clearing the hash */
+    clear_internal_memory(seed, ARGON2_PREHASH_SEED_LENGTH);
+
+    return ARGON2_OK;
+}
--- a/algo/argon2/argon2d/argon2d/core.h
+++ b/algo/argon2/argon2d/argon2d/core.h
@@ -0,0 +1,228 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ARGON2_CORE_H
+#define ARGON2_CORE_H
+
+#include "argon2.h"
+
+/*
+ * CONST_CAST(T) expr expands to (T)(uintptr_t) expr, i.e. a cast to type T
+ * that goes through uintptr_t first.
+ * NOTE(review): the name suggests it is meant for casting away const without
+ * a compiler warning -- confirm against the call sites.
+ */
+#define CONST_CAST(x) (x)(uintptr_t)
+
+/**********************Argon2 internal constants*******************************/
+
+enum argon2_core_constants {
+    /* Memory block size in bytes */
+    ARGON2_BLOCK_SIZE = 1024,
+    /* Number of 8-, 16-, 32- and 64-byte words that fit in one block */
+    ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
+    ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16,
+    ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32,
+    ARGON2_512BIT_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 64,
+
+    /* Number of pseudo-random values generated by one call to Blake in
+       Argon2i to generate reference block positions */
+    ARGON2_ADDRESSES_IN_BLOCK = 128,
+
+    /* Pre-hashing digest length and its extension (digest plus 8 bytes) */
+    ARGON2_PREHASH_DIGEST_LENGTH = 64,
+    ARGON2_PREHASH_SEED_LENGTH = 72
+};
+
+/*************************Argon2 internal data types***********************/
+
+/*
+ * Structure for the (1KB) memory block implemented as 128 64-bit words
+ * (ARGON2_QWORDS_IN_BLOCK words, ARGON2_BLOCK_SIZE bytes in total).
+ * Memory blocks can be copied and XORed. Internal words can be accessed by []
+ * (no bounds checking).
+ */
+typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block;
+
+/*****************Functions that work with the block******************/
+
+/* Initialize each byte of the block with @in */
+void init_block_value(block *b, uint8_t in);
+
+/* Copy block @src to block @dst */
+void copy_block(block *dst, const block *src);
+
+/* XOR @src onto @dst bytewise */
+void xor_block(block *dst, const block *src);
+
+/*
+ * Argon2 instance: memory pointer, number of passes, amount of memory, type,
+ * and derived values.
+ * Used to evaluate the number and location of blocks to construct in each
+ * thread
+ */
+typedef struct Argon2_instance_t {
+    block *memory;          /* Memory pointer */
+    uint32_t version;       /* NOTE(review): presumably the Argon2 version
+                               number -- confirm where it is set */
+    uint32_t passes;        /* Number of passes */
+    uint32_t memory_blocks; /* Number of blocks in memory */
+    uint32_t segment_length; /* Blocks per segment (one slice of one lane) */
+    uint32_t lane_length;   /* Blocks per lane; lane l starts at
+                               memory[l * lane_length] */
+    uint32_t lanes;         /* Number of lanes */
+    uint32_t threads;       /* Thread limit used when filling memory */
+    argon2_type type;       /* Argon2 variant */
+    int print_internals; /* whether to print the memory blocks */
+    argon2_context *context_ptr; /* points back to original context */
+} argon2_instance_t;
+
+/*
+ * Argon2 position: where we construct the block right now. Used to distribute
+ * work between threads.
+ */
+typedef struct Argon2_position_t {
+    uint32_t pass;  /* Current pass, 0-based */
+    uint32_t lane;  /* Current lane, 0-based */
+    uint8_t slice;  /* Current slice, 0 .. ARGON2_SYNC_POINTS - 1 */
+    uint32_t index; /* Block offset within the current segment */
+} argon2_position_t;
+
+/* Struct that holds the inputs for a thread that runs fill_segment */
+typedef struct Argon2_thread_data {
+    argon2_instance_t *instance_ptr; /* Instance the thread works on */
+    argon2_position_t pos;           /* Segment the thread has to fill */
+} argon2_thread_data;
+
+/*************************Argon2 core functions********************************/
+
+/* Allocates memory to the given pointer, uses the appropriate allocator as
+ * specified in the context. Total allocated memory is num*size.
+ * @param context argon2_context which specifies the allocator
+ * @param memory pointer to the pointer to the memory
+ * @param size the size in bytes for each element to be allocated
+ * @param num the number of elements to be allocated
+ * @return ARGON2_OK if @memory is a valid pointer and memory is allocated
+ */
+int allocate_memory(const argon2_context *context, uint8_t **memory,
+                    size_t num, size_t size);
+
+/*
+ * Frees memory at the given pointer, uses the appropriate deallocator as
+ * specified in the context. Also cleans the memory using clear_internal_memory.
+ * @param context argon2_context which specifies the deallocator
+ * @param memory pointer to buffer to be freed
+ * @param size the size in bytes for each element to be deallocated
+ * @param num the number of elements to be deallocated
+ */
+void free_memory(const argon2_context *context, uint8_t *memory,
+                 size_t num, size_t size);
+
+/* Function that securely cleans the memory. This ignores any flags set
+ * regarding clearing memory. Usually one just calls clear_internal_memory.
+ * @param v Pointer to the memory
+ * @param n Memory size in bytes
+ */
+void secure_wipe_memory(void *v, size_t n);
+
+/* Function that securely clears the memory if FLAG_clear_internal_memory is
+ * set. If the flag isn't set, or @v is NULL, this function does nothing.
+ * @param v Pointer to the memory
+ * @param n Memory size in bytes
+ */
+void clear_internal_memory(void *v, size_t n);
+
+/*
+ * Computes absolute position of reference block in the lane following a skewed
+ * distribution and using a pseudo-random value as input
+ * @param instance Pointer to the current instance
+ * @param position Pointer to the current position
+ * @param pseudo_rand 32-bit pseudo-random value used to determine the position
+ * @param same_lane Indicates if the block will be taken from the current lane.
+ * If so we can reference the current segment
+ * @return Index of the reference block within the lane, in the range
+ * 0 .. instance->lane_length - 1
+ * @pre All pointers must be valid
+ */
+uint32_t index_alpha(const argon2_instance_t *instance,
+                     const argon2_position_t *position, uint32_t pseudo_rand,
+                     int same_lane);
+
+/*
+ * Function that validates all inputs against predefined restrictions and
+ * returns an error code
+ * @param context Pointer to current Argon2 context
+ * @return ARGON2_OK if everything is all right, otherwise one of the error
+ * codes (all defined in <argon2.h>)
+ */
+int validate_inputs(const argon2_context *context);
+
+/*
+ * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears
+ * password and secret if needed
+ * @param  context  Pointer to the Argon2 internal structure containing memory
+ * pointer, and parameters for time and space requirements.
+ * @param  blockhash Buffer for pre-hashing digest
+ * @param  type Argon2 type
+ * @pre    @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes
+ * allocated
+ */
+void initial_hash(uint8_t *blockhash, argon2_context *context,
+                  argon2_type type);
+
+/*
+ * Function creates first 2 blocks per lane
+ * @param instance Pointer to the current instance
+ * @param blockhash Pointer to the pre-hashing digest
+ * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values
+ */
+void fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance);
+
+/*
+ * Function allocates memory, hashes the inputs with Blake, and creates the
+ * first two blocks of each lane. On success instance->memory points to the
+ * allocated memory and instance->context_ptr is set to @context.
+ * @param  context  Pointer to the Argon2 internal structure containing memory
+ * pointer, and parameters for time and space requirements.
+ * @param  instance Current Argon2 instance
+ * @return ARGON2_OK if successful, ARGON2_INCORRECT_PARAMETER if either
+ * pointer is NULL, or the error code returned by allocate_memory if memory
+ * failed to allocate.
+ */
+int initialize(argon2_instance_t *instance, argon2_context *context);
+
+/*
+ * XORing the last block of each lane, hashing it, making the tag. Deallocates
+ * the memory.
+ * @param context Pointer to current Argon2 context (use only the out parameters
+ * from it)
+ * @param instance Pointer to current instance of Argon2
+ * @pre instance->memory must point to necessary amount of memory
+ * @pre context->out must point to outlen bytes of memory
+ * @pre if context->free_cbk is not NULL, it should point to a function that
+ * deallocates memory
+ */
+void finalize(const argon2_context *context, argon2_instance_t *instance);
+
+/*
+ * Function that fills the segment using previous segments also from other
+ * threads
+ * @param instance Pointer to the current instance
+ * @param position Current position
+ * @pre all block pointers must be valid
+ */
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position);
+
+/*
+ * Function that fills the entire memory instance->passes times based on the
+ * first two blocks in each lane
+ * @param instance Pointer to the current instance
+ * @return ARGON2_OK if successful, ARGON2_INCORRECT_PARAMETER if @instance is
+ * NULL or has zero lanes, otherwise the error code reported by the filling
+ * routine (memory allocation or thread failure)
+ */
+int fill_memory_blocks(argon2_instance_t *instance);
+
+#endif
--- a/algo/argon2/argon2d/argon2d/encoding.c
+++ b/algo/argon2/argon2d/argon2d/encoding.c
@@ -0,0 +1,463 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "encoding.h"
+#include "core.h"
+
+/*
+ * Example code for a decoder and encoder of "hash strings", with Argon2
+ * parameters.
+ *
+ * This code comprises two sections:
+ *
+ *   -- The first section contains generic Base64 encoding and decoding
+ *   functions. It is conceptually applicable to any hash function
+ *   implementation that uses Base64 to encode and decode parameters,
+ *   salts and outputs. It could be made into a library, provided that
+ *   the relevant functions are made public (non-static) and be given
+ *   reasonable names to avoid collisions with other functions.
+ *
+ *   -- The second section is specific to Argon2. It encodes and decodes
+ *   the parameters, salts and outputs. It does not compute the hash
+ *   itself.
+ *
+ * The code was originally written by Thomas Pornin <pornin@bolet.org>,
+ * to whom comments and remarks may be sent. It is released under what
+ * should amount to Public Domain or its closest equivalent; the
+ * following mantra is supposed to incarnate that fact with all the
+ * proper legal rituals:
+ *
+ * ---------------------------------------------------------------------
+ * This file is provided under the terms of Creative Commons CC0 1.0
+ * Public Domain Dedication. To the extent possible under law, the
+ * author (Thomas Pornin) has waived all copyright and related or
+ * neighboring rights to this file. This work is published from: Canada.
+ * ---------------------------------------------------------------------
+ *
+ * Copyright (c) 2015 Thomas Pornin
+ */
+
+/* ==================================================================== */
+/*
+ * Common code; could be shared between different hash functions.
+ *
+ * Note: the Base64 functions below assume that uppercase letters (resp.
+ * lowercase letters) have consecutive numerical codes, that fit on 8
+ * bits. All modern systems use ASCII-compatible charsets, where these
+ * properties are true. If you are stuck with a dinosaur of a system
+ * that still defaults to EBCDIC then you already have much bigger
+ * interoperability issues to deal with.
+ */
+
+/*
+ * Some macros for constant-time comparisons. These work over values in
+ * the 0..255 range. Returned value is 0x00 on "false", 0xFF on "true".
+ *
+ * Each result is taken from bits 8..15 of an unsigned subtraction, so no
+ * branch is involved. Operands outside 0..255 (for example a negative
+ * 'char' converted to unsigned) are not supported and can yield a wrong
+ * mask.
+ */
+#define EQ(x, y) ((((0U - ((unsigned)(x) ^ (unsigned)(y))) >> 8) & 0xFF) ^ 0xFF)
+#define GT(x, y) ((((unsigned)(y) - (unsigned)(x)) >> 8) & 0xFF)
+#define GE(x, y) (GT(y, x) ^ 0xFF)
+#define LT(x, y) GT(y, x)
+#define LE(x, y) GE(y, x)
+
+/*
+ * Convert value x (0..63) to corresponding Base64 character.
+ *
+ * Branch-free: every alphabet range contributes (mask & character), where
+ * the mask is 0xFF only for the range that contains x, and the terms are
+ * ORed together. For x in 64..255 no mask is set and 0 is returned.
+ */
+static int b64_byte_to_char(unsigned x) {
+    return (LT(x, 26) & (x + 'A')) |
+           (GE(x, 26) & LT(x, 52) & (x + ('a' - 26))) |
+           (GE(x, 52) & LT(x, 62) & (x + ('0' - 52))) | (EQ(x, 62) & '+') |
+           (EQ(x, 63) & '/');
+}
+
+/*
+ * Convert character c to the corresponding 6-bit value. If character c
+ * is not a Base64 character, then 0xFF (255) is returned.
+ *
+ * c must lie in the 0..255 range required by the comparison macros.
+ *
+ * The masked lookup yields 0 both for 'A' and for a character that matches
+ * no range; the final expression ORs in 0xFF when x is 0 and c is not 'A',
+ * which turns the "no match" case into the 0xFF error value.
+ */
+static unsigned b64_char_to_byte(int c) {
+    unsigned x;
+
+    x = (GE(c, 'A') & LE(c, 'Z') & (c - 'A')) |
+        (GE(c, 'a') & LE(c, 'z') & (c - ('a' - 26))) |
+        (GE(c, '0') & LE(c, '9') & (c - ('0' - 52))) | (EQ(c, '+') & 62) |
+        (EQ(c, '/') & 63);
+    return x | (EQ(x, 0) & (EQ(c, 'A') ^ 0xFF));
+}
+
+/*
+ * Convert some bytes to Base64. 'dst_len' is the length (in characters)
+ * of the output buffer 'dst'; if that buffer is not large enough to
+ * receive the result (including the terminating 0), then (size_t)-1
+ * is returned. Otherwise, the zero-terminated Base64 string is written
+ * in the buffer, and the output length (counted WITHOUT the terminating
+ * zero) is returned.
+ *
+ * No '=' padding is produced: a trailing group of 1 or 2 source bytes is
+ * encoded as 2 or 3 characters, with the unused low bits set to zero.
+ */
+static size_t to_base64(char *dst, size_t dst_len, const void *src,
+                        size_t src_len) {
+    size_t olen;
+    const unsigned char *buf;
+    unsigned acc, acc_len;
+
+    olen = (src_len / 3) << 2; /* 4 characters per full 3-byte group */
+    switch (src_len % 3) {
+    case 2:
+        olen++;
+    /* fall through */
+    case 1:
+        olen += 2;
+        break;
+    }
+    if (dst_len <= olen) { /* olen characters plus the terminating 0 */
+        return (size_t)-1;
+    }
+    acc = 0;
+    acc_len = 0; /* number of not yet emitted bits held in acc */
+    buf = (const unsigned char *)src;
+    while (src_len-- > 0) {
+        acc = (acc << 8) + (*buf++);
+        acc_len += 8;
+        while (acc_len >= 6) {
+            acc_len -= 6;
+            *dst++ = (char)b64_byte_to_char((acc >> acc_len) & 0x3F);
+        }
+    }
+    if (acc_len > 0) { /* 2 or 4 leftover bits: left-align them in 6 bits */
+        *dst++ = (char)b64_byte_to_char((acc << (6 - acc_len)) & 0x3F);
+    }
+    *dst++ = 0;
+    return olen;
+}
+
+/*
+ * Decode Base64 chars into bytes. The '*dst_len' value must initially
+ * contain the length of the output buffer '*dst'; when the decoding
+ * ends, the actual number of decoded bytes is written back in
+ * '*dst_len'.
+ *
+ * Decoding stops when a non-Base64 character is encountered, or when
+ * the output buffer capacity is exceeded. If an error occurred (output
+ * buffer is too small, invalid last characters leading to unprocessed
+ * buffered bits), then NULL is returned; otherwise, the returned value
+ * points to the first non-Base64 character in the source stream, which
+ * may be the terminating zero. '*dst_len' is only updated on success.
+ *
+ * Each source character is converted to 'unsigned char' before the
+ * lookup: b64_char_to_byte() relies on comparison macros that are only
+ * valid for 0..255, and where plain 'char' is signed a byte >= 0x80
+ * would otherwise reach them as a negative value and be mistaken for a
+ * Base64 digit instead of terminating the run.
+ */
+static const char *from_base64(void *dst, size_t *dst_len, const char *src) {
+    size_t len;
+    unsigned char *buf;
+    unsigned acc, acc_len;
+
+    buf = (unsigned char *)dst;
+    len = 0;
+    acc = 0;
+    acc_len = 0;
+    for (;;) {
+        unsigned d;
+
+        /* Keep the operand inside the macros' 0..255 domain. */
+        d = b64_char_to_byte((unsigned char)*src);
+        if (d == 0xFF) {
+            break;
+        }
+        src++;
+        acc = (acc << 6) + d;
+        acc_len += 6;
+        if (acc_len >= 8) {
+            acc_len -= 8;
+            if ((len++) >= *dst_len) {
+                return NULL;
+            }
+            *buf++ = (acc >> acc_len) & 0xFF;
+        }
+    }
+
+    /*
+     * If the input length is equal to 1 modulo 4 (which is
+     * invalid), then there will remain 6 unprocessed bits;
+     * otherwise, only 0, 2 or 4 bits are buffered. The buffered
+     * bits must also all be zero.
+     */
+    if (acc_len > 4 || (acc & (((unsigned)1 << acc_len) - 1)) != 0) {
+        return NULL;
+    }
+    *dst_len = len;
+    return src;
+}
+
+/*
+ * Parse an unsigned decimal integer at the start of 'str' and store it in
+ * '*v'. On success the return value points to the first character after
+ * the digits. NULL is returned, and '*v' is left untouched, when there is
+ * no digit at all, when the number has superfluous leading zeros, or when
+ * it does not fit in an 'unsigned long'.
+ */
+static const char *decode_decimal(const char *str, unsigned long *v) {
+    const char *const start = str;
+    unsigned long value = 0;
+
+    while (*str >= '0' && *str <= '9') {
+        const unsigned long digit = (unsigned long)(*str - '0');
+
+        /* Reject before the multiplication or the addition can wrap. */
+        if (value > ULONG_MAX / 10) {
+            return NULL;
+        }
+        value *= 10;
+        if (digit > ULONG_MAX - value) {
+            return NULL;
+        }
+        value += digit;
+        str++;
+    }
+
+    /* No digit consumed. */
+    if (str == start) {
+        return NULL;
+    }
+    /* A leading '0' is only allowed for the single-digit value "0". */
+    if (*start == '0' && str != start + 1) {
+        return NULL;
+    }
+
+    *v = value;
+    return str;
+}
+
+/* ==================================================================== */
+/*
+ * Code specific to Argon2.
+ *
+ * The code below applies the following format:
+ *
+ *  $argon2<T>[$v=<num>]$m=<num>,t=<num>,p=<num>$<bin>$<bin>
+ *
+ * where <T> is either 'd', 'id', or 'i', <num> is a decimal integer (positive,
+ * fits in an 'unsigned long'), and <bin> is Base64-encoded data (no '=' padding
+ * characters, no newline or whitespace).
+ *
+ * The last two binary chunks (encoded in Base64) are, in that order,
+ * the salt and the output. Both are required. The binary salt length and the
+ * output length must be in the allowed ranges defined in argon2.h.
+ *
+ * The ctx struct must contain buffers large enough to hold the salt and the
+ * output when it is fed into decode_string: on entry ctx->saltlen and
+ * ctx->outlen give the capacities of ctx->salt and ctx->out, and on success
+ * they are replaced by the decoded lengths.
+ *
+ * Returns ARGON2_INCORRECT_TYPE if 'type' has no string form,
+ * ARGON2_DECODING_FAIL if the string does not match the format above
+ * (including trailing characters), the code reported by validate_inputs() if
+ * the decoded parameters are rejected, and ARGON2_OK otherwise. On failure
+ * ctx may already have been partially overwritten.
+ */
+
+int decode_string(argon2_context *ctx, const char *str, argon2_type type) {
+
+/* check for prefix */
+#define CC(prefix)                                                             \
+    do {                                                                       \
+        size_t cc_len = strlen(prefix);                                        \
+        if (strncmp(str, prefix, cc_len) != 0) {                               \
+            return ARGON2_DECODING_FAIL;                                       \
+        }                                                                      \
+        str += cc_len;                                                         \
+    } while ((void)0, 0)
+
+/* optional prefix checking with supplied code */
+#define CC_opt(prefix, code)                                                   \
+    do {                                                                       \
+        size_t cc_len = strlen(prefix);                                        \
+        if (strncmp(str, prefix, cc_len) == 0) {                               \
+            str += cc_len;                                                     \
+            { code; }                                                          \
+        }                                                                      \
+    } while ((void)0, 0)
+
+/* Decoding prefix into decimal */
+#define DECIMAL(x)                                                             \
+    do {                                                                       \
+        unsigned long dec_x;                                                   \
+        str = decode_decimal(str, &dec_x);                                     \
+        if (str == NULL) {                                                     \
+            return ARGON2_DECODING_FAIL;                                       \
+        }                                                                      \
+        (x) = dec_x;                                                           \
+    } while ((void)0, 0)
+
+/* Decoding prefix into uint32_t decimal */
+#define DECIMAL_U32(x)                                                         \
+    do {                                                                       \
+        unsigned long dec_x;                                                   \
+        str = decode_decimal(str, &dec_x);                                     \
+        if (str == NULL || dec_x > UINT32_MAX) {                               \
+            return ARGON2_DECODING_FAIL;                                       \
+        }                                                                      \
+        (x) = (uint32_t)dec_x;                                                 \
+    } while ((void)0, 0)
+
+/* Decoding base64 into a binary buffer */
+#define BIN(buf, max_len, len)                                                 \
+    do {                                                                       \
+        size_t bin_len = (max_len);                                            \
+        str = from_base64(buf, &bin_len, str);                                 \
+        if (str == NULL || bin_len > UINT32_MAX) {                             \
+            return ARGON2_DECODING_FAIL;                                       \
+        }                                                                      \
+        (len) = (uint32_t)bin_len;                                             \
+    } while ((void)0, 0)
+
+    /* Capacities of the caller's buffers, saved before the lengths are
+     * overwritten with the decoded sizes. */
+    size_t maxsaltlen = ctx->saltlen;
+    size_t maxoutlen = ctx->outlen;
+    int validation_result;
+    const char* type_string;
+
+    /* We should start with the argon2_type we are using */
+    type_string = argon2_type2string(type, 0);
+    if (!type_string) {
+        return ARGON2_INCORRECT_TYPE;
+    }
+
+    CC("$");
+    CC(type_string);
+
+    /* Reading the version number if the default is suppressed */
+    ctx->version = ARGON2_VERSION_10;
+    CC_opt("$v=", DECIMAL_U32(ctx->version));
+
+    CC("$m=");
+    DECIMAL_U32(ctx->m_cost);
+    CC(",t=");
+    DECIMAL_U32(ctx->t_cost);
+    CC(",p=");
+    DECIMAL_U32(ctx->lanes);
+    ctx->threads = ctx->lanes;
+
+    CC("$");
+    BIN(ctx->salt, maxsaltlen, ctx->saltlen);
+    CC("$");
+    BIN(ctx->out, maxoutlen, ctx->outlen);
+
+    /* The rest of the fields get the default values */
+    ctx->secret = NULL;
+    ctx->secretlen = 0;
+    ctx->ad = NULL;
+    ctx->adlen = 0;
+    ctx->allocate_cbk = NULL;
+    ctx->free_cbk = NULL;
+    ctx->flags = ARGON2_DEFAULT_FLAGS;
+
+    /* On return, must have valid context */
+    validation_result = validate_inputs(ctx);
+    if (validation_result != ARGON2_OK) {
+        return validation_result;
+    }
+
+    /* Can't have any additional characters */
+    if (*str == 0) {
+        return ARGON2_OK;
+    } else {
+        return ARGON2_DECODING_FAIL;
+    }
+/* Every helper macro defined above is local to this function. */
+#undef CC
+#undef CC_opt
+#undef DECIMAL
+#undef DECIMAL_U32
+#undef BIN
+}
+
+/*
+ * Write the textual form of 'ctx' into 'dst' as
+ *
+ *  $<type>$v=<version>$m=<m_cost>,t=<t_cost>,p=<lanes>$<salt>$<out>
+ *
+ * where salt and out are Base64-encoded. The result is zero-terminated.
+ * Returns ARGON2_ENCODING_FAIL if 'type' has no string form or if 'dst_len'
+ * characters are not enough, the code reported by validate_inputs() if the
+ * context is rejected, and ARGON2_OK otherwise.
+ */
+int encode_string(char *dst, size_t dst_len, argon2_context *ctx,
+                  argon2_type type) {
+/* Append a zero-terminated string; one character is kept for the 0. */
+#define SS(str)                                                                \
+    do {                                                                       \
+        size_t pp_len = strlen(str);                                           \
+        if (dst_len <= pp_len) {                                               \
+            return ARGON2_ENCODING_FAIL;                                       \
+        }                                                                      \
+        memcpy(dst, str, pp_len + 1);                                          \
+        dst_len -= pp_len;                                                     \
+        dst += pp_len;                                                         \
+    } while ((void)0, 0)
+
+/* Append an integer in decimal. */
+#define SX(x)                                                                  \
+    do {                                                                       \
+        char tmp[30];                                                          \
+        sprintf(tmp, "%lu", (unsigned long)(x));                               \
+        SS(tmp);                                                               \
+    } while ((void)0, 0)
+
+/* Append a binary buffer as Base64. */
+#define SB(buf, len)                                                           \
+    do {                                                                       \
+        size_t sb_len = to_base64(dst, dst_len, buf, len);                     \
+        if (sb_len == (size_t)-1) {                                            \
+            return ARGON2_ENCODING_FAIL;                                       \
+        }                                                                      \
+        dst_len -= sb_len;                                                     \
+        dst += sb_len;                                                         \
+    } while ((void)0, 0)
+
+    const char *type_string;
+    int validation_result;
+
+    type_string = argon2_type2string(type, 0);
+    validation_result = validate_inputs(ctx);
+
+    if (type_string == NULL) {
+        return ARGON2_ENCODING_FAIL;
+    }
+    if (validation_result != ARGON2_OK) {
+        return validation_result;
+    }
+
+    /* "$<type>$v=<version>" */
+    SS("$");
+    SS(type_string);
+    SS("$v=");
+    SX(ctx->version);
+
+    /* "$m=<m_cost>,t=<t_cost>,p=<lanes>" */
+    SS("$m=");
+    SX(ctx->m_cost);
+    SS(",t=");
+    SX(ctx->t_cost);
+    SS(",p=");
+    SX(ctx->lanes);
+
+    /* "$<salt>$<out>" */
+    SS("$");
+    SB(ctx->salt, ctx->saltlen);
+    SS("$");
+    SB(ctx->out, ctx->outlen);
+
+    return ARGON2_OK;
+
+#undef SS
+#undef SX
+#undef SB
+}
+
+/*
+ * Number of characters in the unpadded Base64 encoding of 'len' bytes,
+ * not counting a terminating zero.
+ */
+size_t b64len(uint32_t len) {
+    const uint32_t tail = len % 3;
+    size_t chars = ((size_t)len / 3) * 4;
+
+    /* A partial group of 1 or 2 bytes needs 2 or 3 characters. */
+    if (tail == 2) {
+        chars += 3;
+    } else if (tail == 1) {
+        chars += 2;
+    }
+
+    return chars;
+}
+
+/* Number of decimal digits needed to write 'num' (1 for zero). */
+size_t numlen(uint32_t num) {
+    size_t digits;
+
+    for (digits = 1; num >= 10; num /= 10) {
+        digits++;
+    }
+    return digits;
+}
+
--- a/algo/argon2/argon2d/argon2d/encoding.h
+++ b/algo/argon2/argon2d/argon2d/encoding.h
@@ -0,0 +1,57 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ENCODING_H
+#define ENCODING_H
+#include "argon2.h"
+
+#define ARGON2_MAX_DECODED_LANES UINT32_C(255)
+#define ARGON2_MIN_DECODED_SALT_LEN UINT32_C(8)
+#define ARGON2_MIN_DECODED_OUT_LEN UINT32_C(12)
+
+/*
+* Encodes an Argon2 hash string into the provided buffer. 'dst_len'
+* contains the size, in characters, of the 'dst' buffer; if 'dst_len'
+* is less than the number of required characters (including the
+* terminating 0), then this function returns ARGON2_ENCODING_FAIL.
+*
+* On success, ARGON2_OK is returned.
+*/
+int encode_string(char *dst, size_t dst_len, argon2_context *ctx,
+                  argon2_type type);
+
+/*
+* Decodes an Argon2 hash string into the provided structure 'ctx'.
+* The only fields that must be set prior to this call are ctx.saltlen and
+* ctx.outlen (which must be the maximal salt and out length values that are
+* allowed), ctx.salt and ctx.out (which must be buffers of the specified
+* length), and ctx.pwd and ctx.pwdlen which must hold a valid password.
+*
+* Invalid input string causes an error. On success, the ctx is valid and all
+* fields have been initialized.
+*
+* Returned value is ARGON2_OK on success, other ARGON2_ codes on error.
+*/
+int decode_string(argon2_context *ctx, const char *str, argon2_type type);
+
+/* Returns the length of the unpadded Base64 encoding of len bytes */
+size_t b64len(uint32_t len);
+
+/* Returns the number of decimal digits needed to write num */
+size_t numlen(uint32_t num);
+
+#endif
--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -0,0 +1,359 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "argon2.h"
+#include "core.h"
+
+#include "../blake2/blake2.h"
+#include "../blake2/blamka-round-opt.h"
+
+/*
+ * Function fills a new memory block and optionally XORs the old block over the new one.
+ * Memory must be initialized.
+ * @param state Pointer to the just produced block. Content will be updated(!)
+ * @param ref_block Pointer to the reference block
+ * @param next_block Pointer to the block to be XORed over. May coincide with @ref_block
+ * @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
+ * @pre all block pointers must be valid
+ */
+
+#if defined(__AVX512F__)
+
+static void fill_block(__m512i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) { /* block_XY = state ^ ref_block ^ old next_block */
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+            block_XY[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
+        }
+    } else { /* block_XY = state ^ ref_block */
+        for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm512_xor_si512(
+                state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
+        }
+    }
+
+    BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
+                    state[ 4], state[ 5], state[ 6], state[ 7] );
+    BLAKE2_ROUND_1( state[ 8], state[ 9], state[10], state[11],
+                    state[12], state[13], state[14], state[15] );
+
+    BLAKE2_ROUND_2( state[ 0], state[ 2], state[ 4], state[ 6],
+                    state[ 8], state[10], state[12], state[14] );
+    BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
+                    state[ 9], state[11], state[13], state[15] );
+
+/*
+    Loop form of the unrolled BLAKE2_ROUND_1 / BLAKE2_ROUND_2 calls above
+    (same state indices), kept for reference:
+
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_1(
+            state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
+            state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 2; ++i) {
+        BLAKE2_ROUND_2(
+            state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
+            state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
+    }
+*/
+
+    for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
+        state[i] = _mm512_xor_si512(state[i], block_XY[i]); /* feed-forward */
+        _mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
+    }
+}
+
+#elif defined(__AVX2__)
+
+static void fill_block(__m256i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) { /* block_XY = state ^ ref_block ^ old next_block */
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+            block_XY[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
+        }
+    } else { /* block_XY = state ^ ref_block */
+        for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm256_xor_si256(
+                state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
+        }
+    }
+
+    BLAKE2_ROUND_1( state[ 0], state[ 4], state[ 1], state[ 5],
+                    state[ 2], state[ 6], state[ 3], state[ 7] );
+    BLAKE2_ROUND_1( state[ 8], state[12], state[ 9], state[13],
+                    state[10], state[14], state[11], state[15] );
+    BLAKE2_ROUND_1( state[16], state[20], state[17], state[21],
+                    state[18], state[22], state[19], state[23] );
+    BLAKE2_ROUND_1( state[24], state[28], state[25], state[29],
+                    state[26], state[30], state[27], state[31] );
+
+    BLAKE2_ROUND_2( state[ 0], state[ 4], state[ 8], state[12],
+                    state[16], state[20], state[24], state[28] );
+    BLAKE2_ROUND_2( state[ 1], state[ 5], state[ 9], state[13],
+                    state[17], state[21], state[25], state[29] );
+    BLAKE2_ROUND_2( state[ 2], state[ 6], state[10], state[14],
+                    state[18], state[22], state[26], state[30] );
+    BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
+                    state[19], state[23], state[27], state[31] );
+
+/*
+    Loop form of the unrolled BLAKE2_ROUND_1 / BLAKE2_ROUND_2 calls above
+    (same state indices), kept for reference:
+
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
+                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 4; ++i) {
+        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
+                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
+    }
+*/
+
+    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
+        state[i] = _mm256_xor_si256(state[i], block_XY[i]); /* feed-forward */
+        _mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
+    }
+}
+
+#else  // SSE2
+
+static void fill_block(__m128i *state, const block *ref_block,
+                       block *next_block, int with_xor) {
+    __m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
+    unsigned int i;
+
+    if (with_xor) { /* block_XY = state ^ ref_block ^ old next_block */
+        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+            state[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+            block_XY[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)next_block->v + i));
+        }
+    } else { /* block_XY = state ^ ref_block */
+        for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+            block_XY[i] = state[i] = _mm_xor_si128(
+                state[i], _mm_loadu_si128((const __m128i *)ref_block->v + i));
+        }
+    }
+
+    BLAKE2_ROUND( state[ 0], state[ 1], state[ 2], state[ 3],
+                  state[ 4], state[ 5], state[ 6], state[ 7] );
+    BLAKE2_ROUND( state[ 8], state[ 9], state[10], state[11], 
+                  state[12], state[13], state[14], state[15] );
+    BLAKE2_ROUND( state[16], state[17], state[18], state[19], 
+                  state[20], state[21], state[22], state[23] );
+    BLAKE2_ROUND( state[24], state[25], state[26], state[27], 
+                  state[28], state[29], state[30], state[31] );
+    BLAKE2_ROUND( state[32], state[33], state[34], state[35], 
+                  state[36], state[37], state[38], state[39] );
+    BLAKE2_ROUND( state[40], state[41], state[42], state[43], 
+                  state[44], state[45], state[46], state[47] );
+    BLAKE2_ROUND( state[48], state[49], state[50], state[51], 
+                  state[52], state[53], state[54], state[55] );
+    BLAKE2_ROUND( state[56], state[57], state[58], state[59], 
+                  state[60], state[61], state[62], state[63] );
+
+    BLAKE2_ROUND( state[ 0], state[ 8], state[16], state[24], 
+                  state[32], state[40], state[48], state[56] );
+    BLAKE2_ROUND( state[ 1], state[ 9], state[17], state[25],  
+                  state[33], state[41], state[49], state[57] );
+    BLAKE2_ROUND( state[ 2], state[10], state[18], state[26],  
+                  state[34], state[42], state[50], state[58] );
+    BLAKE2_ROUND( state[ 3], state[11], state[19], state[27],  
+                  state[35], state[43], state[51], state[59] );
+    BLAKE2_ROUND( state[ 4], state[12], state[20], state[28],  
+                  state[36], state[44], state[52], state[60] );
+    BLAKE2_ROUND( state[ 5], state[13], state[21], state[29],  
+                  state[37], state[45], state[53], state[61] );
+    BLAKE2_ROUND( state[ 6], state[14], state[22], state[30],  
+                  state[38], state[46], state[54], state[62] );
+    BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],  
+                  state[39], state[47], state[55], state[63] );
+
+/*
+    Loop form of the unrolled BLAKE2_ROUND calls above (same state indices),
+    kept for reference:
+
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
+            state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
+            state[8 * i + 6], state[8 * i + 7]);
+    }
+
+    for (i = 0; i < 8; ++i) {
+        BLAKE2_ROUND(state[8 * 0 + i], state[8 * 1 + i], state[8 * 2 + i],
+            state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
+            state[8 * 6 + i], state[8 * 7 + i]);
+    }
+*/
+    for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
+        state[i] = _mm_xor_si128(state[i], block_XY[i]); /* feed-forward */
+        _mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
+    }
+}
+
+#endif
+
+#if 0 /* compiled out: its only callers, in fill_segment(), are commented out */
+static void next_addresses(block *address_block, block *input_block) {
+    /*Temporary zero-initialized blocks*/
+#if defined(__AVX512F__)
+    __m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+    __m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
+    __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];
+#else
+    __m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
+    __m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
+#endif
+
+    memset(zero_block, 0, sizeof(zero_block));
+    memset(zero2_block, 0, sizeof(zero2_block));
+
+    /*Increasing index counter*/
+    input_block->v[6]++;
+
+    /*First iteration of G: address_block = G(0, input_block)*/
+    fill_block(zero_block, input_block, address_block, 0);
+
+    /*Second iteration of G, applied to the result of the first*/
+    fill_block(zero2_block, address_block, address_block, 0);
+}
+#endif
+
+void fill_segment(const argon2_instance_t *instance,
+                  argon2_position_t position) {
+    block *ref_block = NULL, *curr_block = NULL;
+//    block address_block, input_block;   (data-independent addressing only)
+    uint64_t pseudo_rand, ref_index, ref_lane;
+    uint32_t prev_offset, curr_offset;
+    uint32_t starting_index, i;
+#if defined(__AVX512F__)
+    __m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
+#elif defined(__AVX2__)
+    __m256i state[ARGON2_HWORDS_IN_BLOCK];
+#else
+    __m128i state[ARGON2_OWORDS_IN_BLOCK];
+#endif
+//    int data_independent_addressing;
+
+    if (instance == NULL) {
+        return;
+    }
+    /* The data-independent addressing path (type Argon2_i, and the first
+     * ARGON2_SYNC_POINTS / 2 slices of pass 0 for Argon2_id) is disabled in
+     * this function; its code is kept below, commented out, for reference.
+     * Only the data-dependent path is executed. */
+    // data_independent_addressing =
+    //     (instance->type == Argon2_i) ||
+    //     (instance->type == Argon2_id && (position.pass == 0) &&
+    //      (position.slice < ARGON2_SYNC_POINTS / 2));
+
+    // if (data_independent_addressing) {
+    //     init_block_value(&input_block, 0);
+
+    //     input_block.v[0] = position.pass;
+    //     input_block.v[1] = position.lane;
+    //     input_block.v[2] = position.slice;
+    //     input_block.v[3] = instance->memory_blocks;
+    //     input_block.v[4] = instance->passes;
+    //     input_block.v[5] = instance->type;
+    // }
+
+    starting_index = 0;
+
+    if ((0 == position.pass) && (0 == position.slice)) {
+        starting_index = 2; /* we have already generated the first two blocks */
+
+        /* Don't forget to generate the first block of addresses: */
+//        if (data_independent_addressing) {
+//            next_addresses(&address_block, &input_block);
+//        }
+    }
+
+    /* Offset of the current block */
+    curr_offset = position.lane * instance->lane_length +
+                  position.slice * instance->segment_length + starting_index;
+
+    if (0 == curr_offset % instance->lane_length) {
+        /* First block of the lane: its predecessor is the lane's last block */
+        prev_offset = curr_offset + instance->lane_length - 1;
+    } else {
+        /* Previous block */
+        prev_offset = curr_offset - 1;
+    }
+
+    /* state starts out as a copy of the previous block */
+    memcpy(state, ((instance->memory + prev_offset)->v), ARGON2_BLOCK_SIZE);
+
+    for (i = starting_index; i < instance->segment_length;
+         ++i, ++curr_offset, ++prev_offset) {
+        /* 1.1 Rotating prev_offset if needed: once past the first block of a
+         * lane, the predecessor is simply curr_offset - 1 again */
+        if (curr_offset % instance->lane_length == 1) {
+            prev_offset = curr_offset - 1;
+        }
+
+        /* 1.2 Computing the index of the reference block */
+        /* 1.2.1 Taking pseudo-random value from the previous block */
+//        if (data_independent_addressing) {
+//            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
+//                next_addresses(&address_block, &input_block);
+//            }
+//            pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
+//        } else {
+            pseudo_rand = instance->memory[prev_offset].v[0];
+//        }
+
+        /* 1.2.2 Computing the lane of the reference block (upper 32 bits of
+         * pseudo_rand) */
+        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
+
+        if ((position.pass == 0) && (position.slice == 0)) {
+            /* Can not reference other lanes yet */
+            ref_lane = position.lane;
+        }
+
+        /* 1.2.3 Computing the number of possible reference block within the
+         * lane (index_alpha receives the lower 32 bits of pseudo_rand).
+         */
+        position.index = i;
+        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
+                                ref_lane == position.lane);
+
+        /* 2 Creating a new block */
+        ref_block =
+            instance->memory + instance->lane_length * ref_lane + ref_index;
+        curr_block = instance->memory + curr_offset;
+         if (ARGON2_VERSION_10 == instance->version) {
+             /* version 1.2.1 and earlier: overwrite, not XOR */
+             fill_block(state, ref_block, curr_block, 0);
+         } else {
+             if(0 == position.pass) { /* pass 0 overwrites the block */
+                fill_block(state, ref_block, curr_block, 0);
+             } else { /* later passes XOR into the block's old content */
+                 fill_block(state, ref_block, curr_block, 1);
+             }
+         }
+    }
+}
--- a/algo/argon2/argon2d/argon2d/thread.c
+++ b/algo/argon2/argon2d/argon2d/thread.c
@@ -0,0 +1,57 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#if !defined(ARGON2_NO_THREADS)
+
+#include "thread.h"
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+int argon2_thread_create(argon2_thread_handle_t *handle,
+                         argon2_thread_func_t func, void *args) {
+    if (NULL == handle || func == NULL) { /* reject a missing out-param or entry point */
+        return -1;
+    }
+#if defined(_WIN32)
+    *handle = _beginthreadex(NULL, 0, func, args, 0, NULL); /* a zero handle is treated as failure below */
+    return *handle != 0 ? 0 : -1; /* normalise to 0 on success, -1 on failure */
+#else
+    return pthread_create(handle, NULL, func, args); /* pthread_create's own result is passed through, so failure is not necessarily -1 */
+#endif
+}
+
+int argon2_thread_join(argon2_thread_handle_t handle) { /* wait for the thread identified by handle to finish */
+#if defined(_WIN32)
+    if (WaitForSingleObject((HANDLE)handle, INFINITE) == WAIT_OBJECT_0) { /* wait with no timeout */
+        return CloseHandle((HANDLE)handle) != 0 ? 0 : -1; /* 0 only if the handle is also closed successfully */
+    }
+    return -1; /* the wait did not report WAIT_OBJECT_0; the handle is not closed on this path */
+#else
+    return pthread_join(handle, NULL); /* the thread's return value is discarded */
+#endif
+}
+
+void argon2_thread_exit(void) { /* end the calling thread through the platform's exit primitive */
+#if defined(_WIN32)
+    _endthreadex(0); /* exit code 0 */
+#else
+    pthread_exit(NULL); /* NULL exit value */
+#endif
+}
+
+#endif /* ARGON2_NO_THREADS */
--- a/algo/argon2/argon2d/argon2d/thread.h
+++ b/algo/argon2/argon2d/argon2d/thread.h
@@ -0,0 +1,67 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef ARGON2_THREAD_H
+#define ARGON2_THREAD_H
+
+#if !defined(ARGON2_NO_THREADS)
+
+/*
+        Here we implement an abstraction layer for the simple requirements
+        of the Argon2 code. We only require 3 primitives---thread creation,
+        joining, and termination---so full emulation of the pthreads API
+        is unwarranted. Currently we wrap pthreads and Win32 threads.
+
+        The API defines 2 types: the thread entry-point function pointer
+        type, argon2_thread_func_t, and the type of the thread
+        handle, argon2_thread_handle_t.
+*/
+#if defined(_WIN32)
+#include <process.h>
+typedef unsigned(__stdcall *argon2_thread_func_t)(void *); /* entry-point signature handed to _beginthreadex in thread.c */
+typedef uintptr_t argon2_thread_handle_t; /* integer handle; thread.c stores _beginthreadex's result in it */
+#else
+#include <pthread.h>
+typedef void *(*argon2_thread_func_t)(void *); /* entry-point signature handed to pthread_create in thread.c */
+typedef pthread_t argon2_thread_handle_t; /* native pthread identifier */
+#endif
+
+/* Creates a thread
+ * @param handle pointer to a thread handle, which is the output of this
+ * function. Must not be NULL.
+ * @param func A function pointer for the thread's entry point. Must not be
+ * NULL.
+ * @param args Pointer that is passed as an argument to @func. May be NULL.
+ * @return 0 if @handle and @func are valid pointers and a thread is successfully
+ * created; a non-zero value otherwise.
+ */
+int argon2_thread_create(argon2_thread_handle_t *handle,
+                         argon2_thread_func_t func, void *args);
+
+/* Waits for a thread to terminate
+ * @param handle Handle to a thread created with argon2_thread_create.
+ * @return 0 if @handle is a valid handle, and joining completed successfully;
+ * a non-zero value otherwise. */
+int argon2_thread_join(argon2_thread_handle_t handle);
+
+/* Terminates the calling thread. Must be run inside a thread created by
+ * argon2_thread_create. The caller cannot supply an exit status.
+*/
+void argon2_thread_exit(void);
+
+#endif /* ARGON2_NO_THREADS */
+#endif
--- a/algo/argon2/argon2d/blake2/blake2-impl.h
+++ b/algo/argon2/argon2d/blake2/blake2-impl.h
@@ -0,0 +1,156 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef PORTABLE_BLAKE2_IMPL_H
+#define PORTABLE_BLAKE2_IMPL_H
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(_MSC_VER)
+#define BLAKE2_INLINE __inline
+#elif defined(__GNUC__) || defined(__clang__)
+#define BLAKE2_INLINE __inline__
+#else
+#define BLAKE2_INLINE
+#endif
+
+/* Argon2 Team - Begin Code */
+/*
+   Not an exhaustive list, but should cover the majority of modern platforms.
+   Additionally, the code will always be correct---this is only a performance
+   tweak.
+*/
+#if (defined(__BYTE_ORDER__) &&                                                \
+     (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) ||                           \
+    defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \
+    defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) ||       \
+    defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) ||                \
+    defined(_M_ARM)
+#define NATIVE_LITTLE_ENDIAN
+#endif
+/* Argon2 Team - End Code */
+
+static BLAKE2_INLINE uint32_t load32(const void *src) { /* read 4 bytes at src as a little-endian uint32_t */
+#if defined(NATIVE_LITTLE_ENDIAN)
+    uint32_t w;
+    memcpy(&w, src, sizeof w); /* host order already matches; memcpy avoids a cast-and-dereference of src */
+    return w;
+#else
+    const uint8_t *p = (const uint8_t *)src;
+    uint32_t w = *p++; /* byte 0 is the least-significant byte */
+    w |= (uint32_t)(*p++) << 8;
+    w |= (uint32_t)(*p++) << 16;
+    w |= (uint32_t)(*p++) << 24;
+    return w;
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load64(const void *src) { /* read 8 bytes at src as a little-endian uint64_t */
+#if defined(NATIVE_LITTLE_ENDIAN)
+    uint64_t w;
+    memcpy(&w, src, sizeof w); /* host order already matches; memcpy avoids a cast-and-dereference of src */
+    return w;
+#else
+    const uint8_t *p = (const uint8_t *)src;
+    uint64_t w = *p++; /* byte 0 is the least-significant byte */
+    w |= (uint64_t)(*p++) << 8;
+    w |= (uint64_t)(*p++) << 16;
+    w |= (uint64_t)(*p++) << 24;
+    w |= (uint64_t)(*p++) << 32;
+    w |= (uint64_t)(*p++) << 40;
+    w |= (uint64_t)(*p++) << 48;
+    w |= (uint64_t)(*p++) << 56;
+    return w;
+#endif
+}
+
+static BLAKE2_INLINE void store32(void *dst, uint32_t w) { /* write w to dst as 4 little-endian bytes */
+#if defined(NATIVE_LITTLE_ENDIAN)
+    memcpy(dst, &w, sizeof w); /* host order already matches the output order */
+#else
+    uint8_t *p = (uint8_t *)dst;
+    *p++ = (uint8_t)w; /* least-significant byte first */
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+#endif
+}
+
+static BLAKE2_INLINE void store64(void *dst, uint64_t w) { /* write w to dst as 8 little-endian bytes */
+#if defined(NATIVE_LITTLE_ENDIAN)
+    memcpy(dst, &w, sizeof w); /* host order already matches the output order */
+#else
+    uint8_t *p = (uint8_t *)dst;
+    *p++ = (uint8_t)w; /* least-significant byte first */
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+#endif
+}
+
+static BLAKE2_INLINE uint64_t load48(const void *src) { /* read 6 bytes at src as a little-endian value; the top 16 bits of the result are zero */
+    const uint8_t *p = (const uint8_t *)src;
+    uint64_t w = *p++; /* byte 0 is the least-significant byte */
+    w |= (uint64_t)(*p++) << 8;
+    w |= (uint64_t)(*p++) << 16;
+    w |= (uint64_t)(*p++) << 24;
+    w |= (uint64_t)(*p++) << 32;
+    w |= (uint64_t)(*p++) << 40;
+    return w; /* always assembled byte by byte: there is no native-endian fast path here */
+}
+
+static BLAKE2_INLINE void store48(void *dst, uint64_t w) { /* write the low 48 bits of w to dst as 6 little-endian bytes */
+    uint8_t *p = (uint8_t *)dst;
+    *p++ = (uint8_t)w; /* least-significant byte first */
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w;
+    w >>= 8;
+    *p++ = (uint8_t)w; /* sixth and last byte; bits 48..63 of w are never written */
+}
+
+static BLAKE2_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { /* rotate w right by c bits */
+    return (w >> c) | (w << (32 - c)); /* requires 0 < c < 32: c == 0 would shift left by 32, which is undefined in C */
+}
+
+static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { /* rotate w right by c bits */
+    return (w >> c) | (w << (64 - c)); /* requires 0 < c < 64: c == 0 would shift left by 64, which is undefined in C */
+}
+
+void clear_internal_memory(void *v, size_t n); /* defined outside this header; blake2b.c calls it to wipe n bytes of hash state and temporary buffers at v */
+
+#endif
--- a/algo/argon2/argon2d/blake2/blake2.h
+++ b/algo/argon2/argon2d/blake2/blake2.h
@@ -0,0 +1,91 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef PORTABLE_BLAKE2_H
+#define PORTABLE_BLAKE2_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <limits.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+enum blake2b_constant { /* byte sizes used throughout the BLAKE2b code */
+    BLAKE2B_BLOCKBYTES = 128, /* size of one compression input block and of blake2b_state.buf */
+    BLAKE2B_OUTBYTES = 64, /* largest digest length accepted by the init functions */
+    BLAKE2B_KEYBYTES = 64, /* largest key length accepted by blake2b_init_key */
+    BLAKE2B_SALTBYTES = 16, /* size of blake2b_param.salt */
+    BLAKE2B_PERSONALBYTES = 16 /* size of blake2b_param.personal */
+};
+
+#pragma pack(push, 1)
+typedef struct __blake2b_param { /* packed parameter block; trailing numbers are the cumulative byte offset after each field */
+    uint8_t digest_length;                   /* 1 */
+    uint8_t key_length;                      /* 2 */
+    uint8_t fanout;                          /* 3 */
+    uint8_t depth;                           /* 4 */
+    uint32_t leaf_length;                    /* 8 */
+    uint64_t node_offset;                    /* 16 */
+    uint8_t node_depth;                      /* 17 */
+    uint8_t inner_length;                    /* 18 */
+    uint8_t reserved[14];                    /* 32 */
+    uint8_t salt[BLAKE2B_SALTBYTES];         /* 48 */
+    uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */
+} blake2b_param; /* blake2b_init_param reads this as eight 64-bit words and XORs them into the IV */
+#pragma pack(pop)
+
+typedef struct __blake2b_state { /* streaming hash context */
+    uint64_t h[8]; /* chaining value; serialised as the digest by blake2b_final */
+    uint64_t t[2]; /* 128-bit count of bytes fed to compression: t[0] low word, t[1] high word */
+    uint64_t f[2]; /* finalisation flags: f[0] marks the last block, f[1] the last node */
+    uint8_t buf[BLAKE2B_BLOCKBYTES]; /* input bytes not yet compressed */
+    unsigned buflen; /* number of valid bytes in buf */
+    unsigned outlen; /* digest length in bytes chosen at init time */
+    uint8_t last_node; /* when non-zero, blake2b_set_lastblock also sets f[1] */
+} blake2b_state;
+
+/* Ensure param structs have not been wrongly padded */
+/* Poor man's static_assert: a failed check makes the divisor 0, which the compiler rejects in an enumerator */
+enum {
+    blake2_size_check_0 = 1 / !!(CHAR_BIT == 8),
+    blake2_size_check_2 =
+        1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT) /* expected size is 8 * 8 = 64 bytes */
+};
+
+/* Streaming API: each function returns 0 on success and -1 on failure */
+int blake2b_init(blake2b_state *S, size_t outlen); /* unkeyed; outlen must be in 1..BLAKE2B_OUTBYTES */
+int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
+                     size_t keylen); /* keyed; keylen must be in 1..BLAKE2B_KEYBYTES */
+int blake2b_init_param(blake2b_state *S, const blake2b_param *P); /* initialise from an explicit parameter block */
+int blake2b_update(blake2b_state *S, const void *in, size_t inlen); /* absorb inlen bytes */
+int blake2b_final(blake2b_state *S, void *out, size_t outlen); /* outlen must be at least the digest length chosen at init */
+
+/* Simple API: one-shot hash of in, optionally keyed; returns 0 on success, -1 on failure */
+int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
+                         const void *key, size_t keylen);
+
+/* Argon2 Team - Begin Code */
+int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen); /* variable-length output; outlen may exceed BLAKE2B_OUTBYTES */
+/* Argon2 Team - End Code */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/algo/argon2/argon2d/blake2/blake2b.c
+++ b/algo/argon2/argon2d/blake2/blake2b.c
@@ -0,0 +1,390 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+static const uint64_t blake2b_IV[8] = { /* initial value of h[] (blake2b_init0) and of v[8..15] (blake2b_compress) */
+    UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b),
+    UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1),
+    UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f),
+    UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179)};
+
+static const unsigned int blake2b_sigma[12][16] = { /* row r gives the order in which round r consumes the 16 message words */
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+    {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
+    {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
+    {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
+    {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
+    {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
+    {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
+    {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
+    {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, /* rows 10 and 11 repeat rows 0 and 1 */
+    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
+};
+
+static BLAKE2_INLINE void blake2b_set_lastnode(blake2b_state *S) { /* raise the last-node flag */
+    S->f[1] = (uint64_t)-1; /* all bits set */
+}
+
+static BLAKE2_INLINE void blake2b_set_lastblock(blake2b_state *S) { /* raise the last-block flag before the final compression */
+    if (S->last_node) { /* also flag the last node when the state asks for it */
+        blake2b_set_lastnode(S);
+    }
+    S->f[0] = (uint64_t)-1; /* non-zero f[0] is what blake2b_update/blake2b_final treat as "already finalised" */
+}
+
+static BLAKE2_INLINE void blake2b_increment_counter(blake2b_state *S,
+                                                    uint64_t inc) { /* add inc to the 128-bit counter t[1]:t[0] */
+    S->t[0] += inc;
+    S->t[1] += (S->t[0] < inc); /* carry: the low word wrapped if it is now smaller than inc */
+}
+
+static BLAKE2_INLINE void blake2b_invalidate_state(blake2b_state *S) { /* used on init errors so the state cannot be hashed with */
+    clear_internal_memory(S, sizeof(*S));      /* wipe */
+    blake2b_set_lastblock(S); /* invalidate for further use: f[0] != 0 makes update/final return -1 */
+}
+
+static BLAKE2_INLINE void blake2b_init0(blake2b_state *S) { /* reset S to the parameter-free starting state */
+    memset(S, 0, sizeof(*S)); /* zero counters, flags, buffer and lengths */
+    memcpy(S->h, blake2b_IV, sizeof(S->h)); /* chaining value starts as the IV */
+}
+
+int blake2b_init_param(blake2b_state *S, const blake2b_param *P) { /* initialise S from parameter block P; 0 on success, -1 if either pointer is NULL */
+    const unsigned char *p = (const unsigned char *)P; /* byte view of the packed parameter block */
+    unsigned int i;
+
+    if (NULL == P || NULL == S) {
+        return -1;
+    }
+
+    blake2b_init0(S);
+    /* IV XOR Parameter Block: the 64-byte block is read as eight little-endian 64-bit words */
+    for (i = 0; i < 8; ++i) {
+        S->h[i] ^= load64(&p[i * sizeof(S->h[i])]);
+    }
+    S->outlen = P->digest_length; /* remembered for blake2b_final; the field values are not validated here */
+    return 0;
+}
+
+/* Sequential blake2b initialization (unkeyed). Returns 0 on success, -1 on a NULL state or bad outlen */
+int blake2b_init(blake2b_state *S, size_t outlen) {
+    blake2b_param P;
+
+    if (S == NULL) {
+        return -1;
+    }
+
+    if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) {
+        blake2b_invalidate_state(S); /* leave S unusable so later update/final calls fail */
+        return -1;
+    }
+
+    /* Setup Parameter Block for unkeyed BLAKE2: every field is assigned, so P holds no indeterminate bytes */
+    P.digest_length = (uint8_t)outlen; /* fits: outlen <= BLAKE2B_OUTBYTES was checked above */
+    P.key_length = 0;
+    P.fanout = 1;
+    P.depth = 1;
+    P.leaf_length = 0;
+    P.node_offset = 0;
+    P.node_depth = 0;
+    P.inner_length = 0;
+    memset(P.reserved, 0, sizeof(P.reserved));
+    memset(P.salt, 0, sizeof(P.salt));
+    memset(P.personal, 0, sizeof(P.personal));
+
+    return blake2b_init_param(S, &P);
+}
+
+int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
+                     size_t keylen) { /* keyed initialisation; 0 on success, -1 on any invalid argument */
+    blake2b_param P;
+
+    if (S == NULL) {
+        return -1;
+    }
+
+    if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) {
+        blake2b_invalidate_state(S);
+        return -1;
+    }
+
+    if ((key == 0) || (keylen == 0) || (keylen > BLAKE2B_KEYBYTES)) { /* a key is mandatory here; use blake2b_init for unkeyed hashing */
+        blake2b_invalidate_state(S);
+        return -1;
+    }
+
+    /* Setup Parameter Block for keyed BLAKE2: same as the unkeyed block except for key_length */
+    P.digest_length = (uint8_t)outlen;
+    P.key_length = (uint8_t)keylen;
+    P.fanout = 1;
+    P.depth = 1;
+    P.leaf_length = 0;
+    P.node_offset = 0;
+    P.node_depth = 0;
+    P.inner_length = 0;
+    memset(P.reserved, 0, sizeof(P.reserved));
+    memset(P.salt, 0, sizeof(P.salt));
+    memset(P.personal, 0, sizeof(P.personal));
+
+    if (blake2b_init_param(S, &P) < 0) {
+        blake2b_invalidate_state(S);
+        return -1;
+    }
+
+    { /* the key, zero-padded to one full block, is absorbed as the first input block */
+        uint8_t block[BLAKE2B_BLOCKBYTES];
+        memset(block, 0, BLAKE2B_BLOCKBYTES);
+        memcpy(block, key, keylen);
+        blake2b_update(S, block, BLAKE2B_BLOCKBYTES); /* return value is not checked */
+        /* Burn the key from stack */
+        clear_internal_memory(block, BLAKE2B_BLOCKBYTES);
+    }
+    return 0;
+}
+
+static void blake2b_compress(blake2b_state *S, const uint8_t *block) { /* mix one 128-byte block into S->h */
+    uint64_t m[16]; /* the block as sixteen little-endian 64-bit message words */
+    uint64_t v[16]; /* working vector */
+    unsigned int i, r;
+
+    for (i = 0; i < 16; ++i) {
+        m[i] = load64(block + i * sizeof(m[i]));
+    }
+
+    for (i = 0; i < 8; ++i) {
+        v[i] = S->h[i]; /* first half: current chaining value */
+    }
+
+    v[8] = blake2b_IV[0]; /* second half: IV, with counter and flags folded into the last four words */
+    v[9] = blake2b_IV[1];
+    v[10] = blake2b_IV[2];
+    v[11] = blake2b_IV[3];
+    v[12] = blake2b_IV[4] ^ S->t[0];
+    v[13] = blake2b_IV[5] ^ S->t[1];
+    v[14] = blake2b_IV[6] ^ S->f[0];
+    v[15] = blake2b_IV[7] ^ S->f[1];
+
+#define G(r, i, a, b, c, d)                                                    \
+    do {                                                                       \
+        a = a + b + m[blake2b_sigma[r][2 * i + 0]];                            \
+        d = rotr64(d ^ a, 32);                                                 \
+        c = c + d;                                                             \
+        b = rotr64(b ^ c, 24);                                                 \
+        a = a + b + m[blake2b_sigma[r][2 * i + 1]];                            \
+        d = rotr64(d ^ a, 16);                                                 \
+        c = c + d;                                                             \
+        b = rotr64(b ^ c, 63);                                                 \
+    } while ((void)0, 0)
+
+#define ROUND(r)                                                               \
+    do {                                                                       \
+        G(r, 0, v[0], v[4], v[8], v[12]);                                      \
+        G(r, 1, v[1], v[5], v[9], v[13]);                                      \
+        G(r, 2, v[2], v[6], v[10], v[14]);                                     \
+        G(r, 3, v[3], v[7], v[11], v[15]);                                     \
+        G(r, 4, v[0], v[5], v[10], v[15]);                                     \
+        G(r, 5, v[1], v[6], v[11], v[12]);                                     \
+        G(r, 6, v[2], v[7], v[8], v[13]);                                      \
+        G(r, 7, v[3], v[4], v[9], v[14]);                                      \
+    } while ((void)0, 0)
+
+    for (r = 0; r < 12; ++r) { /* twelve rounds, one blake2b_sigma row each */
+        ROUND(r);
+    }
+
+    for (i = 0; i < 8; ++i) {
+        S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; /* fold both halves of v back into the chaining value */
+    }
+
+#undef G
+#undef ROUND
+}
+
+int blake2b_update(blake2b_state *S, const void *in, size_t inlen) { /* absorb inlen bytes from in; 0 on success, -1 on error */
+    const uint8_t *pin = (const uint8_t *)in;
+
+    if (inlen == 0) { /* checked first, so an empty update succeeds even with NULL S or in */
+        return 0;
+    }
+
+    /* Sanity check */
+    if (S == NULL || in == NULL) {
+        return -1;
+    }
+
+    /* Is this a reused state? f[0] is set by finalisation and by blake2b_invalidate_state */
+    if (S->f[0] != 0) {
+        return -1;
+    }
+
+    if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) { /* strictly greater: an exactly full block stays buffered for blake2b_final */
+        /* Complete current block */
+        size_t left = S->buflen;
+        size_t fill = BLAKE2B_BLOCKBYTES - left;
+        memcpy(&S->buf[left], pin, fill);
+        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+        blake2b_compress(S, S->buf);
+        S->buflen = 0;
+        inlen -= fill;
+        pin += fill;
+        /* Avoid buffer copies when possible: whole blocks are compressed straight from the input */
+        while (inlen > BLAKE2B_BLOCKBYTES) {
+            blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+            blake2b_compress(S, pin);
+            inlen -= BLAKE2B_BLOCKBYTES;
+            pin += BLAKE2B_BLOCKBYTES;
+        }
+    }
+    memcpy(&S->buf[S->buflen], pin, inlen); /* stash the remainder (at most one block) for later */
+    S->buflen += (unsigned int)inlen;
+    return 0;
+}
+
+int blake2b_final(blake2b_state *S, void *out, size_t outlen) { /* finish the hash and write S->outlen bytes to out; 0 on success, -1 on error */
+    uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
+    unsigned int i;
+
+    /* Sanity checks: out must have room for at least the digest length chosen at init */
+    if (S == NULL || out == NULL || outlen < S->outlen) {
+        return -1;
+    }
+
+    /* Is this a reused state? */
+    if (S->f[0] != 0) {
+        return -1;
+    }
+
+    blake2b_increment_counter(S, S->buflen); /* count only the bytes actually buffered */
+    blake2b_set_lastblock(S);
+    memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */
+    blake2b_compress(S, S->buf);
+
+    for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */
+        store64(buffer + sizeof(S->h[i]) * i, S->h[i]);
+    }
+
+    memcpy(out, buffer, S->outlen); /* truncate to the requested digest length */
+    clear_internal_memory(buffer, sizeof(buffer));
+    clear_internal_memory(S->buf, sizeof(S->buf));
+    clear_internal_memory(S->h, sizeof(S->h)); /* f[0] stays set, so further update/final calls on S return -1 */
+    return 0;
+}
+
+int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
+            const void *key, size_t keylen) { /* one-shot hash, keyed when keylen > 0; 0 on success, -1 on error */
+    blake2b_state S;
+    int ret = -1; /* stays -1 unless blake2b_final is reached */
+
+    /* Verify parameters */
+    if (NULL == in && inlen > 0) {
+        goto fail;
+    }
+
+    if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) {
+        goto fail;
+    }
+
+    if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) {
+        goto fail;
+    }
+
+    if (keylen > 0) {
+        if (blake2b_init_key(&S, outlen, key, keylen) < 0) {
+            goto fail;
+        }
+    } else {
+        if (blake2b_init(&S, outlen) < 0) {
+            goto fail;
+        }
+    }
+
+    if (blake2b_update(&S, in, inlen) < 0) {
+        goto fail;
+    }
+    ret = blake2b_final(&S, out, outlen);
+
+fail: /* also reached on success; S is wiped on every path, including before it was initialised */
+    clear_internal_memory(&S, sizeof(S));
+    return ret;
+}
+
+/* Argon2 Team - Begin Code */
+int blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { /* hash in to outlen bytes, which may exceed BLAKE2B_OUTBYTES; 0 on success, negative on error */
+    uint8_t *out = (uint8_t *)pout;
+    blake2b_state blake_state;
+    uint8_t outlen_bytes[sizeof(uint32_t)] = {0};
+    int ret = -1;
+
+    if (outlen > UINT32_MAX) { /* the length prefix below is only 32 bits wide */
+        goto fail;
+    }
+
+    /* Ensure little-endian byte order! The encoded outlen is hashed ahead of the input */
+    store32(outlen_bytes, (uint32_t)outlen);
+
+#define TRY(statement)                                                         \
+    do {                                                                       \
+        ret = statement;                                                       \
+        if (ret < 0) {                                                         \
+            goto fail;                                                         \
+        }                                                                      \
+    } while ((void)0, 0)
+
+    if (outlen <= BLAKE2B_OUTBYTES) { /* short output: a single hash of outlen_bytes || in */
+        TRY(blake2b_init(&blake_state, outlen));
+        TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+        TRY(blake2b_update(&blake_state, in, inlen));
+        TRY(blake2b_final(&blake_state, out, outlen));
+    } else { /* long output: chain 64-byte digests, emitting the first half of each */
+        uint32_t toproduce;
+        uint8_t out_buffer[BLAKE2B_OUTBYTES];
+        uint8_t in_buffer[BLAKE2B_OUTBYTES];
+        TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES));
+        TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes)));
+        TRY(blake2b_update(&blake_state, in, inlen));
+        TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES));
+        memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+        out += BLAKE2B_OUTBYTES / 2;
+        toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2; /* bytes still owed to the caller */
+
+        while (toproduce > BLAKE2B_OUTBYTES) {
+            memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); /* previous full digest becomes the next input */
+            TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer,
+                        BLAKE2B_OUTBYTES, NULL, 0));
+            memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2);
+            out += BLAKE2B_OUTBYTES / 2;
+            toproduce -= BLAKE2B_OUTBYTES / 2;
+        }
+
+        memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES);
+        TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL,
+                    0)); /* last link is hashed directly to the remaining length and emitted whole */
+        memcpy(out, out_buffer, toproduce);
+    }
+fail: /* also reached on success; out_buffer and in_buffer are not wiped here */
+    clear_internal_memory(&blake_state, sizeof(blake_state));
+    return ret;
+#undef TRY
+}
+/* Argon2 Team - End Code */
--- a/algo/argon2/argon2d/blake2/blamka-round-opt.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-opt.h
@@ -0,0 +1,471 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_OPT_H
+#define BLAKE_ROUND_MKA_OPT_H
+
+#include "blake2-impl.h"
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
+#endif
+
+#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
+#include <x86intrin.h>
+#endif
+
+#if !defined(__AVX512F__)
+#if !defined(__AVX2__)
+#if !defined(__XOP__) /* provide _mm_roti_epi64 as a macro; c must be negative: rotates right by -(c) */
+#if defined(__SSSE3__)
+#define r16 /* pshufb mask: rotate each 64-bit lane right by 16 bits */        \
+    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define r24 /* pshufb mask: rotate each 64-bit lane right by 24 bits */        \
+    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define _mm_roti_epi64(x, c) /* fully parenthesized: expands to one expression */ \
+    ((-(c) == 32)                                                              \
+        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))                      \
+        : (-(c) == 24)                                                         \
+              ? _mm_shuffle_epi8((x), r24)                                     \
+              : (-(c) == 16)                                                   \
+                    ? _mm_shuffle_epi8((x), r16)                               \
+                    : (-(c) == 63)                                             \
+                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
+                                          _mm_add_epi64((x), (x)))             \
+                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)),           \
+                                          _mm_slli_epi64((x), 64 - (-(c)))))
+#else /* defined(__SSE2__) */
+#define _mm_roti_epi64(r, c)                                                   \
+    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
+#endif
+#else /* __XOP__: nothing defined here; presumably <x86intrin.h> (included above) supplies _mm_roti_epi64 */
+#endif /* !__XOP__ */
+
+static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) { /* two 64-bit lanes at once */
+    const __m128i z = _mm_mul_epu32(x, y); /* per 64-bit lane: low 32 bits of x times low 32 bits of y */
+    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z)); /* x + y + 2*z per lane */
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
+    do {                                                                       \
+        A0 = fBlaMka(A0, B0);                                                  \
+        A1 = fBlaMka(A1, B1);                                                  \
+                                                                               \
+        D0 = _mm_xor_si128(D0, A0);                                            \
+        D1 = _mm_xor_si128(D1, A1);                                            \
+                                                                               \
+        D0 = _mm_roti_epi64(D0, -32);                                          \
+        D1 = _mm_roti_epi64(D1, -32);                                          \
+                                                                               \
+        C0 = fBlaMka(C0, D0);                                                  \
+        C1 = fBlaMka(C1, D1);                                                  \
+                                                                               \
+        B0 = _mm_xor_si128(B0, C0);                                            \
+        B1 = _mm_xor_si128(B1, C1);                                            \
+                                                                               \
+        B0 = _mm_roti_epi64(B0, -24);                                          \
+        B1 = _mm_roti_epi64(B1, -24);                                          \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1)                                     \
+    do {                                                                       \
+        A0 = fBlaMka(A0, B0);                                                  \
+        A1 = fBlaMka(A1, B1);                                                  \
+                                                                               \
+        D0 = _mm_xor_si128(D0, A0);                                            \
+        D1 = _mm_xor_si128(D1, A1);                                            \
+                                                                               \
+        D0 = _mm_roti_epi64(D0, -16);                                          \
+        D1 = _mm_roti_epi64(D1, -16);                                          \
+                                                                               \
+        C0 = fBlaMka(C0, D0);                                                  \
+        C1 = fBlaMka(C1, D1);                                                  \
+                                                                               \
+        B0 = _mm_xor_si128(B0, C0);                                            \
+        B1 = _mm_xor_si128(B1, C1);                                            \
+                                                                               \
+        B0 = _mm_roti_epi64(B0, -63);                                          \
+        B1 = _mm_roti_epi64(B1, -63);                                          \
+    } while ((void)0, 0)
+
+#if defined(__SSSE3__)
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
+    do {                                                                       \
+        __m128i t0 = _mm_alignr_epi8(B1, B0, 8);                               \
+        __m128i t1 = _mm_alignr_epi8(B0, B1, 8);                               \
+        B0 = t0;                                                               \
+        B1 = t1;                                                               \
+                                                                               \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+                                                                               \
+        t0 = _mm_alignr_epi8(D1, D0, 8);                                       \
+        t1 = _mm_alignr_epi8(D0, D1, 8);                                       \
+        D0 = t1;                                                               \
+        D1 = t0;                                                               \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
+    do {                                                                       \
+        __m128i t0 = _mm_alignr_epi8(B0, B1, 8);                               \
+        __m128i t1 = _mm_alignr_epi8(B1, B0, 8);                               \
+        B0 = t0;                                                               \
+        B1 = t1;                                                               \
+                                                                               \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+                                                                               \
+        t0 = _mm_alignr_epi8(D0, D1, 8);                                       \
+        t1 = _mm_alignr_epi8(D1, D0, 8);                                       \
+        D0 = t1;                                                               \
+        D1 = t0;                                                               \
+    } while ((void)0, 0)
+#else /* SSE2 */
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                            \
+    do {                                                                       \
+        __m128i t0 = D0;                                                       \
+        __m128i t1 = B0;                                                       \
+        D0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = D0;                                                               \
+        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0));               \
+        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1));               \
+        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1));               \
+        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1));               \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)                          \
+    do {                                                                       \
+        __m128i t0, t1;                                                        \
+        t0 = C0;                                                               \
+        C0 = C1;                                                               \
+        C1 = t0;                                                               \
+        t0 = B0;                                                               \
+        t1 = D0;                                                               \
+        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0));               \
+        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1));               \
+        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1));               \
+        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1));               \
+    } while ((void)0, 0)
+#endif
+
+#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1)                           \
+    do {                                                                       \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+                                                                               \
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                           \
+                                                                               \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1);                                    \
+                                                                               \
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);                         \
+    } while ((void)0, 0)
+#else /* __AVX2__ */
+
+#include <immintrin.h>
+
+#define rotr32(x)   _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
+#define rotr24(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
+#define rotr16(x)   _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
+#define rotr63(x)   _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
+
+#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr32(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr24(B0); \
+        \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr32(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr24(B1); \
+    } while((void)0, 0);
+
+#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i ml = _mm256_mul_epu32(A0, B0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
+        D0 = _mm256_xor_si256(D0, A0); \
+        D0 = rotr16(D0); \
+        \
+        ml = _mm256_mul_epu32(C0, D0); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
+        B0 = _mm256_xor_si256(B0, C0); \
+        B0 = rotr63(B0); \
+        \
+        ml = _mm256_mul_epu32(A1, B1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
+        D1 = _mm256_xor_si256(D1, A1); \
+        D1 = rotr16(D1); \
+        \
+        ml = _mm256_mul_epu32(C1, D1); \
+        ml = _mm256_add_epi64(ml, ml); \
+        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
+        B1 = _mm256_xor_si256(B1, C1); \
+        B1 = rotr63(B1); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while((void)0, 0);
+
+#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while(0);
+
+#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        \
+        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while((void)0, 0);
+
+#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
+        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
+        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+        \
+        tmp1 = C0; \
+        C0 = C1; \
+        C1 = tmp1; \
+        \
+        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
+        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
+        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
+        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+        \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    } while((void)0, 0);
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do{ \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
+        \
+        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    } while((void)0, 0);
+
+#endif /* __AVX2__ */
+
+#else /* __AVX512F__ */
+
+#include <immintrin.h>
+
+#define ror64(x, n) _mm512_ror_epi64((x), (n))
+
+static __m512i muladd(__m512i x, __m512i y)
+{
+    __m512i z = _mm512_mul_epu32(x, y);
+    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
+}
+
+#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 32); \
+        D1 = ror64(D1, 32); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 24); \
+        B1 = ror64(B1, 24); \
+    } while ((void)0, 0)
+
+#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        A0 = muladd(A0, B0); \
+        A1 = muladd(A1, B1); \
+\
+        D0 = _mm512_xor_si512(D0, A0); \
+        D1 = _mm512_xor_si512(D1, A1); \
+\
+        D0 = ror64(D0, 16); \
+        D1 = ror64(D1, 16); \
+\
+        C0 = muladd(C0, D0); \
+        C1 = muladd(C1, D1); \
+\
+        B0 = _mm512_xor_si512(B0, C0); \
+        B1 = _mm512_xor_si512(B1, C1); \
+\
+        B0 = ror64(B0, 63); \
+        B1 = ror64(B1, 63); \
+    } while ((void)0, 0)
+
+#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while ((void)0, 0)
+
+#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
+        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
+\
+        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
+        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
+\
+        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
+        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
+    do { \
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
+        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
+\
+        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
+    } while ((void)0, 0)
+
+#define SWAP_HALVES(A0, A1) \
+    do { \
+        __m512i t0, t1; \
+        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
+        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
+        A0 = t0; \
+        A1 = t1; \
+    } while((void)0, 0)
+
+#define SWAP_QUARTERS(A0, A1) \
+    do { \
+        SWAP_HALVES(A0, A1); \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+    } while((void)0, 0)
+
+#define UNSWAP_QUARTERS(A0, A1) \
+    do { \
+        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
+        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
+        SWAP_HALVES(A0, A1); \
+    } while((void)0, 0)
+
+#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
+    do { \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        SWAP_HALVES(A0, B0); \
+        SWAP_HALVES(C0, D0); \
+        SWAP_HALVES(A1, B1); \
+        SWAP_HALVES(C1, D1); \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
+    do { \
+        SWAP_QUARTERS(A0, A1); \
+        SWAP_QUARTERS(B0, B1); \
+        SWAP_QUARTERS(C0, C1); \
+        SWAP_QUARTERS(D0, D1); \
+        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
+        UNSWAP_QUARTERS(A0, A1); \
+        UNSWAP_QUARTERS(B0, B1); \
+        UNSWAP_QUARTERS(C0, C1); \
+        UNSWAP_QUARTERS(D0, D1); \
+    } while ((void)0, 0)
+
+#endif /* __AVX512F__ */
+#endif /* BLAKE_ROUND_MKA_OPT_H */
--- a/algo/argon2/argon2d/blake2/blamka-round-ref.h
+++ b/algo/argon2/argon2d/blake2/blamka-round-ref.h
@@ -0,0 +1,56 @@
+/*
+ * Argon2 reference source code package - reference C implementations
+ *
+ * Copyright 2015
+ * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
+ *
+ * You may use this work under the terms of a Creative Commons CC0 1.0
+ * License/Waiver or the Apache Public License 2.0, at your option. The terms of
+ * these licenses can be found at:
+ *
+ * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+ * - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * You should have received a copy of both of these licenses along with this
+ * software. If not, they may be obtained at the above URLs.
+ */
+
+#ifndef BLAKE_ROUND_MKA_H
+#define BLAKE_ROUND_MKA_H
+
+#include "blake2.h"
+#include "blake2-impl.h"
+
+/* designed by the Lyra PHC team */
+static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
+    const uint64_t m = UINT64_C(0xFFFFFFFF); /* mask keeping the low 32 bits */
+    const uint64_t xy = (x & m) * (y & m);   /* product of the two low halves */
+    return x + y + 2 * xy;                   /* unsigned arithmetic: wraps modulo 2^64 */
+}
+
+#define G(a, b, c, d) /* mixes four words in place; all arguments are assigned */ \
+    do { /* first pass: rotation counts 32 and 24 */                           \
+        a = fBlaMka(a, b);                                                     \
+        d = rotr64(d ^ a, 32);                                                 \
+        c = fBlaMka(c, d);                                                     \
+        b = rotr64(b ^ c, 24);                                                 \
+        a = fBlaMka(a, b); /* second pass: rotation counts 16 and 63 */        \
+        d = rotr64(d ^ a, 16);                                                 \
+        c = fBlaMka(c, d);                                                     \
+        b = rotr64(b ^ c, 63);                                                 \
+    } while ((void)0, 0)
+
+#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,   \
+                           v12, v13, v14, v15)                                 \
+    do { /* eight G calls over sixteen words, no message input */              \
+        G(v0, v4, v8, v12); /* first four: words i, i+4, i+8, i+12 */          \
+        G(v1, v5, v9, v13);                                                    \
+        G(v2, v6, v10, v14);                                                   \
+        G(v3, v7, v11, v15);                                                   \
+        G(v0, v5, v10, v15); /* last four: same words regrouped diagonally */  \
+        G(v1, v6, v11, v12);                                                   \
+        G(v2, v7, v8, v13);                                                    \
+        G(v3, v4, v9, v14);                                                    \
+    } while ((void)0, 0)
+
+#endif
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -1,5 +1,4 @@
 #include "blake-gate.h"
-#include "sph_blake.h"
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
@@ -7,25 +6,16 @@

 #if defined (BLAKE_4WAY)

+blake256r14_4way_context blake_4w_ctx;
+
 void blakehash_4way(void *state, const void *input)
 {
-     uint32_t vhash[4*4] __attribute__ ((aligned (64)));
-     uint32_t hash0[4] __attribute__ ((aligned (32)));
-     uint32_t hash1[4] __attribute__ ((aligned (32)));
-     uint32_t hash2[4] __attribute__ ((aligned (32)));
-     uint32_t hash3[4] __attribute__ ((aligned (32)));
-     blake256_4way_context ctx;
-
-     blake256_4way_init( &ctx );
-     blake256_4way( &ctx, input, 16 );
-     blake256_4way_close( &ctx, vhash );
-
-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash1, 32 );
-     memcpy( state+96, hash1, 32 );
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
+     blake256r14_4way_context ctx;
+     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
+     blake256r14_4way( &ctx, input + (64<<2), 16 );
+     blake256r14_4way_close( &ctx, vhash );
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -36,24 +26,23 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t first_nonce = pdata[19];
-//   uint32_t HTarget = ptarget[7];
+   uint32_t HTarget = ptarget[7];
   uint32_t _ALIGN(32) edata[20];
   uint32_t n = first_nonce;
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;

-//   if (opt_benchmark)
-//      HTarget = 0x7f;
+   if (opt_benchmark)
+      HTarget = 0x7f;

   // we need big endian data...
   swab32_array( edata, pdata, 20 );
-
   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   blake256r14_4way_init( &blake_4w_ctx );
+   blake256r14_4way( &blake_4w_ctx, vdata, 64 );

   uint32_t *noncep = vdata + 76;   // 19*4
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
      be32enc( noncep,    n   );
      be32enc( noncep +1, n+1 );
      be32enc( noncep +2, n+2 );
@@ -61,45 +50,14 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

      blakehash_4way( hash, vdata );

-      if ( hash[7] == 0 )
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
      {
-         if ( fulltest( hash, ptarget ) )
-         {
-             found[0] = true;
-             num_found++;
-             nonces[0] = n;
-             pdata[19] = n;
-         }
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
-      if ( (hash+8)[7] == 0 ) 
-      {
-         if ( fulltest( hash+8, ptarget ) ) 
-         {
-             found[1] = true;
-             num_found++;
-             nonces[1] = n+1;
-         }
-      }
-      if ( (hash+16)[7] == 0 )
-      {
-          if ( fulltest( hash+8, ptarget ) )
-          {
-              found[2] = true;
-              num_found++;
-              nonces[2] = n+2;
-          }
-      }
-      if ( (hash+24)[7] == 0 )
-      {
-         if ( fulltest( hash+8, ptarget ) )
-         {
-              found[3] = true;
-              num_found++;
-              nonces[3] = n+3;
-         }
-      }
-       n += 4;
-      *hashes_done = n - first_nonce + 1;
+      n += 4;

   } while ( (num_found == 0) && (n < max_nonce) 
             && !work_restart[thr_id].restart );
@@ -110,3 +68,77 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

 #endif

+#if defined(BLAKE_8WAY)
+
+blake256r14_8way_context blake_8w_ctx;
+
+void blakehash_8way( void *state, const void *input )   // finishes 8 interleaved hashes; writes 8 x 32 bytes to state
+{
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));
+     blake256r14_8way_context ctx;
+     memcpy( &ctx, &blake_8w_ctx, sizeof ctx );         // resume from the context scanhash_blake_8way() prepared
+     blake256r14_8way( &ctx, input + (64<<3), 16 );     // skip 64<<3 bytes of input; length 16 is presumably per lane
+     blake256r14_8way_close( &ctx, vhash );
+     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
+                              state+128, state+160, state+192, state+224,
+                              vhash, 256 );             // one output slot every 32 bytes of state
+}
+
+// Scans nonces 8 per pass starting at work->data[19]; returns how many winning nonces were stored in work->nonces.
+int scanhash_blake_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));   // 8 lanes x 8 words, lane k at hash + 8*k
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+
+   if (opt_benchmark)
+      HTarget = 0x7f;
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );
+
+   blake256r14_8way_init( &blake_8w_ctx );         // NOTE(review): shared global; confirm threads cannot race on it
+   blake256r14_8way( &blake_8w_ctx, vdata, 64 );   // nonce-independent prefix (nonce is word 19), absorbed once
+
+   uint32_t *noncep = vdata + 152;   // 19*8
+   do {
+      be32enc( noncep,    n   );
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;
+
+      blakehash_8way( hash, vdata );
+
+      for ( int i = 0; i < 8; i++ )
+      if ( (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )   // test lane i's own 8 words
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;   // packed from index 0, matching the 4-way scanner and the returned count
+          work_set_target_ratio( work, hash+(i<<3) );   // ratio from the winning lane's hash
+      }
+      n += 8;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/blake/blake-gate.c
+++ b/algo/blake/blake-gate.c
@@ -7,14 +7,14 @@ int64_t blake_get_max64 ()

 bool register_blake_algo( algo_gate_t* gate )
 {
+  gate->optimizations = AVX2_OPT;
  gate->get_max64 = (void*)&blake_get_max64;
 //#if defined (__AVX2__) && defined (FOUR_WAY)
-//   gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
+//   gate->optimizations = SSE2_OPT | AVX2_OPT;
 //  gate->scanhash  = (void*)&scanhash_blake_8way;
 //  gate->hash      = (void*)&blakehash_8way;
 #if defined(BLAKE_4WAY)
  four_way_not_tested();
-  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_blake_4way;
  gate->hash      = (void*)&blakehash_4way;
 #else
--- a/algo/blake/blake-gate.h
+++ b/algo/blake/blake-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__AVX2__)
  #define BLAKE_4WAY
 #endif

--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
--- a/algo/blake/blake-hash-4way.h
+++ b/algo/blake/blake-hash-4way.h
@@ -35,7 +35,9 @@
 */

 #ifndef __BLAKE_HASH_4WAY__
-#define __BLAKE_HASH_4WAY___
+#define __BLAKE_HASH_4WAY__ 1
+
+#ifdef __SSE4_2__

 #ifdef __cplusplus
 extern "C"{
@@ -45,47 +47,81 @@ extern "C"{
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"

-/**
- * Output size (in bits) for BLAKE-256.
- */
 #define SPH_SIZE_blake256   256

-#if SPH_64
-
-/**
- * Output size (in bits) for BLAKE-512.
- */
 #define SPH_SIZE_blake512   512

-#endif
+// With SSE4.2 only Blake-256 4 way is available.
+// With AVX2 Blake-256 8way & Blake-512 4 way are also available.
+
+// Blake-256 4 way

-#ifdef __AVX__
 typedef struct {
-        __m128i buf[16] __attribute__ ((aligned (64)));
-        __m128i H[8];
-        __m128i S[4];    
-        size_t ptr;
-	sph_u32 T0, T1;
+   __m128i buf[16] __attribute__ ((aligned (64)));
+   __m128i H[8];
+   __m128i S[4];    
+   size_t ptr;
+   sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
 } blake_4way_small_context;

+// Default 14 rounds
 typedef blake_4way_small_context blake256_4way_context;
-
 void blake256_4way_init(void *cc);
 void blake256_4way(void *cc, const void *data, size_t len);
 void blake256_4way_close(void *cc, void *dst);
-void blake256_4way_addbits_and_close(
-        void *cc, unsigned ub, unsigned n, void *dst);

-#endif
+// 14 rounds, blake, decred
+typedef blake_4way_small_context blake256r14_4way_context;
+void blake256r14_4way_init(void *cc);
+void blake256r14_4way(void *cc, const void *data, size_t len);
+void blake256r14_4way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_4way_small_context blake256r8_4way_context;
+void blake256r8_4way_init(void *cc);
+void blake256r8_4way(void *cc, const void *data, size_t len);
+void blake256r8_4way_close(void *cc, void *dst);

 #ifdef __AVX2__

+// Blake-256 8 way
+
 typedef struct {
-        __m256i buf[16] __attribute__ ((aligned (64)));
-        __m256i H[8];
-        __m256i S[4];   
-        size_t ptr;
-	sph_u64 T0, T1;
+   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i H[8];
+   __m256i S[4];
+   size_t ptr;
+   sph_u32 T0, T1;
+   int rounds;   // 14 for blake, 8 for blakecoin & vanilla
+} blake_8way_small_context;
+
+// Default 14 rounds
+typedef blake_8way_small_context blake256_8way_context;
+void blake256_8way_init(void *cc);
+void blake256_8way(void *cc, const void *data, size_t len);
+void blake256_8way_close(void *cc, void *dst);
+
+// 14 rounds, blake, decred
+typedef blake_8way_small_context blake256r14_8way_context;
+void blake256r14_8way_init(void *cc);
+void blake256r14_8way(void *cc, const void *data, size_t len);
+void blake256r14_8way_close(void *cc, void *dst);
+
+// 8 rounds, blakecoin, vanilla
+typedef blake_8way_small_context blake256r8_8way_context;
+void blake256r8_8way_init(void *cc);
+void blake256r8_8way(void *cc, const void *data, size_t len);
+void blake256r8_8way_close(void *cc, void *dst);
+
+// Blake-512 4 way
+
+typedef struct {
+   __m256i buf[16] __attribute__ ((aligned (64)));
+   __m256i H[8];
+   __m256i S[4];   
+   size_t ptr;
+   sph_u64 T0, T1;
 } blake_4way_big_context;

 typedef blake_4way_big_context blake512_4way_context;
@@ -103,3 +139,5 @@ void blake512_4way_addbits_and_close(
 #endif

 #endif
+
+#endif
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -0,0 +1,136 @@
+#include "blake2s-gate.h"
+#include "blake2s-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+
+#if defined(BLAKE2S_8WAY)
+
+static __thread blake2s_8way_state blake2s_8w_ctx;
+
+void blake2s_8way_hash( void *output, const void *input )   // finish 8 interleaved BLAKE2s hashes starting from blake2s_8w_ctx
+{
+   uint32_t vhash[8*8] __attribute__ ((aligned (64)));   // 8 digests x 8 words, still interleaved
+   blake2s_8way_state ctx;
+   memcpy( &ctx, &blake2s_8w_ctx, sizeof ctx );   // work on a copy so the saved state is not modified
+
+   blake2s_8way_update( &ctx, input + (64<<3), 16 );   // start 64 bytes x 8 lanes into input, absorb 16 bytes per lane
+   blake2s_8way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
+
+   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
+                            output+128, output+160, output+192, output+224,
+                            vhash, 256 );   // one digest per 32 byte slot of output
+}
+
+int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )   // tests 8 nonces per iteration; returns how many were stored in work->nonces
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));   // 20 header words x 8 lanes, interleaved
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));     // 8 digests of 8 words each, back to back
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t _ALIGN(64) edata[20];                         // output of swab32_array on pdata
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 152;   // 19*8: header word 19 (the nonce) of lane 0, lanes follow consecutively
+
+   swab32_array( edata, pdata, 20 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );   // the same header goes into all 8 lanes
+   blake2s_8way_init( &blake2s_8w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_8way_update( &blake2s_8w_ctx, vdata, 64 );   // first 64 bytes are absorbed once; blake2s_8way_hash reuses the result
+
+   do {
+      be32enc( noncep,    n   );   // lane i is given nonce n+i
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;
+
+      blake2s_8way_hash( hash, vdata );
+
+
+      for ( int i = 0; i < 8; i++ )
+      if (  (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )   // word 7 is compared first, fulltest only if it passes
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );   // leave on a hit, at max_nonce, or when a restart is flagged
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#elif defined(BLAKE2S_4WAY)
+
+static __thread blake2s_4way_state blake2s_4w_ctx;
+
+void blake2s_4way_hash( void *output, const void *input )   // finish 4 interleaved BLAKE2s hashes starting from blake2s_4w_ctx
+{
+   uint32_t vhash[8*4] __attribute__ ((aligned (64)));   // 4 digests x 8 words, still interleaved
+   blake2s_4way_state ctx;
+   memcpy( &ctx, &blake2s_4w_ctx, sizeof ctx );   // work on a copy so the saved state is not modified
+
+   blake2s_4way_update( &ctx, input + (64<<2), 16 );   // start 64 bytes x 4 lanes into input, absorb 16 bytes per lane
+   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );
+
+   mm_deinterleave_4x32( output, output+32, output+64, output+96, vhash, 256 );   // one digest per 32 byte slot of output
+}
+
+int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done )   // tests 4 nonces per iteration; returns how many were stored in work->nonces
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));   // 20 header words x 4 lanes, interleaved
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));     // 4 digests of 8 words each, back to back
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   uint32_t _ALIGN(64) edata[20];                         // output of swab32_array on pdata
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76;   // 19*4: header word 19 (the nonce) of lane 0, lanes follow consecutively
+
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );   // the same header goes into all 4 lanes
+   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
+   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );   // first 64 bytes are absorbed once; blake2s_4way_hash reuses the result
+
+   do {
+      be32enc( noncep,    n   );   // lane i is given nonce n+i
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      pdata[19] = n;
+
+      blake2s_4way_hash( hash, vdata );
+
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )   // word 7 is compared first, fulltest only if it passes
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );   // leave on a hit, at max_nonce, or when a restart is flagged
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/blake/blake2s-gate.c
+++ b/algo/blake/blake2s-gate.c
@@ -0,0 +1,27 @@
+#include "blake2s-gate.h"
+
+
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blake2s_get_max64 ()   // installed as gate->get_max64 by register_blake2s_algo
+{
+   return 0x7ffffLL;   // fixed value, independent of any runtime state
+}
+
+bool register_blake2s_algo( algo_gate_t* gate )   // fills in the blake2s entries of gate; always returns true
+{
+#if defined(BLAKE2S_8WAY)   // widest implementation enabled at compile time wins
+  gate->scanhash  = (void*)&scanhash_blake2s_8way;
+  gate->hash      = (void*)&blake2s_8way_hash;
+#elif defined(BLAKE2S_4WAY)
+  gate->scanhash  = (void*)&scanhash_blake2s_4way;
+  gate->hash      = (void*)&blake2s_4way_hash;
+#else
+  gate->scanhash  = (void*)&scanhash_blake2s;
+  gate->hash      = (void*)&blake2s_hash;
+#endif
+  gate->get_max64 = (void*)&blake2s_get_max64;
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  return true;
+}
+
+
--- a/algo/blake/blake2s-gate.h
+++ b/algo/blake/blake2s-gate.h
@@ -0,0 +1,35 @@
+#ifndef __BLAKE2S_GATE_H__
+#define __BLAKE2S_GATE_H__ 1
+
+#include <stdint.h>
+#include "algo-gate-api.h"
+
+#if defined(__SSE4_2__)
+  #define BLAKE2S_4WAY
+#endif
+#if defined(__AVX2__)
+  #define BLAKE2S_8WAY
+#endif
+
+bool register_blake2s_algo( algo_gate_t* gate );
+
+#if defined(BLAKE2S_8WAY)
+
+void blake2s_8way_hash( void *state, const void *input );
+int scanhash_blake2s_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+
+#elif defined (BLAKE2S_4WAY)
+
+void blake2s_4way_hash( void *state, const void *input );
+int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#else
+
+void blake2s_hash( void *state, const void *input );
+int scanhash_blake2s( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
+
+#endif
--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -0,0 +1,362 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+#include "blake2s-hash-4way.h"
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#if defined(__SSE4_2__)
+
+static const uint32_t blake2s_IV[8] =
+{
+	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+};
+
+static const uint8_t blake2s_sigma[10][16] =
+{
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+};
+
+// define a constant for initial param.
+
+// Reset S so that all 4 lanes start an unkeyed BLAKE2s hash with an outlen byte digest.
+int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen )
+{
+   blake2s_nway_param P[1];
+   uint32_t p[8];   // parameter block as 8 words
+
+   P->digest_length = outlen;
+   P->key_length    = 0;
+   P->fanout        = 1;
+   P->depth         = 1;
+   P->leaf_length   = 0;
+   memset( P->node_offset, 0, sizeof( P->node_offset ) );   // stay inside the 6 byte array
+   P->node_depth    = 0;
+   P->inner_length  = 0;
+   memset( P->salt,     0, sizeof( P->salt ) );
+   memset( P->personal, 0, sizeof( P->personal ) );
+   memcpy( p, P, sizeof p );   // copy instead of reading the packed struct through a uint32_t*
+
+   memset( S, 0, sizeof( blake2s_4way_state ) );
+
+   /* IV XOR ParamBlock, broadcast to every lane */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm_xor_si128( _mm_set1_epi32( blake2s_IV[i] ),
+                               _mm_set1_epi32( p[i] ) );
+   return 0;
+}
+
+int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )   // mixes one 16 vector block into S->h; always returns 0
+{
+   __m128i m[16];   // local copy of the message block
+   __m128i v[16];   // working state: v[0..7] from S->h, v[8..15] built below
+
+   memcpy_128( m, block, 16 );
+   memcpy_128( v, S->h, 8 );
+
+   v[ 8] = _mm_set1_epi32( blake2s_IV[0] );
+   v[ 9] = _mm_set1_epi32( blake2s_IV[1] );
+   v[10] = _mm_set1_epi32( blake2s_IV[2] );
+   v[11] = _mm_set1_epi32( blake2s_IV[3] );
+   v[12] = _mm_xor_si128( _mm_set1_epi32( S->t[0] ),
+                          _mm_set1_epi32( blake2s_IV[4] ) );   // t[] and f[] are scalars broadcast to all lanes
+   v[13] = _mm_xor_si128( _mm_set1_epi32( S->t[1] ),
+                          _mm_set1_epi32( blake2s_IV[5] ) );
+   v[14] = _mm_xor_si128( _mm_set1_epi32( S->f[0] ),
+                          _mm_set1_epi32( blake2s_IV[6] ) );
+   v[15] = _mm_xor_si128( _mm_set1_epi32( S->f[1] ),
+                          _mm_set1_epi32( blake2s_IV[7] ) );
+
+#define G4W(r,i,a,b,c,d) \
+do { \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
+   d = mm_ror_32( _mm_xor_si128( d, a ), 16 ); \
+   c = _mm_add_epi32( c, d ); \
+   b = mm_ror_32( _mm_xor_si128( b, c ), 12 ); \
+   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
+   d = mm_ror_32( _mm_xor_si128( d, a ),  8 ); \
+   c = _mm_add_epi32( c, d ); \
+   b = mm_ror_32( _mm_xor_si128( b, c ),  7 ); \
+} while(0)
+
+#define ROUND4W(r)  \
+do { \
+   G4W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
+   G4W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
+   G4W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
+   G4W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
+   G4W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
+   G4W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
+   G4W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
+   G4W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+
+   ROUND4W( 0 );   // 10 rounds, each selecting message words through its own row of blake2s_sigma
+   ROUND4W( 1 );
+   ROUND4W( 2 );
+   ROUND4W( 3 );
+   ROUND4W( 4 );
+   ROUND4W( 5 );
+   ROUND4W( 6 );
+   ROUND4W( 7 );
+   ROUND4W( 8 );
+   ROUND4W( 9 );
+
+   for( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm_xor_si128( _mm_xor_si128( S->h[i], v[i] ), v[i + 8] );   // fold both halves of v back into h
+
+#undef G4W
+#undef ROUND4W
+   return 0;
+}
+
+// Absorb inlen bytes per lane of 4 way interleaved input; sizes are handled in units of 4 bytes.
+int blake2s_4way_update( blake2s_4way_state *S, const void *in,
+                         uint64_t inlen )
+{
+  __m128i *input = (__m128i*)in;      // one vector = one 32 bit word from each lane
+  __m128i *buf = (__m128i*)S->buf;
+  const size_t bsize = BLAKE2S_BLOCKBYTES;
+
+   while( inlen > 0 )
+   {
+      const size_t left = S->buflen;      // bytes per lane already buffered
+      const size_t fill = bsize - left;   // bytes per lane needed to complete the block
+      if( inlen >= fill )
+      {
+         memcpy_128( buf + (left>>2), input, fill >> 2 );
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry into the high counter word
+         blake2s_4way_compress( S, buf );               // a block is compressed as soon as it is full
+         S->buflen = 0;
+         input += ( fill >> 2 );   // advance by what was consumed, not by a whole block
+         inlen -= fill;            // cannot wrap: inlen >= fill was checked above
+      }
+      else
+      {
+          memcpy_128( buf + ( left>>2 ), input, inlen>>2 );
+          S->buflen += (size_t) inlen;
+          inlen = 0;   // everything that was left is now buffered
+      }
+   }
+   return 0;
+}
+
+int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen )   // outlen is not read here; 8 vectors are always written
+{
+   __m128i *buf = (__m128i*)S->buf;
+
+   S->t[0] += S->buflen;   // count the bytes still sitting in the buffer
+   S->t[1] += ( S->t[0] < S->buflen );
+   if ( S->last_node ) 
+      S->f[1] = ~0U;
+   S->f[0] = ~0U;   // set before the last compress so it ends up in v[14]
+
+   memset_zero_128( buf + ( S->buflen>>2 ),
+                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );      // zero the unused tail of the block
+   blake2s_4way_compress( S, buf );
+
+   for ( int i = 0; i < 8; ++i )
+      casti_m128i( out, i ) = S->h[ i ];   // result is left interleaved; callers deinterleave it
+   return 0;
+}
+
+#if defined(__AVX2__)
+
+int blake2s_8way_compress( blake2s_8way_state *S, const __m256i *block )   // mixes one 16 vector block into S->h; always returns 0
+{
+   __m256i m[16];   // local copy of the message block
+   __m256i v[16];   // working state: v[0..7] from S->h, v[8..15] built below
+
+   memcpy_256( m, block, 16 );
+   memcpy_256( v, S->h, 8 );
+
+   v[ 8] = _mm256_set1_epi32( blake2s_IV[0] );
+   v[ 9] = _mm256_set1_epi32( blake2s_IV[1] );
+   v[10] = _mm256_set1_epi32( blake2s_IV[2] );
+   v[11] = _mm256_set1_epi32( blake2s_IV[3] );
+   v[12] = _mm256_xor_si256( _mm256_set1_epi32( S->t[0] ),
+                             _mm256_set1_epi32( blake2s_IV[4] ) );   // t[] and f[] are scalars broadcast to all lanes
+   v[13] = _mm256_xor_si256( _mm256_set1_epi32( S->t[1] ),
+                             _mm256_set1_epi32( blake2s_IV[5] ) );
+   v[14] = _mm256_xor_si256( _mm256_set1_epi32( S->f[0] ),
+                             _mm256_set1_epi32( blake2s_IV[6] ) );
+   v[15] = _mm256_xor_si256( _mm256_set1_epi32( S->f[1] ),
+                             _mm256_set1_epi32( blake2s_IV[7] ) );
+
+#define G8W(r,i,a,b,c,d) \
+do { \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                          m[ blake2s_sigma[r][2*i+0] ] ); \
+   d = mm256_ror_32( _mm256_xor_si256( d, a ), 16 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_ror_32( _mm256_xor_si256( b, c ), 12 ); \
+   a = _mm256_add_epi32( _mm256_add_epi32( a, b ), \
+                         m[ blake2s_sigma[r][2*i+1] ] ); \
+   d = mm256_ror_32( _mm256_xor_si256( d, a ),  8 ); \
+   c = _mm256_add_epi32( c, d ); \
+   b = mm256_ror_32( _mm256_xor_si256( b, c ),  7 ); \
+} while(0)
+
+#define ROUND8W(r)  \
+do { \
+   G8W( r, 0, v[ 0], v[ 4], v[ 8], v[12] ); \
+   G8W( r, 1, v[ 1], v[ 5], v[ 9], v[13] ); \
+   G8W( r, 2, v[ 2], v[ 6], v[10], v[14] ); \
+   G8W( r, 3, v[ 3], v[ 7], v[11], v[15] ); \
+   G8W( r, 4, v[ 0], v[ 5], v[10], v[15] ); \
+   G8W( r, 5, v[ 1], v[ 6], v[11], v[12] ); \
+   G8W( r, 6, v[ 2], v[ 7], v[ 8], v[13] ); \
+   G8W( r, 7, v[ 3], v[ 4], v[ 9], v[14] ); \
+} while(0)
+
+   ROUND8W( 0 );   // 10 rounds, each selecting message words through its own row of blake2s_sigma
+   ROUND8W( 1 );
+   ROUND8W( 2 );
+   ROUND8W( 3 );
+   ROUND8W( 4 );
+   ROUND8W( 5 );
+   ROUND8W( 6 );
+   ROUND8W( 7 );
+   ROUND8W( 8 );
+   ROUND8W( 9 );
+
+   for( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm256_xor_si256( _mm256_xor_si256( S->h[i], v[i] ), v[i + 8] );   // fold both halves of v back into h
+
+#undef G8W
+#undef ROUND8W
+   return 0;
+}
+
+// Reset S so that all 8 lanes start an unkeyed BLAKE2s hash with an outlen byte digest.
+int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen )
+{
+   blake2s_nway_param P[1];
+   uint32_t p[8];   // parameter block as 8 words
+
+   P->digest_length = outlen;
+   P->key_length    = 0;
+   P->fanout        = 1;
+   P->depth         = 1;
+   P->leaf_length   = 0;
+   memset( P->node_offset, 0, sizeof( P->node_offset ) );   // stay inside the 6 byte array
+   P->node_depth    = 0;
+   P->inner_length  = 0;
+   memset( P->salt,     0, sizeof( P->salt ) );
+   memset( P->personal, 0, sizeof( P->personal ) );
+   memcpy( p, P, sizeof p );   // copy instead of reading the packed struct through a uint32_t*
+
+   memset( S, 0, sizeof( blake2s_8way_state ) );
+
+   /* IV XOR ParamBlock, broadcast to every lane */
+   for ( size_t i = 0; i < 8; ++i )
+      S->h[i] = _mm256_xor_si256( _mm256_set1_epi32( blake2s_IV[i] ),
+                                  _mm256_set1_epi32( p[i] ) );
+   return 0;
+}
+
+// Absorb inlen bytes per lane of 8 way interleaved input; sizes are handled in units of 4 bytes.
+int blake2s_8way_update( blake2s_8way_state *S, const void *in,
+                         uint64_t inlen )
+{
+  __m256i *input = (__m256i*)in;      // one vector = one 32 bit word from each lane
+  __m256i *buf = (__m256i*)S->buf;
+  const size_t bsize = BLAKE2S_BLOCKBYTES;
+
+   while( inlen > 0 )
+   {
+      const size_t left = S->buflen;      // bytes per lane already buffered
+      const size_t fill = bsize - left;   // bytes per lane needed to complete the block
+      if( inlen >= fill )
+      {
+         memcpy_256( buf + (left>>2), input, fill >> 2 );
+         S->t[0] += BLAKE2S_BLOCKBYTES;
+         S->t[1] += ( S->t[0] < BLAKE2S_BLOCKBYTES );   // carry into the high counter word
+         blake2s_8way_compress( S, buf );               // a block is compressed as soon as it is full
+         S->buflen = 0;
+         input += ( fill >> 2 );   // advance by what was consumed, not by a whole block
+         inlen -= fill;            // cannot wrap: inlen >= fill was checked above
+      }
+      else
+      {
+          memcpy_256( buf + ( left>>2 ), input, inlen>>2 );
+          S->buflen += (size_t) inlen;
+          inlen = 0;   // everything that was left is now buffered
+      }
+   }
+   return 0;
+}
+
+int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen )   // outlen is not read here; 8 vectors are always written
+{
+   __m256i *buf = (__m256i*)S->buf;
+
+   S->t[0] += S->buflen;   // count the bytes still sitting in the buffer
+   S->t[1] += ( S->t[0] < S->buflen );
+   if ( S->last_node )
+      S->f[1] = ~0U;
+   S->f[0] = ~0U;   // set before the last compress so it ends up in v[14]
+
+   memset_zero_256( buf + ( S->buflen>>2 ),
+                    ( BLAKE2S_BLOCKBYTES - S->buflen ) >> 2 );   // zero the unused tail of the block
+   blake2s_8way_compress( S, buf );
+
+   for ( int i = 0; i < 8; ++i )
+      casti_m256i( out, i ) = S->h[ i ];   // result is left interleaved; callers deinterleave it
+   return 0;
+}
+
+
+#endif // __AVX2__
+
+#if 0
+int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+{
+	blake2s_state S[1];
+
+	/* Verify parameters */
+	if ( NULL == in ) return -1;
+
+	if ( NULL == out ) return -1;
+
+	if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */
+
+	if( keylen > 0 )
+	{
+		if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1;
+	}
+	else
+	{
+		if( blake2s_init( S, outlen ) < 0 ) return -1;
+	}
+
+	blake2s_update( S, ( uint8_t * )in, inlen );
+	blake2s_final( S, out, outlen );
+	return 0;
+}
+#endif
+
+#endif // __SSE4_2__
--- a/algo/blake/blake2s-hash-4way.h
+++ b/algo/blake/blake2s-hash-4way.h
@@ -0,0 +1,112 @@
+/**
+ * BLAKE2 reference source code package - reference C implementations
+ *
+ * Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * You should have received a copy of the CC0 Public Domain Dedication along with
+ * this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+//#pragma once
+#ifndef __BLAKE2S_HASH_4WAY_H__
+#define __BLAKE2S_HASH_4WAY_H__ 1
+
+#if defined(__SSE4_2__)
+
+#include "avxdefs.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+enum blake2s_constant
+{
+   BLAKE2S_BLOCKBYTES = 64,
+   BLAKE2S_OUTBYTES   = 32,
+   BLAKE2S_KEYBYTES   = 32,
+   BLAKE2S_SALTBYTES  = 8,
+   BLAKE2S_PERSONALBYTES = 8
+};
+
+#pragma pack(push, 1)
+typedef struct __blake2s_nway_param   // parameter block; fields add up to 32 bytes under the surrounding pack(1)
+{
+   uint8_t  digest_length; // 1   (numbers are the running size in bytes)
+   uint8_t  key_length;    // 2
+   uint8_t  fanout;        // 3
+   uint8_t  depth;         // 4
+   uint32_t leaf_length;   // 8
+   uint8_t  node_offset[6];// 14
+   uint8_t  node_depth;    // 15
+   uint8_t  inner_length;  // 16
+   // uint8_t  reserved[0];
+   uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
+   uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+} blake2s_nway_param;
+#pragma pack(pop)
+
+ALIGN( 64 ) typedef struct __blake2s_4way_state   // hashing state for 4 lanes processed together
+{
+   __m128i h[8];                            // chaining value, one 32 bit word per lane in each vector
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 4 ];  // one block per lane; accessed as __m128i by update/final
+   uint32_t t[2];                           // byte counter, shared by all lanes
+   uint32_t f[2];                           // flags set in final, shared by all lanes
+   size_t   buflen;                         // bytes per lane currently held in buf
+   uint8_t  last_node;                      // when nonzero, final also sets f[1]
+} blake2s_4way_state ;
+
+int blake2s_4way_init( blake2s_4way_state *S, const uint8_t outlen );
+int blake2s_4way_update( blake2s_4way_state *S, const void *in,
+                         uint64_t inlen );
+int blake2s_4way_final( blake2s_4way_state *S, void *out, uint8_t outlen );
+
+#if defined(__AVX2__)
+
+ALIGN( 64 ) typedef struct __blake2s_8way_state   // hashing state for 8 lanes processed together
+{
+   __m256i h[8];                            // chaining value, one 32 bit word per lane in each vector
+   uint8_t  buf[ BLAKE2S_BLOCKBYTES * 8 ];  // one block per lane; accessed as __m256i by update/final
+   uint32_t t[2];                           // byte counter, shared by all lanes
+   uint32_t f[2];                           // flags set in final, shared by all lanes
+   size_t   buflen;                         // bytes per lane currently held in buf
+   uint8_t  last_node;                      // when nonzero, final also sets f[1]
+} blake2s_8way_state ;
+
+int blake2s_8way_init( blake2s_8way_state *S, const uint8_t outlen );
+int blake2s_8way_update( blake2s_8way_state *S, const void *in,
+                         uint64_t inlen );
+int blake2s_8way_final( blake2s_8way_state *S, void *out, uint8_t outlen );
+
+#endif
+
+#if 0
+	// Simple API
+//	int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+
+	// Direct Hash Mining Helpers
+	#define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */
+	#define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0)
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // __SSE4_2__
+
+#endif
--- a/algo/blake/blake2s.c
+++ b/algo/blake/blake2s.c
@@ -1,26 +1,29 @@
-#include "algo-gate-api.h"
+#include "blake2s-gate.h"

 #include <string.h>
 #include <stdint.h>

-#include "crypto/blake2s.h"
+#include "sph-blake2s.h"

-static __thread blake2s_state s_midstate;
-static __thread blake2s_state s_ctx;
+static __thread blake2s_state blake2s_ctx;
+//static __thread blake2s_state s_ctx;
 #define MIDLEN 76

-void blake2s_hash(void *output, const void *input)
+void blake2s_hash( void *output, const void *input )
 {
-	unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
-	blake2s_state blake2_ctx __attribute__ ((aligned (64)));
-
-	blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
-	blake2s_update(&blake2_ctx, input, 80);
-	blake2s_final(&blake2_ctx, hash, BLAKE2S_OUTBYTES);
+   unsigned char _ALIGN(64) hash[BLAKE2S_OUTBYTES];
+   blake2s_state ctx __attribute__ ((aligned (64)));
+  
+   memcpy( &ctx, &blake2s_ctx, sizeof ctx );
+   blake2s_update( &ctx, input+64, 16 );
+ 
+//	blake2s_init(&ctx, BLAKE2S_OUTBYTES);
+//	blake2s_update(&ctx, input, 80);
+	blake2s_final( &ctx, hash, BLAKE2S_OUTBYTES );

 	memcpy(output, hash, 32);
 }
-
+/*
 static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 {
 	s_ctx.buflen = MIDLEN;
@@ -28,7 +31,7 @@ static void blake2s_hash_end(uint32_t *output, const uint32_t *input)
 	blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80 - MIDLEN);
 	blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES);
 }
-
+*/
 int scanhash_blake2s(int thr_id, struct work *work,
 	uint32_t max_nonce, uint64_t *hashes_done)
 {
@@ -46,13 +49,12 @@ int scanhash_blake2s(int thr_id, struct work *work,
        swab32_array( endiandata, pdata, 20 );

 	// midstate
-	blake2s_init(&s_midstate, BLAKE2S_OUTBYTES);
-	blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN);
-	memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state));
+	blake2s_init( &blake2s_ctx, BLAKE2S_OUTBYTES );
+	blake2s_update( &blake2s_ctx, (uint8_t*) endiandata, 64 );

 	do {
 		be32enc(&endiandata[19], n);
-		blake2s_hash_end(hash64, endiandata);
+		blake2s_hash( hash64, endiandata );
 		if (hash64[7] < Htarg && fulltest(hash64, ptarget)) {
 			*hashes_done = n - first_nonce + 1;
 			pdata[19] = n;
@@ -67,7 +69,7 @@ int scanhash_blake2s(int thr_id, struct work *work,

 	return 0;
 }
-
+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blake2s_get_max64 ()
 {
@@ -81,4 +83,4 @@ bool register_blake2s_algo( algo_gate_t* gate )
  gate->get_max64 = (void*)&blake2s_get_max64;
  return true;
 };
-
+*/
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -0,0 +1,141 @@
+#include "blakecoin-gate.h"
+#include "blake-hash-4way.h"
+#include <string.h>
+#include <stdint.h>
+#include <memory.h>
+
+#if defined (BLAKECOIN_4WAY)
+
+static __thread blake256r8_4way_context blakecoin_4w_ctx;   // per thread midstate; a shared global would race between scanning threads
+
+void blakecoin_4way_hash(void *state, const void *input)   // finish 4 interleaved hashes starting from blakecoin_4w_ctx
+{
+     uint32_t vhash[8*4] __attribute__ ((aligned (64)));   // 4 digests x 8 words, still interleaved
+     blake256r8_4way_context ctx;
+
+     memcpy( &ctx, &blakecoin_4w_ctx, sizeof ctx );   // work on a copy so the midstate survives
+     blake256r8_4way( &ctx, input + (64<<2), 16 );    // start 64 bytes x 4 lanes into input, absorb 16 more
+     blake256r8_4way_close( &ctx, vhash );
+
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );   // one digest per 32 byte slot
+}
+
+int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )   // tests 4 nonces per iteration; returns how many were stored in work->nonces
+{
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));   // 20 header words x 4 lanes, interleaved
+   uint32_t hash[8*4] __attribute__ ((aligned (32)));     // 4 digests of 8 words each, back to back
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];                         // output of swab32_array on pdata
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   if ( opt_benchmark )
+      HTarget = 0x7f;   // fixed quick-check threshold when benchmarking
+
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );   // the same header goes into all 4 lanes
+   blake256r8_4way_init( &blakecoin_4w_ctx );
+   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );   // first 64 bytes are absorbed once; blakecoin_4way_hash reuses the result
+
+   uint32_t *noncep = vdata + 76;   // 19*4: header word 19 (the nonce) of lane 0, lanes follow consecutively
+   do {
+      be32enc( noncep,    n   );   // lane i is given nonce n+i
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      pdata[19] = n;
+      blakecoin_4way_hash( hash, vdata );
+
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )   // word 7 is compared first, fulltest only if it passes
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+
+   } while ( (num_found == 0) && (n < max_nonce) 
+             && !work_restart[thr_id].restart );   // leave on a hit, at max_nonce, or when a restart is flagged
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
+#if defined(BLAKECOIN_8WAY)
+
+static __thread blake256r8_8way_context blakecoin_8w_ctx;   // per thread midstate; a shared global would race between scanning threads
+
+void blakecoin_8way_hash( void *state, const void *input )   // finish 8 interleaved hashes starting from blakecoin_8w_ctx
+{
+     uint32_t vhash[8*8] __attribute__ ((aligned (64)));   // 8 digests x 8 words, still interleaved
+     blake256r8_8way_context ctx;
+
+     memcpy( &ctx, &blakecoin_8w_ctx, sizeof ctx );   // work on a copy so the midstate survives
+     blake256r8_8way( &ctx, input + (64<<3), 16 );    // start 64 bytes x 8 lanes into input, absorb 16 more
+     blake256r8_8way_close( &ctx, vhash );
+
+     mm256_deinterleave_8x32( state,     state+ 32, state+ 64, state+ 96,
+                              state+128, state+160, state+192, state+224,
+                              vhash, 256 );   // one digest per 32 byte slot
+}
+
+int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done )   // tests 8 nonces per iteration; returns how many were stored in work->nonces
+{
+   uint32_t vdata[20*8] __attribute__ ((aligned (64)));   // 20 header words x 8 lanes, interleaved
+   uint32_t hash[8*8] __attribute__ ((aligned (32)));     // 8 digests of 8 words each, back to back
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t HTarget = ptarget[7];
+   uint32_t _ALIGN(32) edata[20];                         // output of swab32_array on pdata
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   uint32_t *noncep = vdata + 152;   // 19*8: header word 19 (the nonce) of lane 0, lanes follow consecutively
+   int num_found = 0;
+   if ( opt_benchmark )
+      HTarget = 0x7f;   // fixed quick-check threshold when benchmarking
+
+   // we need big endian data...
+   swab32_array( edata, pdata, 20 );
+   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
+                                 edata, edata, edata, edata, 640 );   // the same header goes into all 8 lanes
+   blake256r8_8way_init( &blakecoin_8w_ctx );
+   blake256r8_8way( &blakecoin_8w_ctx, vdata, 64 );   // first 64 bytes are absorbed once; blakecoin_8way_hash reuses the result
+
+   do {
+      be32enc( noncep,    n   );   // lane i is given nonce n+i
+      be32enc( noncep +1, n+1 );
+      be32enc( noncep +2, n+2 );
+      be32enc( noncep +3, n+3 );
+      be32enc( noncep +4, n+4 );
+      be32enc( noncep +5, n+5 );
+      be32enc( noncep +6, n+6 );
+      be32enc( noncep +7, n+7 );
+      pdata[19] = n;
+      blakecoin_8way_hash( hash, vdata );
+
+      for ( int i = 0; i < 8; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )   // word 7 is compared first, fulltest only if it passes
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 8;
+   } while ( (num_found == 0) && (n < max_nonce)
+             && !work_restart[thr_id].restart );   // leave on a hit, at max_nonce, or when a restart is flagged
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
+
--- a/algo/blake/blakecoin-gate.c
+++ b/algo/blake/blakecoin-gate.c
@@ -0,0 +1,36 @@
+#include "blakecoin-gate.h"
+#include <memory.h>
+
+// changed to get_max64_0x3fffffLL in cpuminer-multi-decred
+int64_t blakecoin_get_max64 ()   // installed as gate->get_max64 by register_vanilla_algo
+{
+  return 0x7ffffLL;   // value currently in use
+//  return 0x3fffffLL;   (alternative value, disabled)
+}
+
+// vanilla uses default gen merkle root, otherwise identical to blakecoin
+bool register_vanilla_algo( algo_gate_t* gate )   // fills in the blakecoin/vanilla entries of gate; always returns true
+{
+#if defined(BLAKECOIN_8WAY)   // widest implementation enabled at compile time wins
+  gate->scanhash  = (void*)&scanhash_blakecoin_8way;
+  gate->hash      = (void*)&blakecoin_8way_hash;
+
+#elif defined(BLAKECOIN_4WAY)
+  gate->scanhash  = (void*)&scanhash_blakecoin_4way;
+  gate->hash      = (void*)&blakecoin_4way_hash;
+#else
+  gate->scanhash = (void*)&scanhash_blakecoin;   // scalar fallback
+  gate->hash     = (void*)&blakecoinhash;
+#endif
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->get_max64 = (void*)&blakecoin_get_max64;
+  return true;
+}
+
+bool register_blakecoin_algo( algo_gate_t* gate )   // same setup as vanilla plus a different gen_merkle_root
+{
+  register_vanilla_algo( gate );   // its return value is not checked (it always returns true)
+  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
+  return true;
+}
+
--- a/algo/blake/blakecoin-gate.h
+++ b/algo/blake/blakecoin-gate.h
@@ -0,0 +1,30 @@
+#ifndef __BLAKECOIN_GATE_H__
+#define __BLAKECOIN_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+#if defined(__SSE4_2__)
+  #define BLAKECOIN_4WAY
+#endif
+#if defined(__AVX2__)
+  #define BLAKECOIN_8WAY
+#endif
+
+#if defined (BLAKECOIN_8WAY)
+void blakecoin_8way_hash(void *state, const void *input);
+int scanhash_blakecoin_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
+#if defined (BLAKECOIN_4WAY)
+void blakecoin_4way_hash(void *state, const void *input);
+int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+#endif
+
+void blakecoinhash( void *state, const void *input );
+int scanhash_blakecoin( int thr_id, struct work *work, uint32_t max_nonce,
+                      uint64_t *hashes_done );
+
+#endif
--- a/algo/blake/blakecoin.c
+++ b/algo/blake/blakecoin.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "blakecoin-gate.h"
 #define BLAKE32_ROUNDS 8
 #include "sph_blake.h"

@@ -98,7 +98,7 @@ void blakecoin_gen_merkle_root ( char* merkle_root, struct stratum_ctx* sctx )
 SHA256( sctx->job.coinbase, (int)sctx->job.coinbase_size, merkle_root );
 }
 */
-
+/*
 // changed to get_max64_0x3fffffLL in cpuminer-multi-decred
 int64_t blakecoin_get_max64 ()
 {
@@ -121,4 +121,4 @@ bool register_blakecoin_algo( algo_gate_t* gate )
  gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;
  return true;
 }
-
+*/
--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -1,5 +1,4 @@
 #include "decred-gate.h"
-#include "sph_blake.h"
 #include "blake-hash-4way.h"
 #include <string.h>
 #include <stdint.h>
@@ -9,59 +8,22 @@
 #if defined (DECRED_4WAY)

 static __thread blake256_4way_context blake_mid;
-static __thread bool ctx_midstate_done = false;

 void decred_hash_4way( void *state, const void *input )
 {
     uint32_t vhash[8*4] __attribute__ ((aligned (64)));
-     uint32_t hash0[8] __attribute__ ((aligned (32)));
-     uint32_t hash1[8] __attribute__ ((aligned (32)));
-     uint32_t hash2[8] __attribute__ ((aligned (32)));
-     uint32_t hash3[8] __attribute__ ((aligned (32)));
-     blake256_4way_context ctx __attribute__ ((aligned (64)));
-
-     sph_blake256_context ctx2 __attribute__ ((aligned (64)));
-     uint32_t hash[16] __attribute__ ((aligned (64)));
-     uint32_t sin0[45], sin1[45], sin2[45], sin3[45];
-
-     mm_deinterleave_4x32x( sin0, sin1, sin2, sin3, input, 180*8 );
-
-     void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
+//     uint32_t hash0[8] __attribute__ ((aligned (32)));
+//     uint32_t hash1[8] __attribute__ ((aligned (32)));
+//     uint32_t hash2[8] __attribute__ ((aligned (32)));
+//     uint32_t hash3[8] __attribute__ ((aligned (32)));
+     const void *tail = input + ( DECRED_MIDSTATE_LEN << 2 );
     int tail_len = 180 - DECRED_MIDSTATE_LEN; 
+     blake256_4way_context ctx __attribute__ ((aligned (64)));

     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
     blake256_4way( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
-/*
-     sph_blake256_init( &ctx2 );
-     sph_blake256( &ctx2, sin0, 180 );
-     sph_blake256_close( &ctx2, hash );
-*/
-/*
-     blake256_4way_init( &ctx );
-     blake256_4way( &ctx, input, 180 );
-     blake256_4way_close( &ctx, vhash );
-*/
-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
-/*
-        for ( int i = 0; i < 8; i++ )
-          if ( hash[i] != hash0[i] )
-            printf(" hash mismatch, i = %u\n",i);
-
-printf("hash:  %08lx %08lx %08lx %08lx\n", *hash, *(hash+1),
-                             *(hash+2), *(hash+3) );
-printf("hash0: %08lx %08lx %08lx %08lx\n", *hash0, *(hash0+1),
-                             *(hash0+2), *(hash0+3) );
-printf("\n");
-*/
-
-     memcpy( state,    hash0, 32 );
-     memcpy( state+32, hash1, 32 );
-     memcpy( state+64, hash2, 32 );
-     memcpy( state+96, hash3, 32 );
-
-//     memcpy( state, hash, 32 );
-
+     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -69,28 +31,26 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[48*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
-        uint32_t _ALIGN(64) edata[48];
-        uint32_t *pdata = work->data;
-        uint32_t *ptarget = work->target;
-        const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
-        uint32_t n = first_nonce;
-        const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
+   uint32_t _ALIGN(64) edata[48];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[DECRED_NONCE_INDEX];
+   uint32_t n = first_nonce;
+   const uint32_t HTarget = opt_benchmark ? 0x7f : ptarget[7];
   uint32_t *nonces = work->nonces;
-   bool *found = work->nfound;
   int num_found = 0;

-        ctx_midstate_done = false;
-        memcpy( edata, pdata, 180 );
+   // copy to buffer guaranteed to be aligned.
+   memcpy( edata, pdata, 180 );

   // use the old way until  new way updated for size.
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 180*8 );
+   mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );

   blake256_4way_init( &blake_mid );
   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );

   uint32_t *noncep = vdata + DECRED_NONCE_INDEX * 4;
   do {
-      found[0] = found[1] = found[2] = found[3] = false;
      * noncep    = n;
      *(noncep+1) = n+1;
      *(noncep+2) = n+2;
@@ -98,55 +58,14 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,

      decred_hash_4way( hash, vdata );

-      if ( hash[7] <= HTarget && fulltest( hash, ptarget ) )
+      for ( int i = 0; i < 4; i++ )
+      if (  (hash+(i<<3))[7] <= HTarget && fulltest( hash+(i<<3), ptarget ) )
      {
-          work_set_target_ratio( work, hash );
-          found[0] = true;
-          num_found++;
-          nonces[0] = n;
-          pdata[DECRED_NONCE_INDEX] = n;
+          pdata[DECRED_NONCE_INDEX] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
      }
-/*
-      if ( (hash+8)[7] <= HTarget && fulltest( hash+8, ptarget ) )
-      {
-printf("found 1\n");          
-
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[8], hash[9], hash[10],hash[11] );
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[12], hash[13], hash[14],hash[15] );
-printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
-printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
-
-          work_set_target_ratio( work, hash+8 );
-          found[1] = true;
-          num_found++;
-          nonces[1] = n+1;
-      }
-*/
-      if ( (hash+16)[7] <= HTarget && fulltest( hash+16, ptarget ) )
-      {
-          work_set_target_ratio( work, hash+16 );
-          found[2] = true;
-          num_found++;
-          nonces[2] = n+2;
-      }
-/*
-      if ( (hash+24)[7] <= HTarget && fulltest( hash+24, ptarget ) )
-      {
-printf("found 3\n");          
-
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[0], hash[1], hash[2],hash[3] );
-printf("vhash: %08lx %08lx %08lx %08lx\n", hash[4], hash[5], hash[6],hash[7] );
-printf("shash: %08lx %08lx %08lx %08lx\n", shash[0], shash[1], shash[2],shash[3] );
-printf("shash: %08lx %08lx %08lx %08lx\n\n", shash[4], shash[5], shash[6],shash[7] );
-
-          work_set_target_ratio( work, hash+24 );
-          found[3] = true;
-          num_found++;
-          nonces[3] = n+3;
-      }
-*/
-      n += 2;
-//      n += 4;
+      n += 4;
  } while ( (num_found == 0) && (n < max_nonce) 
            && !work_restart[thr_id].restart );

--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -145,15 +145,13 @@ bool register_decred_algo( algo_gate_t* gate )
 {
 #if defined(DECRED_4WAY)
  four_way_not_tested();
-  gate->optimizations = FOUR_WAY_OPT;
  gate->scanhash  = (void*)&scanhash_decred_4way;
  gate->hash      = (void*)&decred_hash_4way;
 #else
-  gate->optimizations = SSE2_OPT;
  gate->scanhash  = (void*)&scanhash_decred;
  gate->hash      = (void*)&decred_hash;
 #endif
-
+  gate->optimizations = AVX2_OPT;
  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
  gate->display_extra_data    = (void*)&decred_decode_extradata;
--- a/algo/blake/decred-gate.h
+++ b/algo/blake/decred-gate.h
@@ -18,7 +18,7 @@
 //                         uint64_t *hashes_done );
 #endif

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__SSE4_2__)
  #define DECRED_4WAY
 #endif

--- a/algo/blake/pentablake-4way.c
+++ b/algo/blake/pentablake-4way.c
@@ -1,4 +1,7 @@
 #include "pentablake-gate.h"
+
+#if defined (__AVX2__)
+
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
@@ -9,8 +12,6 @@

 //#define DEBUG_ALGO

-#ifdef PENTABLAKE_4WAY
-
 extern void pentablakehash_4way( void *output, const void *input )
 {
 	unsigned char _ALIGN(32) hash[128];
@@ -110,12 +111,8 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
    const uint32_t first_nonce = pdata[19];
    const uint32_t Htarg = ptarget[7];
    uint32_t *nonces = work->nonces;
-    bool *found = work->nfound;
    int num_found = 0;
-    uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
-    uint32_t *noncep1 = vdata + 75;
-    uint32_t *noncep2 = vdata + 77;
-    uint32_t *noncep3 = vdata + 79;
+    uint32_t *noncep = vdata + 73;   // 9*8 + 1

 //    uint32_t _ALIGN(32) hash64[8];
 //    uint32_t _ALIGN(32) endiandata[32];
@@ -149,47 +146,19 @@ int scanhash_pentablake_4way( int thr_id, struct work *work,
        {
           uint32_t mask = masks[m];
           do {
-              found[0] = found[1] = found[2] = found[3] = false;
-              be32enc( noncep0, n   );
-              be32enc( noncep1, n+1 );
-              be32enc( noncep2, n+2 );
-              be32enc( noncep3, n+3 );
+              be32enc( noncep,   n   );
+              be32enc( noncep+2, n+1 );
+              be32enc( noncep+4, n+2 );
+              be32enc( noncep+6, n+3 );

              pentablakehash_4way( hash, vdata );

-              // return immediately on nonce found, only one submit
-              if ( ( !(hash[7] & mask) ) && fulltest( hash, ptarget ) )
+              for ( int i = 0; i < 4; i++ )
+              if ( !( (hash+(i<<3))[7] & mask )
+                  && fulltest( hash+(i<<3), ptarget ) )
              {
-                  found[0] = true;
-                  num_found++;
-                  nonces[0] = n;
-                  pdata[19] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( (! ((hash+8)[7] & mask) ) && fulltest( hash+8, ptarget ) )
-              {
-                  found[1] = true;
-                  num_found++;
-                  nonces[1] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( ( !((hash+16)[7] & mask) ) && fulltest( hash+16, ptarget ) )
-              {
-                  found[2] = true;
-                  num_found++;
-                  nonces[2] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
-              }
-              if ( ( !((hash+24)[7] & mask) ) && fulltest( hash+24, ptarget ) )
-              {
-                  found[3] = true;
-                  num_found++;
-                  nonces[3] = n;
-                  *hashes_done = n - first_nonce + 1;
-                  return 1;
+                 nonces[ num_found++ ] = n+i;
+                 work_set_target_ratio( work, hash+(i<<3) );
              }
              n += 4;

--- a/algo/blake/pentablake-gate.c
+++ b/algo/blake/pentablake-gate.c
@@ -9,7 +9,7 @@ bool register_pentablake_algo( algo_gate_t* gate )
    gate->scanhash  = (void*)&scanhash_pentablake;
    gate->hash      = (void*)&pentablakehash;
 #endif
-    gate->optimizations = FOUR_WAY_OPT;
+    gate->optimizations = AVX2_OPT;
    gate->get_max64 = (void*)&get_max64_0x3ffff;
    return true;
 };
--- a/algo/blake/pentablake-gate.h
+++ b/algo/blake/pentablake-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-#if defined(FOUR_WAY) && defined(__AVX__)
+#if defined(__AVX2__)
  #define PENTABLAKE_4WAY
 #endif

--- a/algo/blake/sph-blake2s.c
+++ b/algo/blake/sph-blake2s.c
@@ -16,7 +16,7 @@
 #include <stdio.h>

 #include "algo/sha/sph_types.h"
-#include "crypto/blake2s.h"
+#include "sph-blake2s.h"

 static const uint32_t blake2s_IV[8] =
 {
--- a/algo/blake/sph-blake2s.h
+++ b/algo/blake/sph-blake2s.h
--- a/algo/bmw/bmw-hash-4way.c
+++ b/algo/bmw/bmw-hash-4way.c
--- a/algo/bmw/bmw-hash-4way.h
+++ b/algo/bmw/bmw-hash-4way.h
@@ -46,94 +46,37 @@ extern "C"{
 #include "algo/sha/sph_types.h"
 #include "avxdefs.h"

-/**
- * Output size (in bits) for BMW-224.
- */
-#define SPH_SIZE_bmw224   224
-
-/**
- * Output size (in bits) for BMW-256.
- */
 #define SPH_SIZE_bmw256   256

-#if SPH_64
-
-/**
- * Output size (in bits) for BMW-384.
- */
-#define SPH_SIZE_bmw384   384
-
-/**
- * Output size (in bits) for BMW-512.
- */
 #define SPH_SIZE_bmw512   512

-#endif
-
-/**
- * This structure is a context for BMW-224 and BMW-256 computations:
- * it contains the intermediate values and some data from the last
- * entered block. Once a BMW computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running BMW
- * computation can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
 typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char buf[64];    /* first field, for alignment */
-	size_t ptr;
-	sph_u32 H[16];
-#if SPH_64
-	sph_u64 bit_count;
-#else
-	sph_u32 bit_count_high, bit_count_low;
-#endif
-#endif
+   __m128i buf[64];
+   __m128i H[16];
+   size_t ptr;
+   sph_u32 bit_count;  // assume bit_count fits in 32 bits
 } bmw_4way_small_context;

 typedef bmw_4way_small_context bmw256_4way_context;

-#if SPH_64
-
-/**
- * This structure is a context for BMW-384 and BMW-512 computations:
- * it contains the intermediate values and some data from the last
- * entered block. Once a BMW computation has been performed, the
- * context can be reused for another computation.
- *
- * The contents of this structure are private. A running BMW
- * computation can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
 typedef struct {
-#ifndef DOXYGEN_IGNORE
   __m256i buf[16];
   __m256i H[16];
-
-//	unsigned char buf[128];    /* first field, for alignment */
-	size_t ptr;
-//	sph_u64 H[16];
-	sph_u64 bit_count;
-#endif
+   size_t ptr;
+   sph_u64 bit_count;
 } bmw_4way_big_context;

 typedef bmw_4way_big_context bmw512_4way_context;

-#endif
-
 void bmw256_4way_init(void *cc);

 void bmw256_4way(void *cc, const void *data, size_t len);

 void bmw256_4way_close(void *cc, void *dst);

-void bmw256_addbits_and_close(
+void bmw256_4way_addbits_and_close(
 	void *cc, unsigned ub, unsigned n, void *dst);

-#if SPH_64
-
 void bmw512_4way_init(void *cc);

 void bmw512_4way(void *cc, const void *data, size_t len);
@@ -150,5 +93,3 @@ void bmw512_4way_addbits_and_close(
 #endif

 #endif
-
-#endif
--- a/algo/bmw/sse2/bmw.c
+++ b/algo/bmw/sse2/bmw.c
@@ -477,7 +477,7 @@ do { \
        for (u = 0; u < 16; u ++) \
        sph_enc64le_aligned(data + 8 * u, h2[u]); \
        dh = h1; \
-        h = final_b; \
+        h = (sph_u64*)final_b; \
    } \
    /* end wrapped for break loop */ \
    out = dst; \
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -3,7 +3,8 @@
 #include "cryptonight.h"
 #include "miner.h"
 #include "crypto/c_keccak.h"
-#include "avxdefs.h"
+#include <immintrin.h>
+//#include "avxdefs.h"

 void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
 void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -0,0 +1,205 @@
+#if defined(__AVX2__)
+
+#include <stdbool.h>
+#include <unistd.h>
+#include <memory.h>
+#include "cube-hash-2way.h"
+
+// 2x128: runs sp->rounds rounds over sp->h[0..7], working on local copies
+// that are stored back to sp->h at the end.
+static void transform_2way( cube_2way_context *sp )
+{
+    int r;
+    const int rounds = sp->rounds;
+
+    __m256i x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3;
+
+    x0 = _mm256_load_si256( (__m256i*)sp->h     );
+    x1 = _mm256_load_si256( (__m256i*)sp->h + 1 );
+    x2 = _mm256_load_si256( (__m256i*)sp->h + 2 );
+    x3 = _mm256_load_si256( (__m256i*)sp->h + 3 );
+    x4 = _mm256_load_si256( (__m256i*)sp->h + 4 );
+    x5 = _mm256_load_si256( (__m256i*)sp->h + 5 );
+    x6 = _mm256_load_si256( (__m256i*)sp->h + 6 );
+    x7 = _mm256_load_si256( (__m256i*)sp->h + 7 );
+    // Per round, twice: add, permute x0-x3, rotl by 7 then 11, xor, swap x4-x7.
+    for ( r = 0; r < rounds; ++r )
+    {
+        x4 = _mm256_add_epi32( x0, x4 );
+        x5 = _mm256_add_epi32( x1, x5 );
+        x6 = _mm256_add_epi32( x2, x6 );
+        x7 = _mm256_add_epi32( x3, x7 );
+        y0 = x2;
+        y1 = x3;
+        y2 = x0;
+        y3 = x1;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0,  7 ),
+                               _mm256_srli_epi32( y0, 25 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1,  7 ),
+                               _mm256_srli_epi32( y1, 25 ) );
+        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2,  7 ),
+                               _mm256_srli_epi32( y2, 25 ) );
+        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3,  7 ),
+                               _mm256_srli_epi32( y3, 25 ) );
+        x0 = _mm256_xor_si256( x0, x4 );
+        x1 = _mm256_xor_si256( x1, x5 );
+        x2 = _mm256_xor_si256( x2, x6 );
+        x3 = _mm256_xor_si256( x3, x7 );
+        x4 = mm256_swap128_64( x4 );
+        x5 = mm256_swap128_64( x5 );
+        x6 = mm256_swap128_64( x6 );
+        x7 = mm256_swap128_64( x7 );
+        x4 = _mm256_add_epi32( x0, x4 );
+        x5 = _mm256_add_epi32( x1, x5 );
+        x6 = _mm256_add_epi32( x2, x6 );
+        x7 = _mm256_add_epi32( x3, x7 );
+        y0 = x1;
+        y1 = x0;
+        y2 = x3;
+        y3 = x2;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
+                               _mm256_srli_epi32( y0, 21 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ),
+                               _mm256_srli_epi32( y1, 21 ) );
+        x2 = _mm256_xor_si256( _mm256_slli_epi32( y2, 11 ),
+                               _mm256_srli_epi32( y2, 21 ) );
+        x3 = _mm256_xor_si256( _mm256_slli_epi32( y3, 11 ),
+                               _mm256_srli_epi32( y3, 21 ) );
+        x0 = _mm256_xor_si256( x0, x4 );
+        x1 = _mm256_xor_si256( x1, x5 );
+        x2 = _mm256_xor_si256( x2, x6 );
+        x3 = _mm256_xor_si256( x3, x7 );
+        x4 = mm256_swap64_32( x4 );
+        x5 = mm256_swap64_32( x5 );
+        x6 = mm256_swap64_32( x6 );
+        x7 = mm256_swap64_32( x7 );
+    }
+
+    _mm256_store_si256( (__m256i*)sp->h,     x0 );
+    _mm256_store_si256( (__m256i*)sp->h + 1, x1 );
+    _mm256_store_si256( (__m256i*)sp->h + 2, x2 );
+    _mm256_store_si256( (__m256i*)sp->h + 3, x3 );
+    _mm256_store_si256( (__m256i*)sp->h + 4, x4 );
+    _mm256_store_si256( (__m256i*)sp->h + 5, x5 );
+    _mm256_store_si256( (__m256i*)sp->h + 6, x6 );
+    _mm256_store_si256( (__m256i*)sp->h + 7, x7 );
+
+}
+
+cube_2way_context cube_2way_ctx_cache __attribute__ ((aligned (64)));
+
+// Overwrite sp with the state cached by cube_2way_init. Always returns 0.
+int cube_2way_reinit( cube_2way_context *sp )
+{
+   memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
+   return 0;
+}
+
+// Build the initial state in cube_2way_ctx_cache, then copy it to sp.
+// hashlen and blocksize are stored in units of 16 bytes. Always returns 0.
+int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
+                       int blockbytes )
+{
+    cube_2way_context *cache = &cube_2way_ctx_cache;
+
+    cache->hashlen   = hashbitlen/128;
+    cache->blocksize = blockbytes/16;
+    cache->rounds    = rounds;
+    cache->pos       = 0;
+
+    // h[0] carries the parameters in both 128-bit halves; h[1..7] are zero.
+    cache->h[0] = _mm256_set_epi32( 0, rounds, blockbytes, hashbitlen / 8,
+                                    0, rounds, blockbytes, hashbitlen / 8 );
+    for ( int k = 1; k < 8; ++k )
+       cache->h[k] = m256_zero;
+
+    for ( int k = 0; k < 10; ++k )
+       transform_2way( cache );
+
+    memcpy( sp, cache, sizeof(cube_2way_context) );
+    return 0;
+}
+
+
+// Xor size/16 __m256i words from data into the state, running the transform
+// each time sp->blocksize words have been absorbed. data is assumed 256-bit
+// aligned, size a multiple of 16 (64 or 80 in current use). Returns 0.
+int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
+{
+    const __m256i *src = (__m256i*)data;
+    const int nwords = size / 16;      // one __m256i per 16 bytes of size
+
+    for ( int w = 0; w < nwords; w++ )
+    {
+        const int p = sp->pos;
+        sp->h[p] = _mm256_xor_si256( sp->h[p], src[w] );
+        // a full block has been absorbed: mix the state and start over
+        if ( ++sp->pos == sp->blocksize )
+        {
+           transform_2way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    return 0;
+}
+
+// Finalize a 2-way hash: pad, run the final rounds on sp, write the digests.
+// output receives sp->hashlen __m256i words. Always returns 0.
+int cube_2way_close( cube_2way_context *sp, void *output )
+{
+    __m256i *hash = (__m256i*)output;
+
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
+                    _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
+                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+    transform_2way( sp );
+
+    sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
+                                                             1,0,0,0 ) );
+    // Final rounds must run on sp, not on the shared init cache.
+    for ( int i = 0; i < 10; ++i )
+       transform_2way( sp );
+    for ( int i = 0; i < sp->hashlen; i++ )
+       hash[i] = sp->h[i];
+    return 0;
+}
+
+// Absorb the remaining input (size/16 words from data), pad, finalize sp
+// and write sp->hashlen __m256i words of digest to output. Returns 0.
+int cube_2way_update_close( cube_2way_context *sp, void *output,
+                               const void *data, size_t size )
+{
+    const int len = size / 16;
+    const __m256i *in = (const __m256i*)data;
+    __m256i *hash = (__m256i*)output;
+
+    for ( int i = 0; i < len; i++ )
+    {
+        sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ], in[i] );
+        sp->pos++;
+        if ( sp->pos == sp->blocksize )
+        {
+           transform_2way( sp );
+           sp->pos = 0;
+        }
+    }
+
+    // pos is zero for 64 byte data, 1 for 80 byte data.
+    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
+                    _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
+                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+    transform_2way( sp );
+
+    sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
+                                                             1,0,0,0 ) );
+    // Final rounds must run on sp, not on the shared init cache.
+    for ( int i = 0; i < 10; ++i )
+       transform_2way( sp );
+    for ( int i = 0; i < sp->hashlen; i++ )
+       hash[i] = sp->h[i];
+    return 0;
+}
+
+#endif
--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -0,0 +1,36 @@
+#ifndef CUBE_HASH_2WAY_H__
+#define CUBE_HASH_2WAY_H__
+
+#if defined(__AVX2__)
+
+#include <stdint.h>
+#include "avxdefs.h"
+
+// 2x128, 2 way parallel SSE2
+
+struct _cube_2way_context
+{
+    int hashlen;           // digest length in 16-byte units (hashbitlen/128)
+    int rounds;            // rounds run by each transform
+    int blocksize;         // block length in 16-byte units (blockbytes/16)
+    int pos;               // index into h of the next word to absorb input
+    __m256i h[8] __attribute__ ((aligned (64)));
+};
+
+typedef struct _cube_2way_context cube_2way_context;
+
+int cube_2way_init( cube_2way_context* sp, int hashbitlen, int rounds,
+                       int blockbytes );
+// reinitialize context with same parameters, much faster.
+int cube_2way_reinit( cube_2way_context *sp );
+
+int cube_2way_update( cube_2way_context *sp, const void *data, size_t size );
+
+int cube_2way_close( cube_2way_context *sp, void *output );
+
+int cube_2way_update_close( cube_2way_context *sp, void *output,
+                            const void *data, size_t size );
+
+
+#endif
+#endif
--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -10,6 +10,10 @@
 #endif
 #include "cubehash_sse2.h"
 #include "algo/sha/sha3-defs.h"
+#include <stdbool.h>
+#include <unistd.h>
+#include <memory.h>
+#include "avxdefs.h"

 static void transform( cubehashParam *sp )
 {
@@ -18,7 +22,7 @@ static void transform( cubehashParam *sp )

 #ifdef __AVX2__

-    __m256i x0, x1, x2, x3, y0, y1;
+    register __m256i x0, x1, x2, x3, y0, y1;

    x0 = _mm256_load_si256( (__m256i*)sp->x     );
    x1 = _mm256_load_si256( (__m256i*)sp->x + 1 );   
@@ -29,20 +33,19 @@ static void transform( cubehashParam *sp )
    { 
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
-        y0 = x1;
-        y1 = x0;
-        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
+        y0 = x0;
+        x0 = _mm256_xor_si256( _mm256_slli_epi32( x1, 7 ),
+                               _mm256_srli_epi32( x1, 25 ) );
+        x1 = _mm256_xor_si256( _mm256_slli_epi32( y0, 7 ),
                               _mm256_srli_epi32( y0, 25 ) );
-        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 7 ),
-                               _mm256_srli_epi32( y1, 25 ) );
        x0 = _mm256_xor_si256( x0, x2 );
        x1 = _mm256_xor_si256( x1, x3 );
        x2 = _mm256_shuffle_epi32( x2, 0x4e );
        x3 = _mm256_shuffle_epi32( x3, 0x4e );
        x2 = _mm256_add_epi32( x0, x2 );
        x3 = _mm256_add_epi32( x1, x3 );
-        y0 = _mm256_permute2f128_si256( x0, x0, 1 );
-        y1 = _mm256_permute2f128_si256( x1, x1, 1 );
+        y0 = _mm256_permute4x64_epi64( x0, 0x4e );
+        y1 = _mm256_permute4x64_epi64( x1, 0x4e );
        x0 = _mm256_xor_si256( _mm256_slli_epi32( y0, 11 ),
                               _mm256_srli_epi32( y0, 21 ) );
        x1 = _mm256_xor_si256( _mm256_slli_epi32( y1, 11 ), 
@@ -125,6 +128,18 @@ static void transform( cubehashParam *sp )
 #endif
 }  // transform

+// Cubehash context initialization is very expensive.
+// Cache the initial value for faster reinitialization.
+cubehashParam cube_ctx_cache __attribute__ ((aligned (64)));
+
+int cubehashReinit( cubehashParam *sp )
+{
+   memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
+   return SUCCESS;
+
+}
+
+// Initialize the cache then copy to sp.
 int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)
 {
    int i;
@@ -135,24 +150,26 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes)

    /* Sanity checks */
    if ( rounds <= 0 || rounds > 32 )
-         rounds = CUBEHASH_ROUNDS;
+       rounds = CUBEHASH_ROUNDS;
    if ( blockbytes <= 0 || blockbytes >= 256)
-         blockbytes = CUBEHASH_BLOCKBYTES;
+       blockbytes = CUBEHASH_BLOCKBYTES;

    // all sizes of __m128i
-    sp->hashlen   = hashbitlen/128;
-    sp->blocksize = blockbytes/16;
-    sp->rounds    = rounds;
-    sp->pos       = 0;
+    cube_ctx_cache.hashlen   = hashbitlen/128;
+    cube_ctx_cache.blocksize = blockbytes/16;
+    cube_ctx_cache.rounds    = rounds;
+    cube_ctx_cache.pos       = 0;

    for ( i = 0; i < 8; ++i )
-         sp->x[i] = _mm_set_epi32(0, 0, 0, 0);
+       cube_ctx_cache.x[i] = _mm_setzero_si128();;

-    sp->x[0] = _mm_set_epi32( 0, rounds, blockbytes, hashbitlen / 8 );
+    cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes,
+                                         hashbitlen / 8 );

    for ( i = 0; i < 10; ++i )
-         transform(sp);
-//    sp->pos = 0;
+       transform( &cube_ctx_cache );
+
+    memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) );
    return SUCCESS;
 }

--- a/algo/cubehash/sse2/cubehash_sse2.h
+++ b/algo/cubehash/sse2/cubehash_sse2.h
@@ -29,6 +29,8 @@ extern "C" {
 #endif

 int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes);
+// reinitialize context with same parameters, much faster.
+int cubehashReinit( cubehashParam* sp );

 int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size);

--- a/algo/echo/aes_ni/architectures
+++ b/algo/echo/aes_ni/architectures
@@ -1,2 +0,0 @@
-amd64
-x86
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -14,18 +14,20 @@
 * Institute of Applied Mathematics, Middle East Technical University, Turkey.
 *
 */
+#if defined(__AES__)

 #include <memory.h>
 #include "miner.h"
 #include "hash_api.h"
-#include "vperm.h"
-
+//#include "vperm.h"
+#include <immintrin.h>
+/*
 #ifndef NO_AES_NI
 #include <wmmintrin.h>
 #else
 #include <tmmintrin.h>
 #endif
-
+*/

 MYALIGN const unsigned int _k_s0F[] = {0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F};
 MYALIGN const unsigned int _k_ipt[] = {0x5A2A7000, 0xC2B2E898, 0x52227808, 0xCABAE090, 0x317C4D00, 0x4C01307D, 0xB0FDCC81, 0xCD80B1FC};
@@ -246,7 +248,8 @@ void DumpState(__m128i *ps)
 void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
 {
 	unsigned int r, b, i, j;
-	__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
+//      __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
+	__m128i t1, t2, s2, k1;
 	__m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 


@@ -396,7 +399,7 @@ HashReturn init_echo(hashState_echo *ctx, int nHashSize)
 {
 	int i, j;

-	ctx->k = _mm_xor_si128(ctx->k, ctx->k);
+        ctx->k = _mm_setzero_si128(); 
 	ctx->processed_bits = 0;
 	ctx->uBufferBytes = 0;

@@ -742,4 +745,4 @@ HashReturn hash_echo(int hashbitlen, const BitSequence *data, DataLength databit
 	return SUCCESS;
 }

-
+#endif
--- a/algo/echo/aes_ni/implementors
+++ b/algo/echo/aes_ni/implementors
@@ -1 +0,0 @@
-Çağdaş Çalık
--- a/algo/echo/aes_ni/vperm.h
+++ b/algo/echo/aes_ni/vperm.h
@@ -1,120 +0,0 @@
-/*
- * file        : vperm.h
- * version     : 1.0.208
- * date        : 14.12.2010
- * 
- * vperm implementation of AES s-box 
- *
- * Credits: Adapted from Mike Hamburg's AES implementation, http://crypto.stanford.edu/vpaes/
- *
- * Cagdas Calik
- * ccalik@metu.edu.tr
- * Institute of Applied Mathematics, Middle East Technical University, Turkey.
- *
- */
-
-#ifndef VPERM_H
-#define VPERM_H
-
-#include "algo/sha/sha3_common.h"
-#include <tmmintrin.h>
-
-/*
-extern const unsigned int _k_s0F[];
-extern const unsigned int _k_ipt[];
-extern const unsigned int _k_opt[];
-extern const unsigned int _k_inv[];
-extern const unsigned int _k_sb1[];
-extern const unsigned int _k_sb2[];
-extern const unsigned int _k_sb3[];
-extern const unsigned int _k_sb4[];
-extern const unsigned int _k_sb5[];
-extern const unsigned int _k_sb7[];
-extern const unsigned int _k_sbo[];
-extern const unsigned int _k_h63[];
-extern const unsigned int _k_hc6[];
-extern const unsigned int _k_h5b[];
-extern const unsigned int _k_h4e[];
-extern const unsigned int _k_h0e[];
-extern const unsigned int _k_h15[];
-extern const unsigned int _k_aesmix1[];
-extern const unsigned int _k_aesmix2[];
-extern const unsigned int _k_aesmix3[];
-extern const unsigned int _k_aesmix4[];
-*/
-
-// input: x, table
-// output: x
-#define TRANSFORM(x, table, t1, t2)\
-	t1 = _mm_andnot_si128(M128(_k_s0F), x);\
-	t1 = _mm_srli_epi32(t1, 4);\
-	x  = _mm_and_si128(x, M128(_k_s0F));\
-	t1 = _mm_shuffle_epi8(*((__m128i*)table + 1), t1);\
-	x  = _mm_shuffle_epi8(*((__m128i*)table + 0), x);\
-	x  = _mm_xor_si128(x, t1)
-
-#if 0
-// compiled erroneously with 32-bit msc compiler
-	t2 = _mm_shuffle_epi8(table[0], x);\
-	x  = _mm_shuffle_epi8(table[1], t1);\
-	x  = _mm_xor_si128(x, t2)
-#endif
-
-// input: x
-// output: t2, t3
-#define SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4)\
-	t1 = _mm_andnot_si128(M128(_k_s0F), x);\
-	t1 = _mm_srli_epi32(t1, 4);\
-	x  = _mm_and_si128(x, M128(_k_s0F));\
-	t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 1), x);\
-	x  = _mm_xor_si128(x, t1);\
-	t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t1);\
-	t3 = _mm_xor_si128(t3, t2);\
-	t4 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), x);\
-	t4 = _mm_xor_si128(t4, t2);\
-	t2 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t3);\
-	t2 = _mm_xor_si128(t2, x);\
-	t3 = _mm_shuffle_epi8(*((__m128i*)_k_inv + 0), t4);\
-	t3 = _mm_xor_si128(t3, t1);\
-
-
-// input: x1, x2, table
-// output: y
-#define VPERM_LOOKUP(x1, x2, table, y, t)\
-	t = _mm_shuffle_epi8(*((__m128i*)table + 0), x1);\
-	y = _mm_shuffle_epi8(*((__m128i*)table + 1), x2);\
-	y = _mm_xor_si128(y, t)
-
-
-// input: x
-// output: x
-#define SUBSTITUTE_VPERM(x, t1, t2, t3, t4)  \
-	TRANSFORM(x, _k_ipt, t1, t2);\
-	SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
-	VPERM_LOOKUP(t2, t3, _k_sbo, x, t1);\
-	x = _mm_xor_si128(x, M128(_k_h63))
-
-
-// input: x
-// output: x
-#define AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3) \
-	SUBSTITUTE_VPERM_CORE(x, t1, t2, t3, t4);\
-	VPERM_LOOKUP(t2, t3, _k_sb1, s1, t1);\
-	VPERM_LOOKUP(t2, t3, _k_sb2, s2, t1);\
-	s3 = _mm_xor_si128(s1, s2);\
-	x = _mm_shuffle_epi8(s2, M128(_k_aesmix1));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s3, M128(_k_aesmix2)));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix3)));\
-	x = _mm_xor_si128(x, _mm_shuffle_epi8(s1, M128(_k_aesmix4)));\
-	x = _mm_xor_si128(x, M128(_k_h5b))
-
-
-// input: x
-// output: x
-#define AES_ROUND_VPERM(x, t1, t2, t3, t4, s1, s2, s3) \
-	TRANSFORM(x, _k_ipt, t1, t2);\
-	AES_ROUND_VPERM_CORE(x, t1, t2, t3, t4, s1, s2, s3);\
-	TRANSFORM(x, _k_opt, t1, t2)
-
-#endif // VPERM_H
-
--- a/algo/groestl/myr-groestl.c
+++ b/algo/groestl/myr-groestl.c
@@ -1,4 +1,4 @@
-#include "algo-gate-api.h"
+#include "myrgr-gate.h"

 #include <stdio.h>
 #include <stdlib.h>
@@ -10,8 +10,6 @@
 #else
  #include "aes_ni/hash-groestl.h"
 #endif
-
-#include <openssl/sha.h>
 #include "algo/sha/sph_sha2.h"

 typedef struct {
@@ -20,11 +18,7 @@ typedef struct {
 #else
    hashState_groestl       groestl;
 #endif
-#ifndef USE_SPH_SHA
-   SHA256_CTX         sha;
-#else
-   sph_sha256_context sha;
-#endif
+    sph_sha256_context sha;
 } myrgr_ctx_holder;

 myrgr_ctx_holder myrgr_ctx;
@@ -36,44 +30,37 @@ void init_myrgr_ctx()
 #else
     init_groestl (&myrgr_ctx.groestl, 64 );
 #endif
-#ifndef USE_SPH_SHA
-   SHA256_Init( &myrgr_ctx.sha );
-#else
-   sph_sha256_init( &myrgr_ctx.sha );
-#endif
+     sph_sha256_init(&myrgr_ctx.sha);
 }

-void myriadhash( void *output, const void *input )
+void myriad_hash(void *output, const void *input)
 {
-     myrgr_ctx_holder ctx __attribute__ ((aligned (64)));
-     memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
-     uint32_t hash[16] __attribute__ ((aligned (64))); 
+        myrgr_ctx_holder ctx;
+        memcpy( &ctx, &myrgr_ctx, sizeof(myrgr_ctx) );
+
+ 	uint32_t _ALIGN(32) hash[16];

 #ifdef NO_AES_NI
-     sph_groestl512(&ctx.groestl, input, 80);
-     sph_groestl512_close(&ctx.groestl, hash);
+	sph_groestl512(&ctx.groestl, input, 80);
+	sph_groestl512_close(&ctx.groestl, hash);
 #else
-     update_and_final_groestl( &ctx.groestl, (char*)input,
-                               (const char*)input, 640 );
+        update_groestl( &ctx.groestl, (char*)input, 640 );
+        final_groestl( &ctx.groestl, (char*)hash);
 #endif

-#ifndef USE_SPH_SHA
-     SHA256_Update( &ctx.sha, hash, 64 );
-     SHA256_Final( (unsigned char*) hash, &ctx.sha );
-#else
-     sph_sha256(&ctx.sha, hash, 64);
-     sph_sha256_close(&ctx.sha, hash);
-#endif
-     memcpy(output, hash, 32);
+	sph_sha256(&ctx.sha, hash, 64);
+	sph_sha256_close(&ctx.sha, hash);
+
+	memcpy(output, hash, 32);
 }

-int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done)
+int scanhash_myriad(int thr_id, struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done)
 {
        uint32_t *pdata = work->data;
        uint32_t *ptarget = work->target;

-	uint32_t endiandata[20] __attribute__ ((aligned (64)));
+	uint32_t _ALIGN(64) endiandata[20];
 	const uint32_t first_nonce = pdata[19];
 	uint32_t nonce = first_nonce;

@@ -84,9 +71,9 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,

 	do {
 		const uint32_t Htarg = ptarget[7];
-		uint32_t hash[8] __attribute__ ((aligned (64)));
+		uint32_t hash[8];
 		be32enc(&endiandata[19], nonce);
-		myriadhash(hash, endiandata);
+		myriad_hash(hash, endiandata);

 		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
 			pdata[19] = nonce;
@@ -101,14 +88,15 @@ int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
-
+/*
 bool register_myriad_algo( algo_gate_t* gate )
 {
-    gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | SHA_OPT;
+    gate->optimizations = SSE2_OPT | AES_OPT;
    init_myrgr_ctx();
    gate->scanhash = (void*)&scanhash_myriad;
    gate->hash     = (void*)&myriadhash;
+//    gate->hash_alt = (void*)&myriadhash;
    gate->get_max64 = (void*)&get_max64_0x3ffff;
    return true;
 };
-
+*/
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -0,0 +1,108 @@
+#include "myrgr-gate.h"
+
+#if defined(MYRGR_4WAY)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aes_ni/hash-groestl.h"
+#include "algo/sha/sha2-hash-4way.h"
+
+// Hashing state for one call: a Groestl state plus a 4-way SHA-256 context.
+typedef struct {
+    hashState_groestl       groestl;
+    sha256_4way_context     sha;
+} myrgr_4way_ctx_holder;
+myrgr_4way_ctx_holder myrgr_4way_ctx;  // initialised template, copied per hash
+// Initialise the shared template; takes no arguments (explicit prototype).
+void init_myrgr_4way_ctx( void )
+{
+     init_groestl( &myrgr_4way_ctx.groestl, 64 );
+     sha256_4way_init( &myrgr_4way_ctx.sha );
+}
+
+// Hash four lanes at once: de-interleave the 4x32 input, run Groestl on each
+// lane in place, re-interleave, then 4-way SHA-256; writes 4 x 32-byte digests.
+void myriad_4way_hash( void *output, const void *input )
+{
+     uint32_t hash0[20] __attribute__ ((aligned (64)));
+     uint32_t hash1[20] __attribute__ ((aligned (64)));
+     uint32_t hash2[20] __attribute__ ((aligned (64)));
+     uint32_t hash3[20] __attribute__ ((aligned (64)));
+     uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+     uint8_t *out = (uint8_t*)output;  // byte pointer: no arithmetic on void*
+     myrgr_4way_ctx_holder ctx;
+     memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );
+
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
+     // One Groestl state serves all lanes: restore it from the template between lanes.
+     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 640 );
+     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
+     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );
+
+     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     sha256_4way( &ctx.sha, vhash, 64 );
+     sha256_4way_close( &ctx.sha, vhash );
+     mm_deinterleave_4x32( out, out+32, out+64, out+96, vhash, 256 );
+}
+
+// Scan nonces four at a time starting at work->data[19].  Each pass hashes
+// lanes n..n+3; every lane meeting the target is stored in work->nonces.
+// Stops on a find, near max_nonce, or on a work restart for this thread.
+// Returns the number of nonces found and reports progress in *hashes_done.
+int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done )
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));
+   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
+   uint32_t _ALIGN(64) edata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76; // 19*4
+
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+
+   // Read the quick-reject word only after the benchmark override above,
+   // so the cheap pre-check and fulltest() use the same target.
+   const uint32_t Htarg = ptarget[7];
+
+   swab32_array( edata, pdata, 20 );
+   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+
+   do {
+      be32enc( noncep,   n   );
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );
+
+      myriad_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      // Each lane's digest is 8 words: lane i starts at hash + 8*i.
+      for ( int i = 0; i < 4; i++ )
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+      {
+          pdata[19] = n+i;
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart);
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/groestl/myrgr-gate.c
+++ b/algo/groestl/myrgr-gate.c
@@ -0,0 +1,18 @@
+#include "myrgr-gate.h"
+
+// Register Myriad-Groestl: 4-way path when MYRGR_4WAY is defined, else scalar.
+bool register_myriad_algo( algo_gate_t* gate )
+{
+#if defined (MYRGR_4WAY)
+  init_myrgr_4way_ctx();
+  gate->scanhash  = (void*)&scanhash_myriad_4way;
+  gate->hash      = (void*)&myriad_4way_hash;
+#else
+  init_myrgr_ctx();
+  gate->scanhash  = (void*)&scanhash_myriad;
+  gate->hash      = (void*)&myriad_hash;
+#endif
+  gate->optimizations = AES_OPT | AVX2_OPT;
+  gate->get_max64 = (void*)&get_max64_0x3ffff;
+  return true;   // this function has no failure path
+}
--- a/algo/groestl/myrgr-gate.h
+++ b/algo/groestl/myrgr-gate.h
@@ -0,0 +1,30 @@
+#ifndef MYRGR_GATE_H__
+#define MYRGR_GATE_H__
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+
+// The 4-way code path is selected only when both AVX2 and AES are enabled.
+#if defined(__AVX2__) && defined(__AES__)
+  #define MYRGR_4WAY
+#endif
+
+#if defined(MYRGR_4WAY)
+
+// 4-way variants: hash four interleaved lanes per call.
+void myriad_4way_hash( void *output, const void *input );
+int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                         uint64_t *hashes_done );
+void init_myrgr_4way_ctx( void );
+
+#endif
+
+// Scalar variants: one hash per call; always declared.
+void myriad_hash( void *output, const void *input );
+int scanhash_myriad( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+// Context initialisers take no arguments (explicit prototypes).
+void init_myrgr_ctx( void );
+
+#endif
+
--- a/algo/hamsi/hamsi-hash-4way.c
+++ b/algo/hamsi/hamsi-hash-4way.c
@@ -0,0 +1,935 @@
+/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */
+/*
+ * Hamsi implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+//#include "miner.h"
+#include "hamsi-hash-4way.h"
+
+#if defined(__AVX2__)
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one
+ * table lookup during message expansion (1 to 8, inclusive). If we note
+ * w the number of bits per message word (w=32 for Hamsi-224/256, w=64
+ * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for
+ * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level,
+ * then we will get t tables (where t=ceil(w/n)) of individual size
+ * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and
+ * n=5, there are 7 tables, but the last one uses only two bits on
+ * input, not five).
+ *
+ * Also, we read t rows of r words from RAM. Words in a given row are
+ * concatenated in RAM in that order, so most of the cost is about
+ * reading the first row word; comparatively, cache misses are thus
+ * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8).
+ *
+ * When n=1, tables are "special" in that we omit the first entry of
+ * each table (which always contains 0), so that total table size is
+ * halved.
+ *
+ * We thus have the following (size1 is the cumulative table size of
+ * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2
+ * are for Hamsi-224/256 and Hamsi-384/512, respectively).
+ *
+ *   n      size1      size2    t1    t2
+ * ---------------------------------------
+ *   1       1024       4096    32    64
+ *   2       2048       8192    16    32
+ *   3       2688      10880    11    22
+ *   4       4096      16384     8    16
+ *   5       6272      25600     7    13
+ *   6      10368      41984     6    11
+ *   7      16896      73856     5    10
+ *   8      32768     131072     4     8
+ *
+ * So there is a trade-off: a lower n makes the tables fit better in
+ * L1 cache, but increases the number of memory accesses. The optimal
+ * value depends on the amount of available L1 cache and the relative
+ * impact of a cache miss.
+ *
+ * Experimentally, in ideal benchmark conditions (which are not necessarily
+ * realistic with regard to L1 cache contention), it seems that n=8 is
+ * the best value on "big" architectures (those with 32 kB or more of L1
+ * cache), while n=4 is better on "small" architectures. This was tested
+ * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3
+ * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302
+ * (8 kB L1 cache).
+ *
+ * Note: with n=1, the 32 tables (actually implemented as one big table)
+ * are read entirely and sequentially, regardless of the input data,
+ * thus avoiding any data-dependent table access pattern.
+ */
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+//#include "hamsi-helper-4way.c"
+
+static const sph_u32 IV512[] = {   /* 16 words x 32 bits = 512 bits */
+	SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172),
+	SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062),
+	SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33),
+	SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48),
+	SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c),
+	SPH_C32(0x6769756d)   /* most-significant-byte-first, the words spell ASCII: */
+};   /* "steelpark Arenberg 10, bus 2446, B-3001 Leuven-Heverlee, Belgium" */
+
+static const sph_u32 alpha_n[] = {   /* 32 constant words; consumer not visible here */
+	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
+	SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
+	SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
+	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
+};   /* NOTE(review): presumably the "normal"-round alpha constants -- TODO confirm */
+
+static const sph_u32 alpha_f[] = {   /* 32 constant words; consumer not visible here */
+	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
+	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
+	SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
+	SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
+	SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
+	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
+};   /* NOTE(review): presumably the "final"-round alpha constants -- TODO confirm */
+
+// imported from hamsi helper
+
+/* Note: this table lists bits within each byte from least
+   significant to most significant. */
+static const sph_u32 T512[64][16] = {
+	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
+	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
+	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
+	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
+	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
+	  SPH_C32(0x9e69af68) },
+	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
+	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
+	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
+	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
+	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
+	  SPH_C32(0x0c26f262) },
+	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
+	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
+	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
+	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
+	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
+	  SPH_C32(0xdc24e61f) },
+	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
+	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
+	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
+	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
+	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
+	  SPH_C32(0x3daac2da) },
+	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
+	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
+	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
+	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
+	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
+	  SPH_C32(0x78cace29) },
+	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
+	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
+	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
+	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
+	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
+	  SPH_C32(0x2dd1f9ab) },
+	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
+	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
+	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
+	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
+	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
+	  SPH_C32(0xbf2c0be2) },
+	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
+	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
+	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
+	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
+	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
+	  SPH_C32(0x32219526) },
+	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
+	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
+	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
+	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
+	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
+	  SPH_C32(0xac8e6c88) },
+	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
+	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
+	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
+	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
+	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
+	  SPH_C32(0x7b1bd6b9) },
+	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
+	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
+	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
+	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
+	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
+	  SPH_C32(0xf746c320) },
+	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
+	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
+	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
+	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
+	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
+	  SPH_C32(0x69505b3a) },
+	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
+	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
+	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
+	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
+	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
+	  SPH_C32(0x8a341574) },
+	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
+	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
+	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
+	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
+	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
+	  SPH_C32(0x450360bf) },
+	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
+	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
+	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
+	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
+	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
+	  SPH_C32(0xf3d45758) },
+	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
+	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
+	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
+	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
+	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
+	  SPH_C32(0x925c44e9) },
+	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
+	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
+	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
+	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
+	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
+	  SPH_C32(0xa123ff9f) },
+	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
+	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
+	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
+	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
+	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
+	  SPH_C32(0x1568ff0f) },
+	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
+	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
+	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
+	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
+	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
+	  SPH_C32(0xc5c1eb3e) },
+	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
+	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
+	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
+	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
+	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
+	  SPH_C32(0x1af21fe1) },
+	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
+	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
+	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
+	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
+	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
+	  SPH_C32(0x857f3c2b) },
+	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
+	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
+	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
+	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
+	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
+	  SPH_C32(0x2ba05a55) },
+	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
+	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
+	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
+	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
+	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
+	  SPH_C32(0xfeabf254) },
+	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
+	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
+	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
+	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
+	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
+	  SPH_C32(0xfe1cdc7f) },
+	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
+	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
+	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
+	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
+	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
+	  SPH_C32(0xb0a51834) },
+	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
+	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
+	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
+	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
+	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
+	  SPH_C32(0xa6b8c28d) },
+	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
+	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
+	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
+	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
+	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
+	  SPH_C32(0x3a4e99d7) },
+	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
+	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
+	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
+	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
+	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
+	  SPH_C32(0xe1844257) },
+	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
+	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
+	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
+	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
+	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
+	  SPH_C32(0x2c3b504e) },
+	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
+	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
+	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
+	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
+	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
+	  SPH_C32(0x524a0d59) },
+	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
+	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
+	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
+	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
+	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
+	  SPH_C32(0x378dd173) },
+	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
+	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
+	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
+	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
+	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
+	  SPH_C32(0x8b6c72bd) },
+	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
+	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
+	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
+	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
+	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
+	  SPH_C32(0x8e67b7fa) },
+	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
+	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
+	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
+	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
+	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
+	  SPH_C32(0x443d3004) },
+	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
+	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
+	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
+	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
+	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
+	  SPH_C32(0xf4f6ea7b) },
+	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
+	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
+	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
+	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
+	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
+	  SPH_C32(0x979961d0) },
+	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
+	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
+	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
+	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
+	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
+	  SPH_C32(0x98aa496e) },
+	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
+	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
+	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
+	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
+	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
+	  SPH_C32(0x094e3198) },
+	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
+	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
+	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
+	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
+	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
+	  SPH_C32(0xe86cba2e) },
+	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
+	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
+	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
+	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
+	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
+	  SPH_C32(0x4b7eec55) },
+	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
+	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
+	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
+	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
+	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
+	  SPH_C32(0x1e7536a6) },
+	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
+	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
+	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
+	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
+	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
+	  SPH_C32(0x24314f17) },
+	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
+	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
+	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
+	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
+	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
+	  SPH_C32(0x9075b1ce) },
+	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
+	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
+	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
+	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
+	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
+	  SPH_C32(0x9b6ef888) },
+	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
+	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
+	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
+	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
+	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
+	  SPH_C32(0xd8b61463) },
+	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
+	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
+	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
+	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
+	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
+	  SPH_C32(0x3ea660f7) },
+	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
+	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
+	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
+	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
+	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
+	  SPH_C32(0x7f975691) },
+	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
+	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
+	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
+	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
+	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
+	  SPH_C32(0x2c94459e) },
+	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
+	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
+	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
+	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
+	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
+	  SPH_C32(0x56a7b19f) },
+	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
+	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
+	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
+	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
+	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
+	  SPH_C32(0x81fdf908) },
+	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
+	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
+	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
+	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
+	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
+	  SPH_C32(0x5bd61539) },
+	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
+	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
+	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
+	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
+	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
+	  SPH_C32(0x15b961e7) },
+	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
+	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
+	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
+	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
+	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
+	  SPH_C32(0x2a2c18f0) },
+	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
+	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
+	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
+	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
+	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
+	  SPH_C32(0x551e3d6e) },
+	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
+	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
+	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
+	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
+	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
+	  SPH_C32(0x33c5244f) },
+	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
+	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
+	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
+	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
+	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
+	  SPH_C32(0x8a58e6a4) },
+	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
+	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
+	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
+	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
+	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
+	  SPH_C32(0xda878000) },
+	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
+	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
+	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
+	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
+	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
+	  SPH_C32(0x3c5dfffe) },
+	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
+	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
+	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
+	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
+	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
+	  SPH_C32(0x7b1675d7) },
+	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
+	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
+	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
+	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
+	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
+	  SPH_C32(0x2879ebac) },
+	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
+	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
+	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
+	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
+	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
+	  SPH_C32(0xbe0a679e) },
+	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
+	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
+	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
+	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
+	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
+	  SPH_C32(0x30aebcf7) },
+	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
+	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
+	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
+	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
+	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
+	  SPH_C32(0xc7ff60f0) },
+	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
+	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
+	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
+	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
+	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
+	  SPH_C32(0xe7e00a94) }
+};
+/*
+ * INPUT_BIG: message expansion for one 4-way block.
+ * Loads *buf and walks the 64 bit positions of its 64-bit lanes, least
+ * significant bit first. For every position a mask is derived from that
+ * bit and the masked 16-word row of T512 belonging to the position is
+ * XORed into m0..m7. Each m register receives two consecutive table
+ * words, repeated in all four 64-bit lanes.
+ * Expects `buf` (pointer to __m256i) and m0..m7 in the enclosing scope.
+ * NOTE(review): presumably m256_one_64 is 1 in every 64-bit lane and
+ * mm256_negate_32 turns the duplicated 0/1 into a 0/all-ones mask per
+ * 32-bit element; both are project helpers not shown here, confirm.
+ */
+#define INPUT_BIG \
+do { \
+  __m256i db = *buf; \
+  const sph_u32 *tp = &T512[0][0]; \
+  m0 = m256_zero; \
+  m1 = m256_zero; \
+  m2 = m256_zero; \
+  m3 = m256_zero; \
+  m4 = m256_zero; \
+  m5 = m256_zero; \
+  m6 = m256_zero; \
+  m7 = m256_zero; \
+  for ( int u = 0; u < 64; u++ ) \
+  { \
+     __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \
+     dm = mm256_negate_32( _mm256_or_si256( dm, \
+                         _mm256_slli_epi64( dm, 32 ) ) ); \
+     m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \
+                                    tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \
+     m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \
+                                    tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \
+     m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \
+                                    tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \
+     m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \
+                                    tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \
+     m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \
+                                    tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \
+     m5 = _mm256_xor_si256( m5, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \
+                                    tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \
+     m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \
+                                    tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \
+     m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \
+                  _mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \
+                                    tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \
+     tp += 0x10; /* advance to the next 16-word table row */ \
+     db = _mm256_srli_epi64( db, 1 ); /* bring the next bit into position 0 */ \
+  } \
+} while (0)
+/*
+ * SBOX( a, b, c, d ): substitution step built only from bitwise
+ * AND / OR / XOR on whole 256-bit vectors plus one final mm256_not.
+ * All four arguments are read and overwritten in place, and each is
+ * evaluated several times, so pass plain variables only.
+ * NOTE(review): mm256_not is a project helper, presumably bitwise NOT.
+ */
+#define SBOX( a, b, c, d ) \
+do { \
+  __m256i t; \
+  t = a; \
+  a = _mm256_and_si256( a, c ); \
+  a = _mm256_xor_si256( a, d ); \
+  c = _mm256_xor_si256( c, b ); \
+  c = _mm256_xor_si256( c, a ); \
+  d = _mm256_or_si256( d, t ); \
+  d = _mm256_xor_si256( d, b ); \
+  t = _mm256_xor_si256( t, c ); \
+  b = d; \
+  d = _mm256_or_si256( d, t ); \
+  d = _mm256_xor_si256( d, a ); \
+  a = _mm256_and_si256( a, b ); \
+  t = _mm256_xor_si256( t, a ); \
+  b = _mm256_xor_si256( b, d ); \
+  b = _mm256_xor_si256( b, t ); \
+  a = c; /* final permutation of the four outputs */ \
+  c = b; \
+  b = d; \
+  d = mm256_not( t ); \
+} while (0)
+/*
+ * L( a, b, c, d ): linear mixing step on four vectors of 32-bit elements.
+ * Combines mm256_rol_32 by 13, 3, 1, 7, 5 and 22 with XORs and 32-bit
+ * left shifts by 3 and 7. All four arguments are updated in place and
+ * evaluated several times, so pass plain variables only.
+ * NOTE(review): mm256_rol_32 is a project helper, presumably a left
+ * rotate of every 32-bit element.
+ */
+#define L( a, b, c, d ) \
+do { \
+   a = mm256_rol_32( a, 13 ); \
+   c = mm256_rol_32( c,  3 ); \
+   b = _mm256_xor_si256( b, _mm256_xor_si256( a, c ) ); \
+   d = _mm256_xor_si256( d, _mm256_xor_si256( c, \
+                                              _mm256_slli_epi32( a, 3 ) ) ); \
+   b = mm256_rol_32( b, 1 ); \
+   d = mm256_rol_32( d, 7 ); \
+   a = _mm256_xor_si256( a, _mm256_xor_si256( b, d ) ); \
+   c = _mm256_xor_si256( c, _mm256_xor_si256( d, \
+                                              _mm256_slli_epi32( b, 7 ) ) ); \
+   a = mm256_rol_32( a,  5 ); \
+   c = mm256_rol_32( c, 22 ); \
+} while (0)
+
+/* Declares the eight working chaining-value vectors c0..c7. The trailing
+ * semicolon is part of the macro, so it is invoked without one. The last
+ * line deliberately has no line continuation, so the macro cannot absorb
+ * whatever source line follows it. */
+#define DECL_STATE_BIG \
+   __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+
+/* Loads the chaining value sc->h[0..7] into the working vectors c0..c7,
+ * which must already be declared in the enclosing scope. The argument is
+ * parenthesized so any pointer expression can be passed safely. */
+#define READ_STATE_BIG(sc) \
+do { \
+   c0 = (sc)->h[0x0]; \
+   c1 = (sc)->h[0x1]; \
+   c2 = (sc)->h[0x2]; \
+   c3 = (sc)->h[0x3]; \
+   c4 = (sc)->h[0x4]; \
+   c5 = (sc)->h[0x5]; \
+   c6 = (sc)->h[0x6]; \
+   c7 = (sc)->h[0x7]; \
+} while (0)
+
+/* Stores the working vectors c0..c7 back into the chaining value
+ * sc->h[0..7]. The argument is parenthesized so any pointer expression
+ * can be passed safely. */
+#define WRITE_STATE_BIG(sc) \
+do { \
+   (sc)->h[0x0] = c0; \
+   (sc)->h[0x1] = c1; \
+   (sc)->h[0x2] = c2; \
+   (sc)->h[0x3] = c3; \
+   (sc)->h[0x4] = c4; \
+   (sc)->h[0x5] = c5; \
+   (sc)->h[0x6] = c6; \
+   (sc)->h[0x7] = c7; \
+} while (0)
+/*
+ * State aliases: the sixteen state registers s0..sF used by ROUND_BIG and
+ * T_BIG are not separate variables. Each one names either an expanded
+ * message register (m0..m7) or a chaining register (c0..c7), in the
+ * interleaving listed below.
+ */
+#define s0   m0
+#define s1   c0
+#define s2   m1
+#define s3   c1
+#define s4   c2
+#define s5   m2
+#define s6   c3
+#define s7   m3
+#define s8   m4
+#define s9   c4
+#define sA   m5
+#define sB   c5
+#define sC   c6
+#define sD   m6
+#define sE   c7
+#define sF   m7
+/*
+ * ROUND_BIG( rc, alpha ): one round over the state registers s0..sF.
+ *  1. XOR the 32 constants alpha[0x00..0x1F] into the state, two words
+ *     per register repeated in all four 64-bit lanes; the round counter
+ *     rc is XORed into alpha[0x01] only.
+ *  2. Apply SBOX to the register groups (s0,s4,s8,sC) .. (s3,s7,sB,sF).
+ *  3. Apply L six times. Most operands are temporaries t0..t3 assembled
+ *     from 32-bit elements of two registers with byte shifts and blends;
+ *     after each L the results are blended back into the registers they
+ *     were taken from.
+ * `alpha` is indexed directly and both arguments are evaluated many
+ * times, so pass a plain array name and a side-effect free counter.
+ */
+#define ROUND_BIG(rc, alpha) \
+do { \
+  __m256i t0, t1, t2, t3; \
+  s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \
+        alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \
+        alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \
+  s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \
+                     alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \
+                     alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \
+  s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \
+                     alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \
+                     alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \
+  s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \
+                     alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \
+                     alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \
+  s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \
+                     alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \
+                     alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \
+  s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \
+                     alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \
+                     alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \
+  s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \
+                     alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \
+                     alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \
+  s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \
+                     alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \
+                     alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \
+  s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \
+                     alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \
+                     alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \
+  s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \
+                     alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \
+                     alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \
+  sA = _mm256_xor_si256( sA, _mm256_set_epi32( \
+                     alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \
+                     alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \
+  sB = _mm256_xor_si256( sB, _mm256_set_epi32( \
+                     alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \
+                     alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \
+  sC = _mm256_xor_si256( sC, _mm256_set_epi32( \
+                     alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \
+                     alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \
+  sD = _mm256_xor_si256( sD, _mm256_set_epi32( \
+                     alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \
+                     alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \
+  sE = _mm256_xor_si256( sE, _mm256_set_epi32( \
+                     alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \
+                     alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \
+  sF = _mm256_xor_si256( sF, _mm256_set_epi32( \
+                     alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \
+                     alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \
+  /* Substitution layer on the four register groups. */ \
+  SBOX( s0, s4, s8, sC ); \
+  SBOX( s1, s5, s9, sD ); \
+  SBOX( s2, s6, sA, sE ); \
+  SBOX( s3, s7, sB, sF ); \
+  /* L on s0, s9 and words gathered from s4/s5 and sD/sE. */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \
+                           _mm256_bslli_epi128( s5, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \
+                           _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
+  L( s0, t1, s9, t3 ); \
+  s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  /* L on s1, sA and words gathered from s5/s6 and sE/sF. */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
+                           _mm256_bslli_epi128( s6, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \
+                           _mm256_bslli_epi128( sF, 4 ), 0xAA ); \
+  L( s1, t1, sA, t3 ); \
+  s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  /* L on s2, sB and words gathered from s6/s7 and sF/sC. */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \
+                           _mm256_bslli_epi128( s7, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \
+                           _mm256_bslli_epi128( sC, 4 ), 0xAA ); \
+  L( s2, t1, sB, t3 ); \
+  s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  /* L on s3, s8 and words gathered from s7/s4 and sC/sD. */ \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \
+                           _mm256_bslli_epi128( s4, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \
+                           _mm256_bslli_epi128( sD, 4 ), 0xAA ); \
+  L( s3, t1, s8, t3 ); \
+  s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\
+  s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\
+  sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\
+  sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\
+  /* L across words gathered from s0/s8, s1/s9, s2/sA and s3/sB. */ \
+  t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \
+  t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \
+  t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \
+  t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \
+                           _mm256_bslli_epi128( sB, 4 ), 0xAA ); \
+  L( t0, t1, t2, t3 ); \
+  s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \
+  s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \
+  s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \
+  s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \
+  s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \
+  sA = _mm256_blend_epi32( sA, t2, 0xAA ); \
+  s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \
+  sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \
+  /* L across words gathered from s4/sC, s5/sD, s6/sE and s7/sF. */ \
+  t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \
+  t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \
+                           _mm256_bslli_epi128( sD, 4 ), 0xAA ); \
+  t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \
+  t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \
+  L( t0, t1, t2, t3 ); \
+  s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \
+  sC = _mm256_blend_epi32( sC, t0, 0xAA ); \
+  s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \
+  sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \
+  s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \
+  sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \
+  s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \
+  sF = _mm256_blend_epi32( sF, t3, 0xAA ); \
+} while (0)
+/*
+ * P_BIG: the regular permutation, six ROUND_BIG rounds with round
+ * counters 0..5 and the constant table alpha_n. Invoked by hamsi_big for
+ * every message block.
+ */
+#define P_BIG \
+do { \
+   ROUND_BIG(0, alpha_n); \
+   ROUND_BIG(1, alpha_n); \
+   ROUND_BIG(2, alpha_n); \
+   ROUND_BIG(3, alpha_n); \
+   ROUND_BIG(4, alpha_n); \
+   ROUND_BIG(5, alpha_n); \
+} while (0)
+/*
+ * PF_BIG: the final permutation, twelve ROUND_BIG rounds with round
+ * counters 0..11 and the constant table alpha_f. Invoked once by
+ * hamsi_big_final.
+ */
+#define PF_BIG \
+do { \
+   ROUND_BIG( 0, alpha_f); \
+   ROUND_BIG( 1, alpha_f); \
+   ROUND_BIG( 2, alpha_f); \
+   ROUND_BIG( 3, alpha_f); \
+   ROUND_BIG( 4, alpha_f); \
+   ROUND_BIG( 5, alpha_f); \
+   ROUND_BIG( 6, alpha_f); \
+   ROUND_BIG( 7, alpha_f); \
+   ROUND_BIG( 8, alpha_f); \
+   ROUND_BIG( 9, alpha_f); \
+   ROUND_BIG(10, alpha_f); \
+   ROUND_BIG(11, alpha_f); \
+} while (0)
+/*
+ * T_BIG: feed-forward. XORs state registers s0..s3 and s8..sB into the
+ * chaining value sc->h[0..7] and reloads c0..c7 with the new values.
+ * Several s registers are aliases of c registers (sB is c5, s9 is c4,
+ * s3 is c1, s1 is c0), so the assignments run from c7 down to c0: each
+ * c register is read through its alias before it is overwritten.
+ * Uses `sc` from the enclosing scope.
+ */
+#define T_BIG \
+do { /* order is important */ \
+   c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \
+   c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \
+   c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \
+   c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \
+   c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \
+   c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \
+   c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \
+   c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \
+} while (0)
+
+/*
+ * Compress `num` consecutive message blocks, one __m256i each, starting
+ * at buf, and fold them into the chaining value of sc.
+ *
+ * The message bit counter (count_high:count_low) is advanced by 64 bits
+ * per block before any block is processed.
+ */
+void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num )
+{
+   DECL_STATE_BIG
+   const sph_u32 added_bits = SPH_T32( (sph_u32)num << 6 );
+
+   // Low half first; a wrap-around of the low half carries into the high half.
+   sc->count_low = SPH_T32( sc->count_low + added_bits );
+   sc->count_high += (sph_u32)( (num >> 13) >> 13 );
+   if ( sc->count_low < added_bits )
+      sc->count_high++;
+
+   READ_STATE_BIG( sc );
+   for ( ; num > 0; num--, buf++ )
+   {
+      __m256i m0, m1, m2, m3, m4, m5, m6, m7;
+
+      INPUT_BIG;   // expand *buf into m0..m7
+      P_BIG;       // six-round permutation
+      T_BIG;       // feed-forward into sc->h and c0..c7
+   }
+   WRITE_STATE_BIG( sc );
+}
+/*
+ * Final compression: expands the single block at buf, runs the
+ * twelve-round permutation PF_BIG instead of P_BIG, and folds the result
+ * into the chaining value of sc. The bit counters are not modified here.
+ */
+void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf )
+{
+   __m256i m0, m1, m2, m3, m4, m5, m6, m7;
+   DECL_STATE_BIG
+   READ_STATE_BIG( sc );
+   INPUT_BIG;
+   PF_BIG;
+   T_BIG;
+   WRITE_STATE_BIG( sc );
+}
+
+/*
+ * Reset a context for a new message: clears the leftover length and the
+ * bit counters, and loads the chaining value from IV512. Each h[i] gets
+ * the word pair IV512[2*i], IV512[2*i+1] repeated in all four 64-bit
+ * lanes.
+ */
+void hamsi512_4way_init( hamsi_4way_big_context *sc )
+{
+   sc->partial_len = 0;
+   sc->count_low = 0;
+   sc->count_high = 0;
+
+   for ( int word = 0; word < 8; word++ )
+   {
+      const sph_u32 even = IV512[ 2*word ];
+      const sph_u32 odd  = IV512[ 2*word + 1 ];
+
+      sc->h[word] = _mm256_set_epi32( odd, even, odd, even,
+                                      odd, even, odd, even );
+   }
+}
+/*
+ * Absorb input into the context. One __m256i of data is consumed for
+ * every 8 units of len (presumably 8 bytes for each of the four lanes)
+ * and passed to hamsi_big.
+ * NOTE(review): a len that is not a multiple of 8 leaves a remainder
+ * that is recorded in partial_len but never copied into sc->buf; callers
+ * appear to be expected to pass multiples of 8 only, confirm.
+ */
+void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len )
+{
+   __m256i *vdata = (__m256i*)data;
+
+// Legacy partial-block path, kept disabled below. It would only matter if an
+// earlier call had left fewer than 8 bytes buffered, which the author judged
+// unlikely even for 80 byte inputs, so partial_len handling is deprecated.
+/*
+   if ( sc->partial_len != 0 )
+   {
+      size_t mlen;
+
+      mlen = 8 - sc->partial_len;
+      if ( len < mlen )
+      {
+         memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 );
+         sc->partial_len += len;
+         return;
+      }
+      else
+      {
+         memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 );
+         len -= mlen;
+         vdata += mlen>>3;
+         hamsi_big( sc, sc->partial, 1 );
+         sc->partial_len = 0;
+      }
+   }
+*/
+
+   hamsi_big( sc, vdata, len>>3 );   // one block per 8 units of len
+   vdata += ( (len& ~(size_t)7) >> 3 );   // step past the blocks just hashed
+   len &= (size_t)7;   // remainder, 0..7
+   memcpy_256( sc->buf, vdata, len>>3 );   // NOTE(review): len < 8 here, so the count is always 0
+   sc->partial_len = len;
+}
+
+/*
+ * Finish the hash and write the digest to dst as eight __m256i vectors.
+ *
+ * A padding block (0x80 in the low 32-bit word of every 64-bit lane, zero
+ * elsewhere) is compressed normally, then the big-endian encoded bit
+ * count is compressed with the final permutation. The chaining value is
+ * passed through mm256_bswap_32 on output.
+ * NOTE(review): partial_len is added to the encoded bit count, but
+ * sc->buf is overwritten by the padding block, so leftover input is not
+ * hashed here.
+ */
+void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst )
+{
+   __m256i *digest = (__m256i*)dst;
+   __m256i len_block[1];
+   int ch, cl;
+
+   // Length words captured before the padding block advances the counter.
+   sph_enc32be( &ch, sc->count_high );
+   sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) );
+   len_block[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch );
+
+   sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL,
+                                  0UL, 0x80UL, 0UL, 0x80UL );
+   hamsi_big( sc, sc->buf, 1 );
+   hamsi_big_final( sc, len_block );
+
+   for ( size_t i = 0; i < 8; i++ )
+      digest[i] = mm256_bswap_32( sc->h[i] );
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
--- a/algo/hamsi/hamsi-hash-4way.h
+++ b/algo/hamsi/hamsi-hash-4way.h
@@ -0,0 +1,72 @@
+/* $Id: sph_hamsi.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Hamsi interface. This code implements Hamsi with the recommended
+ * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     hamsi-hash-4way.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef HAMSI_4WAY_H__
+#define HAMSI_4WAY_H__
+
+#include <stddef.h>
+#include "algo/sha/sph_types.h"
+
+#if defined (__AVX2__)
+
+#include "avxdefs.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#define SPH_SIZE_hamsi512   512
+
+// Context for 4-way Hamsi-512. buf holds a single vector but is declared as an
+// array so it can be passed by pointer; partial_len is deprecated, still stored.
+typedef struct {
+   __m256i h[8];   // chaining value, eight vectors
+   __m256i buf[1];   // one-block scratch buffer; holds the padding block at close
+   size_t partial_len;   // remainder recorded by hamsi512_4way, 0..7
+   sph_u32 count_high, count_low;   // message bit counter, high and low halves
+} hamsi_4way_big_context;
+
+typedef hamsi_4way_big_context hamsi512_4way_context;
+/*
+ * 4-way Hamsi-512 API.
+ *   hamsi512_4way_init   resets the context for a new message.
+ *   hamsi512_4way        absorbs len units of data, one __m256i per 8.
+ *   hamsi512_4way_close  pads, finalizes and writes eight __m256i of
+ *                        digest to dst.
+ */
+void hamsi512_4way_init( hamsi512_4way_context *sc );
+void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len );
+void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
--- a/algo/haval/haval-4way-helper.c
+++ b/algo/haval/haval-4way-helper.c
@@ -0,0 +1,115 @@
+/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */
+/*
+ * Helper code, included (three times !) by HAVAL implementation.
+ *
+ * TODO: try to merge this with md_helper.c.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+/*
+ * Two-level token pasting: SPH_XCAT forwards to SPH_XCAT_, so arguments
+ * such as PASSES are macro-expanded before being joined with ##. Both
+ * are undefined first because this helper is included more than once.
+ */
+#undef SPH_XCAT
+#define SPH_XCAT(a, b)    SPH_XCAT_(a, b)
+#undef SPH_XCAT_
+#define SPH_XCAT_(a, b)   a ## b
+/*
+ * haval<PASSES>_4way: absorb len bytes of input into the context.
+ * Data is buffered in sc->buf up to 128 bytes, tracked by the low 7 bits
+ * of count_low; every time the buffer fills, one CORE<PASSES>
+ * compression is run on it. The byte counter count_high:count_low is
+ * advanced by every chunk.
+ * NOTE(review): presumably each __m128i carries one 32-bit word from
+ * each of four lanes, which is why byte offsets are shifted right by 2
+ * to index vectors. A len that is not a multiple of 4 would advance the
+ * counters by more than is copied; confirm callers never do that.
+ * DSTATE, RSTATE, WSTATE, IN_PREPARE, INW and CORE<PASSES> are defined
+ * by the including file.
+ */
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way)
+( haval_4way_context *sc, const void *data, size_t len )
+{
+   __m128i *vdata = (__m128i*)data;
+   unsigned current;
+
+   current = (unsigned)sc->count_low & 127U;   // bytes already buffered
+   while ( len > 0 )
+   {
+      unsigned clen;
+      sph_u32 clow, clow2;
+
+      clen = 128U - current;   // room left in the buffer
+      if ( clen > len )
+         clen = len;
+      memcpy_128( sc->buf + (current>>2), vdata, clen>>2 );
+      vdata += clen>>2;
+      current += clen;
+      len -= clen;
+      if ( current == 128U )   // buffer full: compress it
+      {
+         DSTATE;
+         IN_PREPARE(sc->buf);
+         RSTATE;
+         SPH_XCAT(CORE, PASSES)(INW);
+         WSTATE;
+         current = 0;
+      }
+      clow = sc->count_low;
+      clow2 = SPH_T32(clow + clen);
+      sc->count_low = clow2;
+      if ( clow2 < clow )   // low word wrapped: carry into the high word
+         sc->count_high ++;
+   }
+}
+/*
+ * haval<PASSES>_4way_close: pad the buffered data, run the last
+ * compression(s) and emit the digest through haval_4way_out.
+ * Padding layout in sc->buf, by byte offset: a marker word at the
+ * current position, zeros up to offset 116, a word at 116 combining
+ * 0x01 | (PASSES << 3) and (olen << 3), then the message bit count in the
+ * words at 120 (low) and 124 (high). If the marker word leaves no room
+ * for those 12 trailer bytes, the buffer is zero-filled and compressed
+ * first, and the trailer goes into a fresh block.
+ * NOTE(review): m128_one_32 is a project constant, presumably 1 in every
+ * 32-bit element.
+ */
+static void
+SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc,
+                                                void *dst)
+{
+   unsigned current;
+   DSTATE;
+
+   current = (unsigned)sc->count_low & 127UL;   // bytes already buffered
+
+   sc->buf[ current>>2 ] = m128_one_32;
+   current += 4;   /* the marker occupies one whole word */
+   RSTATE;
+   if ( current > 116UL )   // no room left for the trailer in this block
+   {
+      memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 );
+      do
+      {
+         IN_PREPARE(sc->buf);
+         SPH_XCAT(CORE, PASSES)(INW);
+      } while (0);
+      current = 0;
+   }
+
+   uint32_t t1, t2;
+   memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 );
+   t1 = 0x01 | (PASSES << 3);
+   t2 = sc->olen << 3;
+   sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) );
+   sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 );   // bit count, low word
+   sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3)
+                                     | (sc->count_low >> 29) );   // bit count, high word
+   do
+   {
+      IN_PREPARE(sc->buf);
+      SPH_XCAT(CORE, PASSES)(INW);
+   } while (0);
+   WSTATE;
+   haval_4way_out( sc, dst );
+}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	9edc650042	v3.8.7.2	2018-04-11 13:44:26 -04:00
Jay D Dee	218cef337a	v3.8.7.1	2018-04-10 21:49:06 -04:00
Jay D Dee	9ffce7bdb7	v3.8.7	2018-04-09 19:14:38 -04:00
Jay D Dee	c7efa50aad	v3.8.6.1	2018-04-06 11:42:01 -04:00
Jay D Dee	dd5e552357	v3.8.6	2018-03-31 12:50:52 -04:00
Jay D Dee	f449c6725f	v3.8.5	2018-03-27 20:20:05 -04:00
Jay D Dee	3363d61524	v3.8.4.1	2018-03-22 14:28:03 -04:00
Jay D Dee	20fe05054c	v3.8.4	2018-03-18 12:51:03 -04:00
Jay D Dee	157508bd07	v3.8.3.3	2018-02-25 14:15:07 -05:00
Jay D Dee	c24a4bdbc2	v3.8.3.2	2018-02-24 14:36:19 -05:00
Jay D Dee	59c7848d91	v3.8.3.1	2018-02-23 15:45:32 -05:00
Jay D Dee	3c02653dbe	v3.8.3	2018-02-23 12:39:15 -05:00
Jay D Dee	502ed0b1fe	v3.8.2.1	2018-02-17 13:52:24 -05:00
Jay D Dee	d60a268972	v3.8.2	2018-02-15 14:48:50 -05:00
Jay D Dee	e4265a6f11	v3.8.1.1	2018-02-09 23:30:14 -05:00
Jay D Dee	a28daca3ce	v3.8.1	2018-02-07 16:38:45 -05:00
Jay D Dee	54b8fd7362	v3.8.0.1	2018-02-05 22:10:18 -05:00
Jay D Dee	ad2275f74a	v3.8.0	2018-01-23 21:02:16 -05:00
Jay D Dee	a90d75b8f5	v3.7.10	2018-01-16 15:11:44 -05:00