Compare commits

...

5 Commits

Author      SHA1         Message    Date
Jay D Dee   bfd1c002f9   v3.8.8.1   2018-05-11 11:52:36 -04:00
Jay D Dee   9edc650042   v3.8.7.2   2018-04-11 13:44:26 -04:00
Jay D Dee   218cef337a   v3.8.7.1   2018-04-10 21:49:06 -04:00
Jay D Dee   9ffce7bdb7   v3.8.7     2018-04-09 19:14:38 -04:00
Jay D Dee   c7efa50aad   v3.8.6.1   2018-04-06 11:42:01 -04:00
46 changed files with 2847 additions and 1580 deletions


@@ -29,3 +29,5 @@ Wolf0
Optiminer
Jay D Dee
xcouiz@gmail.com

README.md

@@ -7,6 +7,11 @@ All of the code is believed to be open and free. If anyone has a
claim to any of it post your case in the cpuminer-opt Bitcoin Talk forum
or by email.
+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive, they are flagged simply because they are cryptocurrency
+miners. The source code is open for anyone to inspect. If you don't trust
+the software, don't use it.
https://bitcointalk.org/index.php?topic=1326803.0
mailto://jayddee246@gmail.com
@@ -40,82 +45,84 @@ MacOS, OSx and Android are not supported.
Supported Algorithms
--------------------
-allium          Garlicoin
-anime           Animecoin
-argon2          Argon2 coin (AR2)
-argon2d-crds    Credits (CRDS)
-argon2d-dyn     Dynamic (DYN)
-axiom           Shabal-256 MemoHash
+allium          Garlicoin
+anime           Animecoin
+argon2          Argon2 coin (AR2)
+argon2d250      argon2d-crds, Credits (CRDS)
+argon2d500      argon2d-dyn, Dynamic (DYN)
+argon2d4096     argon2d-uis, Unitus (UIS)
+axiom           Shabal-256 MemoHash
 bastion
-blake           Blake-256 (SFR)
-blakecoin       blake256r8
-blake2s         Blake-2 S
-bmw             BMW 256
-c11             Chaincoin
-cryptolight     Cryptonight-light
-cryptonight     cryptonote, Monero (XMR)
+blake           Blake-256 (SFR)
+blakecoin       blake256r8
+blake2s         Blake-2 S
+bmw             BMW 256
+c11             Chaincoin
+cryptolight     Cryptonight-light
+cryptonight
+cryptonightv7   Monero (XMR)
 decred
-deep            Deepcoin (DCN)
-dmd-gr          Diamond-Groestl
-drop            Dropcoin
-fresh           Fresh
-groestl         Groestl coin
-heavy           Heavy
-hmq1725         Espers
-hodl            Hodlcoin
-jha             Jackpotcoin
-keccak          Maxcoin
-keccakc         Creative coin
-lbry            LBC, LBRY Credits
-luffa           Luffa
-lyra2h          Hppcoin
-lyra2re         lyra2
-lyra2rev2       lyra2v2, Vertcoin
-lyra2z          Zcoin (XZC)
-lyra2z330       Lyra2 330 rows, Zoin (ZOI)
-m7m             Magi (XMG)
-myr-gr          Myriad-Groestl
-neoscrypt       NeoScrypt(128, 2, 1)
-nist5           Nist5
-pentablake      Pentablake
-phi1612         phi, LUX coin
-pluck           Pluck:128 (Supcoin)
-polytimos       Ninja
-quark           Quark
-qubit           Qubit
-scrypt          scrypt(1024, 1, 1) (default)
-scrypt:N        scrypt(N, 1, 1)
+deep            Deepcoin (DCN)
+dmd-gr          Diamond-Groestl
+drop            Dropcoin
+fresh           Fresh
+groestl         Groestl coin
+heavy           Heavy
+hmq1725         Espers
+hodl            Hodlcoin
+jha             Jackpotcoin
+keccak          Maxcoin
+keccakc         Creative coin
+lbry            LBC, LBRY Credits
+luffa           Luffa
+lyra2h          Hppcoin
+lyra2re         lyra2
+lyra2rev2       lyra2v2, Vertcoin
+lyra2z          Zcoin (XZC)
+lyra2z330       Lyra2 330 rows, Zoin (ZOI)
+m7m             Magi (XMG)
+myr-gr          Myriad-Groestl
+neoscrypt       NeoScrypt(128, 2, 1)
+nist5           Nist5
+pentablake      Pentablake
+phi1612         phi, LUX coin
+pluck           Pluck:128 (Supcoin)
+polytimos       Ninja
+quark           Quark
+qubit           Qubit
+scrypt          scrypt(1024, 1, 1) (default)
+scrypt:N        scrypt(N, 1, 1)
 scryptjane:nf
-sha256d         Double SHA-256
-sha256t         Triple SHA-256, Onecoin (OC)
-shavite3        Shavite3
-skein           Skein+Sha (Skeincoin)
-skein2          Double Skein (Woodcoin)
-skunk           Signatum (SIGT)
-timetravel      Machinecoin (MAC)
-timetravel10    Bitcore
-tribus          Denarius (DNR)
-vanilla         blake256r8vnl (VCash)
-veltor          (VLT)
+sha256d         Double SHA-256
+sha256t         Triple SHA-256, Onecoin (OC)
+shavite3        Shavite3
+skein           Skein+Sha (Skeincoin)
+skein2          Double Skein (Woodcoin)
+skunk           Signatum (SIGT)
+timetravel      Machinecoin (MAC)
+timetravel10    Bitcore
+tribus          Denarius (DNR)
+vanilla         blake256r8vnl (VCash)
+veltor          (VLT)
 whirlpool
 whirlpoolx
-x11             Dash
-x11evo          Revolvercoin
-x11gost         sib (SibCoin)
-x12             Galaxie Cash (GCH)
-x13             X13
-x13sm3          hsr (Hshare)
-x14             X14
-x15             X15
-x16r            Ravencoin (RVN)
-x16s            pigeoncoin (PGN)
+x11             Dash
+x11evo          Revolvercoin
+x11gost         sib (SibCoin)
+x12             Galaxie Cash (GCH)
+x13             X13
+x13sm3          hsr (Hshare)
+x14             X14
+x15             X15
+x16r            Ravencoin (RVN)
+x16s            pigeoncoin (PGN)
 x17
-xevan           Bitsend (BSD)
-yescrypt        Globalboost-Y (BSTY)
-yescryptr8      BitZeny (ZNY)
-yescryptr16     Yenten (YTN)
-yescryptr32     WAVI
-zr5             Ziftr
+xevan           Bitsend (BSD)
+yescrypt        Globalboost-Y (BSTY)
+yescryptr8      BitZeny (ZNY)
+yescryptr16     Yenten (YTN)
+yescryptr32     WAVI
+zr5             Ziftr
Errata
------


@@ -4,6 +4,11 @@ for Linux and Windows can be found in RELEASE_NOTES.
cpuminer is a console program that is executed from a DOS command prompt.
There is no GUI and no mouse support.
+Miner programs are often flagged as malware by antivirus programs. This is
+a false positive, they are flagged simply because they are cryptocurrency
+miners. The source code is open for anyone to inspect. If you don't trust
+the software, don't use it.
Choose the exe that best matches your CPU's features or use trial and
error to find the fastest one that doesn't crash. Pay attention to
the features listed at cpuminer startup to ensure you are mining at
@@ -16,14 +21,16 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
supported by cpuminer-opt due to an incompatible implementation of SSE2 on
these CPUs. Some algos may crash the miner with an invalid instruction.
Users are recommended to use an unoptimized miner such as cpuminer-multi.
-Changes in v3.8.4 may have improved compatibility with some of these CPUs.
-Exe name                 Compile flags              Arch name
-cpuminer-sse2.exe        "-msse2"                   Core2, Nehalem
-cpuminer-aes-sse42.exe   "-maes -msse4.2"           Westmere
-cpuminer-aes-avx.exe     "-march=corei7-avx"        Sandybridge, Ivybridge
-cpuminer-avx2.exe        "-march=core-avx2"         Haswell...
-cpuminer-avx2-sha.exe    "-march=core-avx2 -msha"   Ryzen
+Exe name                 Compile flags              Arch name
+cpuminer-sse2.exe        "-msse2"                   Core2, Nehalem
+cpuminer-aes-sse42.exe   "-march=westmere"          Westmere, Sandy-Ivybridge
+cpuminer-avx.exe         "-march=corei7-avx"        Sandy-Ivybridge
+cpuminer-avx2.exe        "-march=core-avx2"         Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx2-sha.exe    "-march=core-avx2 -msha"   Ryzen
If you like this software feel free to donate:


@@ -13,11 +13,11 @@ Security warning
----------------
Miner programs are often flagged as malware by antivirus programs. This is
-a false positive, they are flagged simply because they are miners. The source
-code is open for anyone to inspect. If you don't trust the software, don't use
-it.
+a false positive, they are flagged simply because they are cryptocurrency
+miners. The source code is open for anyone to inspect. If you don't trust
+the software, don't use it.
-The cryptographic code has been taken from trusted sources but has been
+The cryptographic hashing code has been taken from trusted sources but has been
modified for speed at the expense of accepted security practices. This
code should not be imported into applications where secure cryptography is
required.
@@ -50,8 +50,7 @@ will give a clue as to the missing package.
The following command should install everything you need on Debian based
distributions such as Ubuntu:
-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake
+sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev
build-essential (for Ubuntu, Development Tools package group on Fedora)
automake
@@ -81,12 +80,12 @@ cd cpuminer-opt-x.y.z
Run ./build.sh to build on Linux or execute the following commands.
./autogen.sh
-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
make
Additional optional compile flags, add the following to CFLAGS to activate:
-DUSE_SPH_SHA
-DUSE_SPH_SHA (deprecated)
SPH may give slightly better performance on algos that use sha256 when using
openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
@@ -160,6 +159,40 @@ Support for even older x86_64 without AES_NI or SSE2 is not available.
Change Log
----------
v3.8.8.1
Fixed x16r.
Removed cryptonight variant check due to false positives.
API displays hashrate before shares are submitted.
v3.8.8
Added cryptonightv7 for Monero.
v3.8.7.2
Fixed argon2d-dyn regression in v3.8.7.1.
Changed compile options for aes-sse42 Windows build to -march=westmere.
v3.8.7.1
Fixed argon2d-uis low difficulty rejects.
Fixed argon2d aliases.
v3.8.7
Added argon2d4096 (alias argon2d-uis) for Unitus (UIS).
argon2d-crds and argon2d-dyn renamed to argon2d250 and argon2d500 respectively.
The old names are recognized as aliases.
AVX512 is now supported for argon2d algos, Linux only.
AVX is no longer a reported feature and an AVX Windows binary is no longer
provided. Use AES-SSE42 build instead.
v3.8.6.1
Faster argon2d* AVX2.
Untested AVX-512 for argon2d*, YMMV.
v3.8.6
Fixed argon2 regression in v3.8.5.

aclocal.m4

@@ -1,6 +1,6 @@
# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
# generated automatically by aclocal 1.15.1 -*- Autoconf -*-
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
# Copyright (C) 1996-2017 Free Software Foundation, Inc.
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf. It may work, but is not guaranteed to.
If you have problems, you may need to regenerate the build system entirely.
To do so, use the procedure documented by the package, typically 'autoreconf'.])])
# Copyright (C) 2002-2013 Free Software Foundation, Inc.
# Copyright (C) 2002-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
# generated from the m4 files accompanying Automake X.Y.
# (This private macro should not be called outside this file.)
AC_DEFUN([AM_AUTOMAKE_VERSION],
[am__api_version='1.14'
[am__api_version='1.15'
dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
dnl require some minimum version. Point them to the right macro.
m4_if([$1], [1.14.1], [],
m4_if([$1], [1.15.1], [],
[AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
])
@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
# This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
[AM_AUTOMAKE_VERSION([1.14.1])dnl
[AM_AUTOMAKE_VERSION([1.15.1])dnl
m4_ifndef([AC_AUTOCONF_VERSION],
[m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
_AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])
# Figure out how to run the assembler. -*- Autoconf -*-
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
# AM_AUX_DIR_EXPAND -*- Autoconf -*-
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -123,15 +123,14 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
# configured tree to be moved without reconfiguration.
AC_DEFUN([AM_AUX_DIR_EXPAND],
[dnl Rely on autoconf to set up CDPATH properly.
AC_PREREQ([2.50])dnl
# expand $ac_aux_dir to an absolute path
am_aux_dir=`cd $ac_aux_dir && pwd`
[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
# Expand $ac_aux_dir to an absolute path.
am_aux_dir=`cd "$ac_aux_dir" && pwd`
])
# AM_CONDITIONAL -*- Autoconf -*-
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
# Copyright (C) 1997-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -162,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
Usually this means the macro was only invoked conditionally.]])
fi])])
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
# Copyright (C) 1999-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -353,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
# Generate code to set up dependency tracking. -*- Autoconf -*-
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
# Copyright (C) 1999-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -429,7 +428,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
# Do all the work for Automake. -*- Autoconf -*-
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
# Copyright (C) 1996-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -519,8 +518,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
# We need awk for the "check" target. The system "awk" is bad on
# some platforms.
# We need awk for the "check" target (and possibly the TAP driver). The
# system "awk" is bad on some platforms.
AC_REQUIRE([AC_PROG_AWK])dnl
AC_REQUIRE([AC_PROG_MAKE_SET])dnl
AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@@ -593,7 +592,11 @@ to "yes", and re-run configure.
END
AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
fi
fi])
fi
dnl The trailing newline in this macro's definition is deliberate, for
dnl backward compatibility and to allow trailing 'dnl'-style comments
dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
])
dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion. Do not
dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@@ -622,7 +625,7 @@ for _am_header in $config_headers :; do
done
echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -633,7 +636,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
# Define $install_sh.
AC_DEFUN([AM_PROG_INSTALL_SH],
[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
if test x"${install_sh}" != xset; then
if test x"${install_sh+set}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -643,7 +646,7 @@ if test x"${install_sh}" != xset; then
fi
AC_SUBST([install_sh])])
# Copyright (C) 2003-2013 Free Software Foundation, Inc.
# Copyright (C) 2003-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -665,7 +668,7 @@ AC_SUBST([am__leading_dot])])
# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
# From Jim Meyering
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
# Copyright (C) 1996-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -700,7 +703,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
# Check to see how 'make' treats includes. -*- Autoconf -*-
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -750,7 +753,7 @@ rm -f confinc confmf
# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*-
# Copyright (C) 1997-2013 Free Software Foundation, Inc.
# Copyright (C) 1997-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -789,7 +792,7 @@ fi
# Helper functions for option handling. -*- Autoconf -*-
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -818,7 +821,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
AC_DEFUN([_AM_IF_OPTION],
[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])
# Copyright (C) 1999-2013 Free Software Foundation, Inc.
# Copyright (C) 1999-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -865,7 +868,7 @@ AC_LANG_POP([C])])
# For backward compatibility.
AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -884,7 +887,7 @@ AC_DEFUN([AM_RUN_LOG],
# Check to make sure that the build environment is sane. -*- Autoconf -*-
# Copyright (C) 1996-2013 Free Software Foundation, Inc.
# Copyright (C) 1996-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -965,7 +968,7 @@ AC_CONFIG_COMMANDS_PRE(
rm -f conftest.file
])
# Copyright (C) 2009-2013 Free Software Foundation, Inc.
# Copyright (C) 2009-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1025,7 +1028,7 @@ AC_SUBST([AM_BACKSLASH])dnl
_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
])
# Copyright (C) 2001-2013 Free Software Foundation, Inc.
# Copyright (C) 2001-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1053,7 +1056,7 @@ fi
INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
AC_SUBST([INSTALL_STRIP_PROGRAM])])
# Copyright (C) 2006-2013 Free Software Foundation, Inc.
# Copyright (C) 2006-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
@@ -1072,7 +1075,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
# Check how to create a tarball. -*- Autoconf -*-
# Copyright (C) 2004-2013 Free Software Foundation, Inc.
# Copyright (C) 2004-2017 Free Software Foundation, Inc.
#
# This file is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,


@@ -157,81 +157,83 @@ bool register_algo_gate( int algo, algo_gate_t *gate )
switch (algo)
{
-      case ALGO_ALLIUM:        register_allium_algo        ( gate ); break;
-      case ALGO_ANIME:         register_anime_algo         ( gate ); break;
-      case ALGO_ARGON2:        register_argon2_algo        ( gate ); break;
-      case ALGO_ARGON2DCRDS:   register_argon2d_crds_algo  ( gate ); break;
-      case ALGO_ARGON2DDYN:    register_argon2d_dyn_algo   ( gate ); break;
-      case ALGO_AXIOM:         register_axiom_algo         ( gate ); break;
-      case ALGO_BASTION:       register_bastion_algo       ( gate ); break;
-      case ALGO_BLAKE:         register_blake_algo         ( gate ); break;
-      case ALGO_BLAKECOIN:     register_blakecoin_algo     ( gate ); break;
+      case ALGO_ALLIUM:        register_allium_algo        ( gate ); break;
+      case ALGO_ANIME:         register_anime_algo         ( gate ); break;
+      case ALGO_ARGON2:        register_argon2_algo        ( gate ); break;
+      case ALGO_ARGON2D250:    register_argon2d_crds_algo  ( gate ); break;
+      case ALGO_ARGON2D500:    register_argon2d_dyn_algo   ( gate ); break;
+      case ALGO_ARGON2D4096:   register_argon2d4096_algo   ( gate ); break;
+      case ALGO_AXIOM:         register_axiom_algo         ( gate ); break;
+      case ALGO_BASTION:       register_bastion_algo       ( gate ); break;
+      case ALGO_BLAKE:         register_blake_algo         ( gate ); break;
+      case ALGO_BLAKECOIN:     register_blakecoin_algo     ( gate ); break;
 //    case ALGO_BLAKE2B:       register_blake2b_algo       ( gate ); break;
-      case ALGO_BLAKE2S:       register_blake2s_algo       ( gate ); break;
-      case ALGO_C11:           register_c11_algo           ( gate ); break;
-      case ALGO_CRYPTOLIGHT:   register_cryptolight_algo   ( gate ); break;
-      case ALGO_CRYPTONIGHT:   register_cryptonight_algo   ( gate ); break;
-      case ALGO_DECRED:        register_decred_algo        ( gate ); break;
-      case ALGO_DEEP:          register_deep_algo          ( gate ); break;
-      case ALGO_DMD_GR:        register_dmd_gr_algo        ( gate ); break;
-      case ALGO_DROP:          register_drop_algo          ( gate ); break;
-      case ALGO_FRESH:         register_fresh_algo         ( gate ); break;
-      case ALGO_GROESTL:       register_groestl_algo       ( gate ); break;
-      case ALGO_HEAVY:         register_heavy_algo         ( gate ); break;
-      case ALGO_HMQ1725:       register_hmq1725_algo       ( gate ); break;
-      case ALGO_HODL:          register_hodl_algo          ( gate ); break;
-      case ALGO_JHA:           register_jha_algo           ( gate ); break;
-      case ALGO_KECCAK:        register_keccak_algo        ( gate ); break;
-      case ALGO_KECCAKC:       register_keccakc_algo       ( gate ); break;
-      case ALGO_LBRY:          register_lbry_algo          ( gate ); break;
-      case ALGO_LUFFA:         register_luffa_algo         ( gate ); break;
-      case ALGO_LYRA2H:        register_lyra2h_algo        ( gate ); break;
-      case ALGO_LYRA2RE:       register_lyra2re_algo       ( gate ); break;
-      case ALGO_LYRA2REV2:     register_lyra2rev2_algo     ( gate ); break;
-      case ALGO_LYRA2Z:        register_lyra2z_algo        ( gate ); break;
-      case ALGO_LYRA2Z330:     register_lyra2z330_algo     ( gate ); break;
-      case ALGO_M7M:           register_m7m_algo           ( gate ); break;
-      case ALGO_MYR_GR:        register_myriad_algo        ( gate ); break;
-      case ALGO_NEOSCRYPT:     register_neoscrypt_algo     ( gate ); break;
-      case ALGO_NIST5:         register_nist5_algo         ( gate ); break;
-      case ALGO_PENTABLAKE:    register_pentablake_algo    ( gate ); break;
-      case ALGO_PHI1612:       register_phi1612_algo       ( gate ); break;
-      case ALGO_PLUCK:         register_pluck_algo         ( gate ); break;
-      case ALGO_POLYTIMOS:     register_polytimos_algo     ( gate ); break;
-      case ALGO_QUARK:         register_quark_algo         ( gate ); break;
-      case ALGO_QUBIT:         register_qubit_algo         ( gate ); break;
-      case ALGO_SCRYPT:        register_scrypt_algo        ( gate ); break;
-      case ALGO_SCRYPTJANE:    register_scryptjane_algo    ( gate ); break;
-      case ALGO_SHA256D:       register_sha256d_algo       ( gate ); break;
-      case ALGO_SHA256T:       register_sha256t_algo       ( gate ); break;
-      case ALGO_SHAVITE3:      register_shavite_algo       ( gate ); break;
-      case ALGO_SKEIN:         register_skein_algo         ( gate ); break;
-      case ALGO_SKEIN2:        register_skein2_algo        ( gate ); break;
-      case ALGO_SKUNK:         register_skunk_algo         ( gate ); break;
-      case ALGO_TIMETRAVEL:    register_timetravel_algo    ( gate ); break;
-      case ALGO_TIMETRAVEL10:  register_timetravel10_algo  ( gate ); break;
-      case ALGO_TRIBUS:        register_tribus_algo        ( gate ); break;
-      case ALGO_VANILLA:       register_vanilla_algo       ( gate ); break;
-      case ALGO_VELTOR:        register_veltor_algo        ( gate ); break;
-      case ALGO_WHIRLPOOL:     register_whirlpool_algo     ( gate ); break;
-      case ALGO_WHIRLPOOLX:    register_whirlpoolx_algo    ( gate ); break;
-      case ALGO_X11:           register_x11_algo           ( gate ); break;
-      case ALGO_X11EVO:        register_x11evo_algo        ( gate ); break;
-      case ALGO_X11GOST:       register_x11gost_algo       ( gate ); break;
-      case ALGO_X12:           register_x12_algo           ( gate ); break;
-      case ALGO_X13:           register_x13_algo           ( gate ); break;
-      case ALGO_X13SM3:        register_x13sm3_algo        ( gate ); break;
-      case ALGO_X14:           register_x14_algo           ( gate ); break;
-      case ALGO_X15:           register_x15_algo           ( gate ); break;
-      case ALGO_X16R:          register_x16r_algo          ( gate ); break;
-      case ALGO_X16S:          register_x16s_algo          ( gate ); break;
-      case ALGO_X17:           register_x17_algo           ( gate ); break;
-      case ALGO_XEVAN:         register_xevan_algo         ( gate ); break;
-      case ALGO_YESCRYPT:      register_yescrypt_algo      ( gate ); break;
-      case ALGO_YESCRYPTR8:    register_yescryptr8_algo    ( gate ); break;
-      case ALGO_YESCRYPTR16:   register_yescryptr16_algo   ( gate ); break;
-      case ALGO_YESCRYPTR32:   register_yescryptr32_algo   ( gate ); break;
-      case ALGO_ZR5:           register_zr5_algo           ( gate ); break;
+      case ALGO_BLAKE2S:       register_blake2s_algo       ( gate ); break;
+      case ALGO_C11:           register_c11_algo           ( gate ); break;
+      case ALGO_CRYPTOLIGHT:   register_cryptolight_algo   ( gate ); break;
+      case ALGO_CRYPTONIGHT:   register_cryptonight_algo   ( gate ); break;
+      case ALGO_CRYPTONIGHTV7: register_cryptonightv7_algo ( gate ); break;
+      case ALGO_DECRED:        register_decred_algo        ( gate ); break;
+      case ALGO_DEEP:          register_deep_algo          ( gate ); break;
+      case ALGO_DMD_GR:        register_dmd_gr_algo        ( gate ); break;
+      case ALGO_DROP:          register_drop_algo          ( gate ); break;
+      case ALGO_FRESH:         register_fresh_algo         ( gate ); break;
+      case ALGO_GROESTL:       register_groestl_algo       ( gate ); break;
+      case ALGO_HEAVY:         register_heavy_algo         ( gate ); break;
+      case ALGO_HMQ1725:       register_hmq1725_algo       ( gate ); break;
+      case ALGO_HODL:          register_hodl_algo          ( gate ); break;
+      case ALGO_JHA:           register_jha_algo           ( gate ); break;
+      case ALGO_KECCAK:        register_keccak_algo        ( gate ); break;
+      case ALGO_KECCAKC:       register_keccakc_algo       ( gate ); break;
+      case ALGO_LBRY:          register_lbry_algo          ( gate ); break;
+      case ALGO_LUFFA:         register_luffa_algo         ( gate ); break;
+      case ALGO_LYRA2H:        register_lyra2h_algo        ( gate ); break;
+      case ALGO_LYRA2RE:       register_lyra2re_algo       ( gate ); break;
+      case ALGO_LYRA2REV2:     register_lyra2rev2_algo     ( gate ); break;
+      case ALGO_LYRA2Z:        register_lyra2z_algo        ( gate ); break;
+      case ALGO_LYRA2Z330:     register_lyra2z330_algo     ( gate ); break;
+      case ALGO_M7M:           register_m7m_algo           ( gate ); break;
+      case ALGO_MYR_GR:        register_myriad_algo        ( gate ); break;
+      case ALGO_NEOSCRYPT:     register_neoscrypt_algo     ( gate ); break;
+      case ALGO_NIST5:         register_nist5_algo         ( gate ); break;
+      case ALGO_PENTABLAKE:    register_pentablake_algo    ( gate ); break;
+      case ALGO_PHI1612:       register_phi1612_algo       ( gate ); break;
+      case ALGO_PLUCK:         register_pluck_algo         ( gate ); break;
+      case ALGO_POLYTIMOS:     register_polytimos_algo     ( gate ); break;
+      case ALGO_QUARK:         register_quark_algo         ( gate ); break;
+      case ALGO_QUBIT:         register_qubit_algo         ( gate ); break;
+      case ALGO_SCRYPT:        register_scrypt_algo        ( gate ); break;
+      case ALGO_SCRYPTJANE:    register_scryptjane_algo    ( gate ); break;
+      case ALGO_SHA256D:       register_sha256d_algo       ( gate ); break;
+      case ALGO_SHA256T:       register_sha256t_algo       ( gate ); break;
+      case ALGO_SHAVITE3:      register_shavite_algo       ( gate ); break;
+      case ALGO_SKEIN:         register_skein_algo         ( gate ); break;
+      case ALGO_SKEIN2:        register_skein2_algo        ( gate ); break;
+      case ALGO_SKUNK:         register_skunk_algo         ( gate ); break;
+      case ALGO_TIMETRAVEL:    register_timetravel_algo    ( gate ); break;
+      case ALGO_TIMETRAVEL10:  register_timetravel10_algo  ( gate ); break;
+      case ALGO_TRIBUS:        register_tribus_algo        ( gate ); break;
+      case ALGO_VANILLA:       register_vanilla_algo       ( gate ); break;
+      case ALGO_VELTOR:        register_veltor_algo        ( gate ); break;
+      case ALGO_WHIRLPOOL:     register_whirlpool_algo     ( gate ); break;
+      case ALGO_WHIRLPOOLX:    register_whirlpoolx_algo    ( gate ); break;
+      case ALGO_X11:           register_x11_algo           ( gate ); break;
+      case ALGO_X11EVO:        register_x11evo_algo        ( gate ); break;
+      case ALGO_X11GOST:       register_x11gost_algo       ( gate ); break;
+      case ALGO_X12:           register_x12_algo           ( gate ); break;
+      case ALGO_X13:           register_x13_algo           ( gate ); break;
+      case ALGO_X13SM3:        register_x13sm3_algo        ( gate ); break;
+      case ALGO_X14:           register_x14_algo           ( gate ); break;
+      case ALGO_X15:           register_x15_algo           ( gate ); break;
+      case ALGO_X16R:          register_x16r_algo          ( gate ); break;
+      case ALGO_X16S:          register_x16s_algo          ( gate ); break;
+      case ALGO_X17:           register_x17_algo           ( gate ); break;
+      case ALGO_XEVAN:         register_xevan_algo         ( gate ); break;
+      case ALGO_YESCRYPT:      register_yescrypt_algo      ( gate ); break;
+      case ALGO_YESCRYPTR8:    register_yescryptr8_algo    ( gate ); break;
+      case ALGO_YESCRYPTR16:   register_yescryptr16_algo   ( gate ); break;
+      case ALGO_YESCRYPTR32:   register_yescryptr32_algo   ( gate ); break;
+      case ALGO_ZR5:           register_zr5_algo           ( gate ); break;
default:
applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
return false;
@@ -288,6 +290,9 @@ void exec_hash_function( int algo, void *output, const void *pdata )
const char* const algo_alias_map[][2] =
{
// alias proper
+  { "argon2d-crds", "argon2d250"  },
+  { "argon2d-dyn",  "argon2d500"  },
+  { "argon2d-uis",  "argon2d4096" },
{ "bitcore", "timetravel10" },
{ "bitzeny", "yescryptr8" },
{ "blake256r8", "blakecoin" },


@@ -2,6 +2,8 @@
#include <stdbool.h>
#include <stdint.h>
#include "miner.h"
+#include "avxdefs.h"
+#include "interleave.h"
/////////////////////////////
////
@@ -91,6 +93,7 @@ typedef uint32_t set_t;
#define AVX_OPT 8
#define AVX2_OPT 0x10
#define SHA_OPT 0x20
+#define AVX512_OPT 0x40
// return set containing all elements from sets a & b
inline set_t set_union ( set_t a, set_t b ) { return a | b; }


@@ -79,7 +79,7 @@ int64_t argon2_get_max64 ()
bool register_argon2_algo( algo_gate_t* gate )
{
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT;
gate->scanhash = (void*)&scanhash_argon2;
gate->hash = (void*)&argon2hash;
gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root;


@@ -28,6 +28,7 @@ void argon2d_crds_hash( void *output, const void *input )
context.lanes = 4; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 1; // Iterations
+     context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
@@ -70,7 +71,8 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_argon2d_crds;
gate->hash = (void*)&argon2d_crds_hash;
gate->set_target = (void*)&scrypt_set_target;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
}
// Dynamic
@@ -96,6 +98,7 @@ void argon2d_dyn_hash( void *output, const void *input )
context.lanes = 8; // Degree of Parallelism
context.threads = 1; // Threads
context.t_cost = 2; // Iterations
+     context.version = ARGON2_VERSION_10;
argon2_ctx( &context, Argon2_d );
}
@@ -138,6 +141,58 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
gate->scanhash = (void*)&scanhash_argon2d_dyn;
gate->hash = (void*)&argon2d_dyn_hash;
gate->set_target = (void*)&scrypt_set_target;
-  gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT;
+  gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
return true;
}
// Unitus

int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
{
   uint32_t _ALIGN(64) vhash[8];
   uint32_t _ALIGN(64) endiandata[20];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   const uint32_t Htarg = ptarget[7];
   const uint32_t first_nonce = pdata[19];
   uint32_t n = first_nonce;
   uint32_t t_cost = 1;       // 1 iteration
   uint32_t m_cost = 4096;    // use 4MB
   uint32_t parallelism = 1;  // 1 thread, 2 lanes

   for ( int i = 0; i < 19; i++ )
      be32enc( &endiandata[i], pdata[i] );

   do {
      be32enc( &endiandata[19], n );
      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
                        (char*) endiandata, 80, (char*) vhash, 32,
                        ARGON2_VERSION_13 );
      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
      {
         *hashes_done = n - first_nonce + 1;
         pdata[19] = n;
         return true;
      }
      n++;
   } while ( n < max_nonce && !work_restart[thr_id].restart );

   *hashes_done = n - first_nonce + 1;
   pdata[19] = n;
   return 0;
}

int64_t get_max64_0x1ff() { return 0x1ff; }

bool register_argon2d4096_algo( algo_gate_t* gate )
{
   gate->scanhash      = (void*)&scanhash_argon2d4096;
   gate->set_target    = (void*)&scrypt_set_target;
   gate->get_max64     = (void*)&get_max64_0x1ff;
   gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
   return true;
}


@@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>
-// Credits
+// Credits: version = 0x10, m_cost = 250.
bool register_argon2d_crds_algo( algo_gate_t* gate );
void argon2d_crds_hash( void *state, const void *input );
@@ -12,7 +12,7 @@ void argon2d_crds_hash( void *state, const void *input );
int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
-// Dynamic
+// Dynamic: version = 0x10, m_cost = 500.
bool register_argon2d_dyn_algo( algo_gate_t* gate );
void argon2d_dyn_hash( void *state, const void *input );
@@ -21,5 +21,11 @@ int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done );
+// Unitus: version = 0x13, m_cost = 4096.
+bool register_argon2d4096_algo( algo_gate_t* gate );
+int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
#endif


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -19,10 +19,6 @@
#include <stdlib.h>
#include <stdio.h>
#ifdef _WIN32
#include <malloc.h>
#endif
#include "argon2.h"
#include "encoding.h"
#include "core.h"
@@ -31,6 +27,10 @@ const char *argon2_type2string(argon2_type type, int uppercase) {
switch (type) {
case Argon2_d:
return uppercase ? "Argon2d" : "argon2d";
case Argon2_i:
return uppercase ? "Argon2i" : "argon2i";
case Argon2_id:
return uppercase ? "Argon2id" : "argon2id";
}
return NULL;
@@ -46,7 +46,7 @@ int argon2_ctx(argon2_context *context, argon2_type type) {
return result;
}
if (Argon2_d != type) {
if (Argon2_d != type && Argon2_i != type && Argon2_id != type) {
return ARGON2_INCORRECT_TYPE;
}
@@ -62,18 +62,18 @@ int argon2_ctx(argon2_context *context, argon2_type type) {
/* Ensure that all segments have equal length */
memory_blocks = segment_length * (context->lanes * ARGON2_SYNC_POINTS);
instance.version = context->version;
instance.memory = NULL;
instance.passes = context->t_cost;
instance.memory_blocks = memory_blocks;
instance.segment_length = segment_length;
instance.lane_length = segment_length * ARGON2_SYNC_POINTS;
instance.lanes = context->lanes;
instance.limit = 1;
instance.threads = context->threads;
instance.type = type;
if (instance.threads > instance.limit) {
instance.threads = instance.limit;
if (instance.threads > instance.lanes) {
instance.threads = instance.lanes;
}
/* 3. Initialization: Hashing inputs, allocating memory, filling first
@@ -101,7 +101,8 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt, const size_t saltlen,
void *hash, const size_t hashlen, char *encoded,
const size_t encodedlen, argon2_type type){
const size_t encodedlen, argon2_type type,
const uint32_t version){
argon2_context context;
int result;
@@ -145,6 +146,7 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
context.allocate_cbk = NULL;
context.free_cbk = NULL;
context.flags = ARGON2_DEFAULT_FLAGS;
context.version = version;
result = argon2_ctx(&context, type);
@@ -174,23 +176,69 @@ int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
return ARGON2_OK;
}
int argon2i_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, const size_t hashlen,
char *encoded, const size_t encodedlen,
const uint32_t version) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
NULL, hashlen, encoded, encodedlen, Argon2_i,
version );
}
int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash, const size_t hashlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
hash, hashlen, NULL, 0, Argon2_i, version );
}
int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, const size_t hashlen,
char *encoded, const size_t encodedlen) {
char *encoded, const size_t encodedlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
NULL, hashlen, encoded, encodedlen, Argon2_d);
NULL, hashlen, encoded, encodedlen, Argon2_d,
version );
}
int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash, const size_t hashlen) {
const size_t saltlen, void *hash, const size_t hashlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
hash, hashlen, NULL, 0, Argon2_d);
hash, hashlen, NULL, 0, Argon2_d, version );
}
int argon2id_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, const size_t hashlen,
char *encoded, const size_t encodedlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
NULL, hashlen, encoded, encodedlen, Argon2_id,
version);
}
int argon2id_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash, const size_t hashlen,
const uint32_t version ) {
return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
hash, hashlen, NULL, 0, Argon2_id, version );
}
static int argon2_compare(const uint8_t *b1, const uint8_t *b2, size_t len) {
@@ -269,15 +317,33 @@ fail:
return ret;
}
int argon2i_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
return argon2_verify(encoded, pwd, pwdlen, Argon2_i);
}
int argon2d_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
return argon2_verify(encoded, pwd, pwdlen, Argon2_d);
}
int argon2id_verify(const char *encoded, const void *pwd, const size_t pwdlen) {
return argon2_verify(encoded, pwd, pwdlen, Argon2_id);
}
int argon2d_ctx(argon2_context *context) {
return argon2_ctx(context, Argon2_d);
}
int argon2i_ctx(argon2_context *context) {
return argon2_ctx(context, Argon2_i);
}
int argon2id_ctx(argon2_context *context) {
return argon2_ctx(context, Argon2_id);
}
int argon2_verify_ctx(argon2_context *context, const char *hash,
argon2_type type) {
int ret = argon2_ctx(context, type);
@@ -296,6 +362,14 @@ int argon2d_verify_ctx(argon2_context *context, const char *hash) {
return argon2_verify_ctx(context, hash, Argon2_d);
}
int argon2i_verify_ctx(argon2_context *context, const char *hash) {
return argon2_verify_ctx(context, hash, Argon2_i);
}
int argon2id_verify_ctx(argon2_context *context, const char *hash) {
return argon2_verify_ctx(context, hash, Argon2_id);
}
const char *argon2_error_message(int error_code) {
switch (error_code) {
case ARGON2_OK:
@@ -374,307 +448,11 @@ const char *argon2_error_message(int error_code) {
return "Unknown error code";
}
}
/*
size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, uint32_t parallelism,
uint32_t saltlen, uint32_t hashlen, argon2_type type) {
return strlen("$$v=$m=,t=,p=$$") + strlen(argon2_type2string(type, 0)) +
numlen(t_cost) + numlen(m_cost) + numlen(parallelism) +
b64len(saltlen) + b64len(hashlen);
b64len(saltlen) + b64len(hashlen) + numlen(ARGON2_VERSION_NUMBER) + 1;
}
#ifdef __AVX2__
///////////////////////////
// Wolf's Additions
///////////////////////////
#include <stdbool.h>
#include <pthread.h>
#include <x86intrin.h>
#include "../blake2/blake2.h"
typedef struct _Argon2d_Block
{
union
{
uint64_t data[1024 / 8] __attribute__((aligned(32)));
__m128i dqwords[1024 / 16] __attribute__((aligned(32)));
__m256i qqwords[1024 / 32] __attribute__((aligned(32)));
};
} Argon2d_Block;
typedef struct _Argon2ThreadData
{
Argon2d_Block *Matrix;
uint32_t slice;
uint32_t lane;
} Argon2ThreadData;
#define SEGMENT_LENGTH (250U / (4U * 4U)) // memory_blocks / (context->lanes * ARGON2_SYNC_POINTS);
#define LANE_LENGTH (SEGMENT_LENGTH * 4U) // segment_length * ARGON2_SYNC_POINTS;
#define CONCURRENT_THREADS 4
static const uint64_t blake2b_IV[8] =
{
0x6A09E667F3BCC908ULL, 0xBB67AE8584CAA73BULL,
0x3C6EF372FE94F82BULL, 0xA54FF53A5F1D36F1ULL,
0x510E527FADE682D1ULL, 0x9B05688C2B3E6C1FULL,
0x1F83D9ABFB41BD6BULL, 0x5BE0CD19137E2179ULL
};
static const unsigned int blake2b_sigma[12][16] =
{
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
};
#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
#define G(r, i, a, b, c, d) \
do { \
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
d = ROTL64(d ^ a, 32); \
c = c + d; \
b = ROTL64(b ^ c, 40); \
a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \
d = ROTL64(d ^ a, 48); \
c = c + d; \
b = ROTL64(b ^ c, 1); \
} while ((void)0, 0)
#define ROUND(r) \
do { \
G(r, 0, v[0], v[4], v[8], v[12]); \
G(r, 1, v[1], v[5], v[9], v[13]); \
G(r, 2, v[2], v[6], v[10], v[14]); \
G(r, 3, v[3], v[7], v[11], v[15]); \
G(r, 4, v[0], v[5], v[10], v[15]); \
G(r, 5, v[1], v[6], v[11], v[12]); \
G(r, 6, v[2], v[7], v[8], v[13]); \
G(r, 7, v[3], v[4], v[9], v[14]); \
} while ((void)0, 0)
void CompressBlock(uint64_t *h, const uint64_t *m, uint64_t t, uint64_t f)
{
uint64_t v[16];
int i;
for(i = 0; i < 8; ++i) v[i] = h[i];
for(i = 8; i < 16; ++i) v[i] = blake2b_IV[i - 8];
v[12] ^= t;
v[14] ^= f;
int r;
for(r = 0; r < 12; ++r)
{
ROUND(r);
}
for(i = 0; i < 8; ++i) h[i] ^= v[i] ^ v[i + 8];
}
void Argon2dInitHash(void *HashOut, void *Input)
{
blake2b_state BlakeHash;
uint32_t InBuf[64]; // Is only 50 uint32_t, but need more space for Blake2B
memset(InBuf, 0x00, 200);
InBuf[0] = 4UL; // Lanes
InBuf[1] = 32UL; // Output Length
InBuf[2] = 250UL; // Memory Cost
InBuf[3] = 1UL; // Time Cost
InBuf[4] = 16UL; // Argon2 Version Number
InBuf[5] = 0UL; // Type
InBuf[6] = 80UL; // Password Length
memcpy(InBuf + 7, Input, 80); // Password
InBuf[27] = 80UL; // Salt Length
memcpy(InBuf + 28, Input, 80); // Salt
InBuf[48] = 0UL; // Secret Length
InBuf[49] = 0UL; // Associated Data Length
int i;
for(i = 50; i < 64; ++i) InBuf[i] = 0UL;
uint64_t H[8];
for(i = 0; i < 8; ++i) H[i] = blake2b_IV[i];
H[0] ^= 0x0000000001010040;
CompressBlock(H, (uint64_t *)InBuf, 128ULL, 0ULL);
CompressBlock(H, (uint64_t *)(InBuf + 32), 200ULL, 0xFFFFFFFFFFFFFFFFULL);
memcpy(HashOut, H, 64U);
}
void Argon2dFillFirstBlocks(Argon2d_Block *Matrix, void *InitHash)
{
uint32_t lane;
for(lane = 0; lane < 4; ++lane)
{
((uint32_t *)InitHash)[16] = 0;
((uint32_t *)InitHash)[17] = lane;
blake2b_long(Matrix[lane * LANE_LENGTH].data, 1024, InitHash, 72);
((uint32_t *)InitHash)[16] |= 1;
blake2b_long(Matrix[lane * LANE_LENGTH + 1].data, 1024, InitHash, 72);
}
}
#include "../blake2/blamka-round-opt.h"
void Argon2dFillSingleBlock(Argon2d_Block *State, Argon2d_Block *RefBlock, Argon2d_Block *NextBlock)
{
__m256i XY[32];
int i;
for(i = 0; i < 32; ++i)
XY[i] = State->qqwords[i] = _mm256_xor_si256(State->qqwords[i], RefBlock->qqwords[i]);
for(i = 0; i < 8; ++i)
{
BLAKE2_ROUND( State->dqwords[8 * i + 0], State->dqwords[8 * i + 1], State->dqwords[8 * i + 2], State->dqwords[8 * i + 3],
State->dqwords[8 * i + 4], State->dqwords[8 * i + 5], State->dqwords[8 * i + 6], State->dqwords[8 * i + 7]);
}
for(i = 0; i < 8; ++i)
{
BLAKE2_ROUND( State->dqwords[8 * 0 + i], State->dqwords[8 * 1 + i], State->dqwords[8 * 2 + i], State->dqwords[8 * 3 + i],
State->dqwords[8 * 4 + i], State->dqwords[8 * 5 + i], State->dqwords[8 * 6 + i], State->dqwords[8 * 7 + i]);
}
for(i = 0; i < 32; ++i)
{
State->qqwords[i] = _mm256_xor_si256(State->qqwords[i], XY[i]);
_mm256_store_si256(NextBlock->qqwords + i, State->qqwords[i]);
}
}
void FillSegment(Argon2d_Block *Matrix, uint32_t slice, uint32_t lane)
{
uint32_t startidx, prevoff, curoff;
Argon2d_Block State;
startidx = (!slice) ? 2 : 0;
curoff = lane * LANE_LENGTH + slice * SEGMENT_LENGTH + startidx;
//if(!(curoff % LANE_LENGTH)) prevoff = curoff + LANE_LENGTH - 1;
//else prevoff = curoff - 1;
prevoff = (!(curoff % LANE_LENGTH)) ? curoff + LANE_LENGTH - 1 : curoff - 1;
memcpy(State.data, (Matrix + prevoff)->data, 1024);
int i;
for(i = startidx; i < SEGMENT_LENGTH; ++i, ++curoff, ++prevoff)
{
if((curoff % LANE_LENGTH) == 1) prevoff = curoff - 1;
uint64_t pseudorand = Matrix[prevoff].data[0];
uint64_t reflane = (!slice) ? lane : (pseudorand >> 32) & 3; // mod lanes
uint32_t index = i;
bool samelane = reflane == lane;
pseudorand &= 0xFFFFFFFFULL;
uint32_t refareasize = ((reflane == lane) ? slice * SEGMENT_LENGTH + index - 1 : slice * SEGMENT_LENGTH + ((!index) ? -1 : 0));
if(!slice) refareasize = index - 1;
uint64_t relativepos = (pseudorand & 0xFFFFFFFFULL);
relativepos = relativepos * relativepos >> 32;
relativepos = refareasize - 1 - (refareasize * relativepos >> 32);
uint32_t startpos = 0;
uint32_t abspos = (startpos + relativepos) % LANE_LENGTH;
uint32_t refidx = abspos;
Argon2dFillSingleBlock(&State, Matrix + (LANE_LENGTH * reflane + refidx), Matrix + curoff);
}
}
void *ThreadedSegmentFill(void *ThrData)
{
Argon2ThreadData *Data = (Argon2ThreadData *)ThrData;
FillSegment(Data->Matrix, Data->slice, Data->lane);
return(NULL);
}
void Argon2dFillAllBlocks(Argon2d_Block *Matrix)
{
pthread_t ThrHandles[CONCURRENT_THREADS];
Argon2ThreadData ThrData[CONCURRENT_THREADS];
int s;
for(s = 0; s < 4; ++s)
{
// WARNING: Assumes CONCURRENT_THREADS == lanes == 4
int l;
for(l = 0; l < 4; ++l)
{
FillSegment(Matrix, s, l);
}
}
}
void Argon2dFinalizeHash(void *OutputHash, Argon2d_Block *Matrix)
{
int l;
for(l = 1; l < 4; ++l)
{
int i;
for(i = 0; i < 32; ++i)
Matrix[LANE_LENGTH - 1].qqwords[i] = _mm256_xor_si256(Matrix[LANE_LENGTH - 1].qqwords[i], Matrix[LANE_LENGTH * l + (LANE_LENGTH - 1)].qqwords[i]);
}
blake2b_long(OutputHash, 32, Matrix[LANE_LENGTH - 1].data, 1024);
}
void WolfArgon2dPoWHash(void *Output, void *Matrix, const void *BlkHdr)
{
uint8_t tmp[72];
Argon2dInitHash(tmp, (uint8_t *)BlkHdr);
Argon2dFillFirstBlocks(Matrix, tmp);
Argon2dFillAllBlocks(Matrix);
Argon2dFinalizeHash((uint8_t *)Output, Matrix);
}
void WolfArgon2dAllocateCtx(void **Matrix)
{
#ifdef _WIN32
*((Argon2d_Block **)Matrix) = (Argon2d_Block *)_aligned_malloc(32, sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
#else
*((Argon2d_Block **)Matrix) = (Argon2d_Block *)malloc(sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
posix_memalign(Matrix, 32, sizeof(Argon2d_Block) * (SEGMENT_LENGTH << 4));
#endif
}
void WolfArgon2dFreeCtx(void *Matrix)
{
free(Matrix);
}
#endif
*/


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -29,10 +29,13 @@ extern "C" {
/* Symbols visibility control */
#ifdef A2_VISCTL
#define ARGON2_PUBLIC __attribute__((visibility("default")))
#define ARGON2_LOCAL __attribute__ ((visibility ("hidden")))
#elif _MSC_VER
#define ARGON2_PUBLIC __declspec(dllexport)
#define ARGON2_LOCAL
#else
#define ARGON2_PUBLIC
#define ARGON2_LOCAL
#endif
/*
@@ -206,6 +209,8 @@ typedef struct Argon2_Context {
uint32_t lanes; /* number of lanes */
uint32_t threads; /* maximum number of threads */
uint32_t version; /* version number */
allocate_fptr allocate_cbk; /* pointer to memory allocator */
deallocate_fptr free_cbk; /* pointer to memory deallocator */
@@ -214,9 +219,15 @@ typedef struct Argon2_Context {
/* Argon2 primitive type */
typedef enum Argon2_type {
Argon2_d = 0
Argon2_d = 0,
Argon2_i = 1,
Argon2_id = 2
} argon2_type;
/* Version of the algorithm */
#define ARGON2_VERSION_10 0x10
#define ARGON2_VERSION_13 0x13
/*
* Function that gives the string representation of an argon2_type.
* @param type The argon2_type that we want the string for
@@ -233,8 +244,31 @@ ARGON2_PUBLIC const char *argon2_type2string(argon2_type type, int uppercase);
ARGON2_PUBLIC int argon2_ctx(argon2_context *context, argon2_type type);
/**
* Hashes a password with Argon2i, producing a raw hash by allocating memory at
* @hash
* Hashes a password with Argon2i, producing an encoded hash
* @param t_cost Number of iterations
* @param m_cost Sets memory usage to m_cost kibibytes
* @param parallelism Number of threads and compute lanes
* @param pwd Pointer to password
* @param pwdlen Password size in bytes
* @param salt Pointer to salt
* @param saltlen Salt size in bytes
* @param hashlen Desired length of the hash in bytes
* @param encoded Buffer where to write the encoded hash
* @param encodedlen Size of the buffer (thus max size of the encoded hash)
* @pre Different parallelism levels will give different results
* @pre Returns ARGON2_OK if successful
*/
ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost,
const uint32_t m_cost,
const uint32_t parallelism,
const void *pwd, const size_t pwdlen,
const void *salt, const size_t saltlen,
const size_t hashlen, char *encoded,
const size_t encodedlen,
const uint32_t version );
/**
* Hashes a password with Argon2i, producing a raw hash at @hash
* @param t_cost Number of iterations
* @param m_cost Sets memory usage to m_cost kibibytes
* @param parallelism Number of threads and compute lanes
@@ -247,11 +281,12 @@ ARGON2_PUBLIC int argon2_ctx(argon2_context *context, argon2_type type);
* @pre Different parallelism levels will give different results
* @pre Returns ARGON2_OK if successful
*/
ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
ARGON2_PUBLIC int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash,
const size_t hashlen);
const size_t hashlen,
const uint32_t version );
ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
const uint32_t m_cost,
@@ -259,7 +294,32 @@ ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
const void *pwd, const size_t pwdlen,
const void *salt, const size_t saltlen,
const size_t hashlen, char *encoded,
const size_t encodedlen);
const size_t encodedlen,
const uint32_t version );
ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash,
const size_t hashlen,
const uint32_t version );
ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
const uint32_t m_cost,
const uint32_t parallelism,
const void *pwd, const size_t pwdlen,
const void *salt, const size_t saltlen,
const size_t hashlen, char *encoded,
const size_t encodedlen,
const uint32_t version );
ARGON2_PUBLIC int argon2id_hash_raw(const uint32_t t_cost,
const uint32_t m_cost,
const uint32_t parallelism, const void *pwd,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash,
const size_t hashlen,
const uint32_t version );
/* generic function underlying the above ones */
ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
@@ -267,7 +327,8 @@ ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
const size_t pwdlen, const void *salt,
const size_t saltlen, void *hash,
const size_t hashlen, char *encoded,
const size_t encodedlen, argon2_type type);
const size_t encodedlen, argon2_type type,
const uint32_t version );
/**
* Verifies a password against an encoded string
@@ -276,9 +337,15 @@ ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
* @param pwd Pointer to password
* @pre Returns ARGON2_OK if successful
*/
ARGON2_PUBLIC int argon2i_verify(const char *encoded, const void *pwd,
const size_t pwdlen);
ARGON2_PUBLIC int argon2d_verify(const char *encoded, const void *pwd,
const size_t pwdlen);
ARGON2_PUBLIC int argon2id_verify(const char *encoded, const void *pwd,
const size_t pwdlen);
/* generic function underlying the above ones */
ARGON2_PUBLIC int argon2_verify(const char *encoded, const void *pwd,
const size_t pwdlen, argon2_type type);
@@ -293,6 +360,27 @@ ARGON2_PUBLIC int argon2_verify(const char *encoded, const void *pwd,
*/
ARGON2_PUBLIC int argon2d_ctx(argon2_context *context);
/**
* Argon2i: Version of Argon2 that picks memory blocks
* independently of the password and salt. Resistant to side-channel
* attacks, but worse w.r.t. tradeoff attacks if only one pass is used.
*****
* @param context Pointer to current Argon2 context
* @return Zero if successful, a non zero error code otherwise
*/
ARGON2_PUBLIC int argon2i_ctx(argon2_context *context);
/**
* Argon2id: Version of Argon2 where the first half-pass over memory is
* password-independent, the rest are password-dependent (on the password and
* salt). OK against side channels (they reduce to 1/2-pass Argon2i), and
* better w.r.t. tradeoff attacks (similar to Argon2d).
*****
* @param context Pointer to current Argon2 context
* @return Zero if successful, a non zero error code otherwise
*/
ARGON2_PUBLIC int argon2id_ctx(argon2_context *context);
/**
* Verify if a given password is correct for Argon2d hashing
* @param context Pointer to current Argon2 context
@@ -302,6 +390,25 @@ ARGON2_PUBLIC int argon2d_ctx(argon2_context *context);
*/
ARGON2_PUBLIC int argon2d_verify_ctx(argon2_context *context, const char *hash);
/**
* Verify if a given password is correct for Argon2i hashing
* @param context Pointer to current Argon2 context
* @param hash The password hash to verify. The length of the hash is
* specified by the context outlen member
* @return Zero if successful, a non zero error code otherwise
*/
ARGON2_PUBLIC int argon2i_verify_ctx(argon2_context *context, const char *hash);
/**
* Verify if a given password is correct for Argon2id hashing
* @param context Pointer to current Argon2 context
* @param hash The password hash to verify. The length of the hash is
* specified by the context outlen member
* @return Zero if successful, a non zero error code otherwise
*/
ARGON2_PUBLIC int argon2id_verify_ctx(argon2_context *context,
const char *hash);
/* generic function underlying the above ones */
ARGON2_PUBLIC int argon2_verify_ctx(argon2_context *context, const char *hash,
argon2_type type);
@@ -326,18 +433,6 @@ ARGON2_PUBLIC size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost,
uint32_t parallelism, uint32_t saltlen,
uint32_t hashlen, argon2_type type);
#ifdef __AVX2__
///////////////////////////
// Wolf's Additions
///////////////////////////
void WolfArgon2dPoWHash(void *Output, void *Matrix, const void *BlkHdr);
void WolfArgon2dAllocateCtx(void **Matrix);
void WolfArgon2dFreeCtx(void *Matrix);
#endif
#if defined(__cplusplus)
}
#endif


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -25,7 +25,6 @@
#endif
#define VC_GE_2005(version) (version >= 1400)
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -35,6 +34,10 @@
#include "../blake2/blake2.h"
#include "../blake2/blake2-impl.h"
#ifdef GENKAT
#include "genkat.h"
#endif
#if defined(__clang__)
#if __has_attribute(optnone)
#define NOT_OPTIMIZED __attribute__((optnone))
@@ -131,7 +134,7 @@ void NOT_OPTIMIZED secure_wipe_memory(void *v, size_t n) {
}
/* Memory clear flag defaults to true. */
int FLAG_clear_internal_memory = 1;
int FLAG_clear_internal_memory = 0;
void clear_internal_memory(void *v, size_t n) {
if (FLAG_clear_internal_memory && v) {
secure_wipe_memory(v, n);
@@ -163,6 +166,10 @@ void finalize(const argon2_context *context, argon2_instance_t *instance) {
clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
}
#ifdef GENKAT
print_tag(context->out, context->outlen);
#endif
free_memory(context, (uint8_t *)instance->memory,
instance->memory_blocks, sizeof(block));
}
@@ -249,6 +256,9 @@ static int fill_memory_blocks_st(argon2_instance_t *instance) {
fill_segment(instance, position);
}
}
#ifdef GENKAT
internal_kat(instance, r); /* Print all memory blocks */
#endif
}
return ARGON2_OK;
}
@@ -331,6 +341,10 @@ static int fill_memory_blocks_mt(argon2_instance_t *instance) {
}
}
}
#ifdef GENKAT
internal_kat(instance, r); /* Print all memory blocks */
#endif
}
fail:
@@ -530,7 +544,8 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
store32(&value, context->t_cost);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, ARGON2_VERSION_NUMBER);
// store32(&value, ARGON2_VERSION_NUMBER);
store32(&value, context->version);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
store32(&value, (uint32_t)type);
@@ -538,7 +553,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
store32(&value, context->pwdlen);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
if (context->pwd != NULL) {
blake2b_update(&BlakeHash, (const uint8_t *)context->pwd,
context->pwdlen);
@@ -548,7 +563,7 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
context->pwdlen = 0;
}
}
store32(&value, context->saltlen);
blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));
@@ -602,11 +617,14 @@ int initialize(argon2_instance_t *instance, argon2_context *context) {
/* Hashing all inputs */
initial_hash(blockhash, context, instance->type);
/* Zeroing 8 extra bytes */
clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH,
ARGON2_PREHASH_SEED_LENGTH -
ARGON2_PREHASH_DIGEST_LENGTH);
#ifdef GENKAT
initial_kat(blockhash, context, instance->type);
#endif
/* 3. Creating first blocks, we always have at least two blocks in a slice
*/
fill_first_blocks(blockhash, instance);


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -25,12 +25,12 @@
/**********************Argon2 internal constants*******************************/
enum argon2_core_constants {
/* Version of the algorithm */
ARGON2_VERSION_NUMBER = 0x10,
/* Memory block size in bytes */
ARGON2_BLOCK_SIZE = 1024,
ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8,
ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16,
ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32,
ARGON2_512BIT_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 64,
/* Number of pseudo-random values generated by one call to Blake in Argon2i
to
@@ -76,7 +76,6 @@ typedef struct Argon2_instance_t {
uint32_t segment_length;
uint32_t lane_length;
uint32_t lanes;
uint32_t limit;
uint32_t threads;
argon2_type type;
int print_internals; /* whether to print the memory blocks */


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -326,6 +326,10 @@ int decode_string(argon2_context *ctx, const char *str, argon2_type type) {
CC("$");
CC(type_string);
/* Reading the version number if the default is suppressed */
ctx->version = ARGON2_VERSION_10;
CC_opt("$v=", DECIMAL_U32(ctx->version));
CC("$m=");
DECIMAL_U32(ctx->m_cost);
CC(",t=");
@@ -411,6 +415,9 @@ int encode_string(char *dst, size_t dst_len, argon2_context *ctx,
SS("$");
SS(type_string);
SS("$v=");
SX(ctx->version);
SS("$m=");
SX(ctx->m_cost);
SS(",t=");


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -34,6 +34,117 @@
* @param with_xor Whether to XOR into the new block (1) or just overwrite (0)
* @pre all block pointers must be valid
*/
#if defined(__AVX512F__)
static void fill_block(__m512i *state, const block *ref_block,
block *next_block, int with_xor) {
__m512i block_XY[ARGON2_512BIT_WORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
state[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
block_XY[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm512_xor_si512(
state[i], _mm512_loadu_si512((const __m512i *)ref_block->v + i));
}
}
BLAKE2_ROUND_1( state[ 0], state[ 1], state[ 2], state[ 3],
state[ 4], state[ 5], state[ 6], state[ 7] );
BLAKE2_ROUND_1( state[ 8], state[ 9], state[10], state[11],
state[12], state[13], state[14], state[15] );
BLAKE2_ROUND_2( state[ 0], state[ 2], state[ 4], state[ 6],
state[ 8], state[10], state[12], state[14] );
BLAKE2_ROUND_2( state[ 1], state[ 3], state[ 5], state[ 7],
state[ 9], state[11], state[13], state[15] );
/*
for (i = 0; i < 2; ++i) {
BLAKE2_ROUND_1(
state[8 * i + 0], state[8 * i + 1], state[8 * i + 2], state[8 * i + 3],
state[8 * i + 4], state[8 * i + 5], state[8 * i + 6], state[8 * i + 7]);
}
for (i = 0; i < 2; ++i) {
BLAKE2_ROUND_2(
state[2 * 0 + i], state[2 * 1 + i], state[2 * 2 + i], state[2 * 3 + i],
state[2 * 4 + i], state[2 * 5 + i], state[2 * 6 + i], state[2 * 7 + i]);
}
*/
for (i = 0; i < ARGON2_512BIT_WORDS_IN_BLOCK; i++) {
state[i] = _mm512_xor_si512(state[i], block_XY[i]);
_mm512_storeu_si512((__m512i *)next_block->v + i, state[i]);
}
}
#elif defined(__AVX2__)
static void fill_block(__m256i *state, const block *ref_block,
block *next_block, int with_xor) {
__m256i block_XY[ARGON2_HWORDS_IN_BLOCK];
unsigned int i;
if (with_xor) {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
block_XY[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i *)next_block->v + i));
}
} else {
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
block_XY[i] = state[i] = _mm256_xor_si256(
state[i], _mm256_loadu_si256((const __m256i *)ref_block->v + i));
}
}
BLAKE2_ROUND_1( state[ 0], state[ 4], state[ 1], state[ 5],
state[ 2], state[ 6], state[ 3], state[ 7] );
BLAKE2_ROUND_1( state[ 8], state[12], state[ 9], state[13],
state[10], state[14], state[11], state[15] );
BLAKE2_ROUND_1( state[16], state[20], state[17], state[21],
state[18], state[22], state[19], state[23] );
BLAKE2_ROUND_1( state[24], state[28], state[25], state[29],
state[26], state[30], state[27], state[31] );
BLAKE2_ROUND_2( state[ 0], state[ 4], state[ 8], state[12],
state[16], state[20], state[24], state[28] );
BLAKE2_ROUND_2( state[ 1], state[ 5], state[ 9], state[13],
state[17], state[21], state[25], state[29] );
BLAKE2_ROUND_2( state[ 2], state[ 6], state[10], state[14],
state[18], state[22], state[26], state[30] );
BLAKE2_ROUND_2( state[ 3], state[ 7], state[11], state[15],
state[19], state[23], state[27], state[31] );
/*
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
}
for (i = 0; i < 4; ++i) {
BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
}
*/
for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
state[i] = _mm256_xor_si256(state[i], block_XY[i]);
_mm256_storeu_si256((__m256i *)next_block->v + i, state[i]);
}
}
#else // SSE2
static void fill_block(__m128i *state, const block *ref_block,
block *next_block, int with_xor) {
__m128i block_XY[ARGON2_OWORDS_IN_BLOCK];
@@ -53,6 +164,41 @@ static void fill_block(__m128i *state, const block *ref_block,
}
}
BLAKE2_ROUND( state[ 0], state[ 1], state[ 2], state[ 3],
state[ 4], state[ 5], state[ 6], state[ 7] );
BLAKE2_ROUND( state[ 8], state[ 9], state[10], state[11],
state[12], state[13], state[14], state[15] );
BLAKE2_ROUND( state[16], state[17], state[18], state[19],
state[20], state[21], state[22], state[23] );
BLAKE2_ROUND( state[24], state[25], state[26], state[27],
state[28], state[29], state[30], state[31] );
BLAKE2_ROUND( state[32], state[33], state[34], state[35],
state[36], state[37], state[38], state[39] );
BLAKE2_ROUND( state[40], state[41], state[42], state[43],
state[44], state[45], state[46], state[47] );
BLAKE2_ROUND( state[48], state[49], state[50], state[51],
state[52], state[53], state[54], state[55] );
BLAKE2_ROUND( state[56], state[57], state[58], state[59],
state[60], state[61], state[62], state[63] );
BLAKE2_ROUND( state[ 0], state[ 8], state[16], state[24],
state[32], state[40], state[48], state[56] );
BLAKE2_ROUND( state[ 1], state[ 9], state[17], state[25],
state[33], state[41], state[49], state[57] );
BLAKE2_ROUND( state[ 2], state[10], state[18], state[26],
state[34], state[42], state[50], state[58] );
BLAKE2_ROUND( state[ 3], state[11], state[19], state[27],
state[35], state[43], state[51], state[59] );
BLAKE2_ROUND( state[ 4], state[12], state[20], state[28],
state[36], state[44], state[52], state[60] );
BLAKE2_ROUND( state[ 5], state[13], state[21], state[29],
state[37], state[45], state[53], state[61] );
BLAKE2_ROUND( state[ 6], state[14], state[22], state[30],
state[38], state[46], state[54], state[62] );
BLAKE2_ROUND( state[ 7], state[15], state[23], state[31],
state[39], state[47], state[55], state[63] );
/*
for (i = 0; i < 8; ++i) {
BLAKE2_ROUND(state[8 * i + 0], state[8 * i + 1], state[8 * i + 2],
state[8 * i + 3], state[8 * i + 4], state[8 * i + 5],
@@ -64,17 +210,28 @@ static void fill_block(__m128i *state, const block *ref_block,
state[8 * 3 + i], state[8 * 4 + i], state[8 * 5 + i],
state[8 * 6 + i], state[8 * 7 + i]);
}
*/
for (i = 0; i < ARGON2_OWORDS_IN_BLOCK; i++) {
state[i] = _mm_xor_si128(state[i], block_XY[i]);
_mm_storeu_si128((__m128i *)next_block->v + i, state[i]);
}
}
#endif
#if 0
static void next_addresses(block *address_block, block *input_block) {
/*Temporary zero-initialized blocks*/
#if defined(__AVX512F__)
__m512i zero_block[ARGON2_512BIT_WORDS_IN_BLOCK];
__m512i zero2_block[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
__m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];
#else
__m128i zero_block[ARGON2_OWORDS_IN_BLOCK];
__m128i zero2_block[ARGON2_OWORDS_IN_BLOCK];
#endif
memset(zero_block, 0, sizeof(zero_block));
memset(zero2_block, 0, sizeof(zero2_block));
@@ -88,30 +245,53 @@ static void next_addresses(block *address_block, block *input_block) {
/*Second iteration of G*/
fill_block(zero2_block, address_block, address_block, 0);
}
#endif
void fill_segment(const argon2_instance_t *instance,
argon2_position_t position) {
block *ref_block = NULL, *curr_block = NULL;
block address_block, input_block;
// block address_block, input_block;
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
__m128i state[64];
int data_independent_addressing;
#if defined(__AVX512F__)
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];
#else
__m128i state[ARGON2_OWORDS_IN_BLOCK];
#endif
// int data_independent_addressing;
if (instance == NULL) {
return;
}
// data_independent_addressing =
// (instance->type == Argon2_i) ||
// (instance->type == Argon2_id && (position.pass == 0) &&
// (position.slice < ARGON2_SYNC_POINTS / 2));
// if (data_independent_addressing) {
// init_block_value(&input_block, 0);
// input_block.v[0] = position.pass;
// input_block.v[1] = position.lane;
// input_block.v[2] = position.slice;
// input_block.v[3] = instance->memory_blocks;
// input_block.v[4] = instance->passes;
// input_block.v[5] = instance->type;
// }
starting_index = 0;
if ((0 == position.pass) && (0 == position.slice)) {
starting_index = 2; /* we have already generated the first two blocks */
/* Don't forget to generate the first block of addresses: */
if (data_independent_addressing) {
next_addresses(&address_block, &input_block);
}
// if (data_independent_addressing) {
// next_addresses(&address_block, &input_block);
// }
}
/* Offset of the current block */
@@ -137,14 +317,14 @@ void fill_segment(const argon2_instance_t *instance,
/* 1.2 Computing the index of the reference block */
/* 1.2.1 Taking pseudo-random value from the previous block */
if (data_independent_addressing) {
if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
next_addresses(&address_block, &input_block);
}
pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
} else {
// if (data_independent_addressing) {
// if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
// next_addresses(&address_block, &input_block);
// }
// pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
// } else {
pseudo_rand = instance->memory[prev_offset].v[0];
}
// }
/* 1.2.2 Computing the lane of the reference block */
ref_lane = ((pseudo_rand >> 32)) % instance->lanes;
@@ -165,8 +345,15 @@ void fill_segment(const argon2_instance_t *instance,
ref_block =
instance->memory + instance->lane_length * ref_lane + ref_index;
curr_block = instance->memory + curr_offset;
fill_block(state, ref_block, curr_block, 0);
if (ARGON2_VERSION_10 == instance->version) {
/* version 1.2.1 and earlier: overwrite, not XOR */
fill_block(state, ref_block, curr_block, 0);
} else {
if(0 == position.pass) {
fill_block(state, ref_block, curr_block, 0);
} else {
fill_block(state, ref_block, curr_block, 1);
}
}
}
}


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -46,7 +46,7 @@ typedef pthread_t argon2_thread_handle_t;
* @param func A function pointer for the thread's entry point. Must not be
* NULL.
* @param args Pointer that is passed as an argument to @func. May be NULL.
* @return 0 if @handle and @func are valid pointers and a thread is successfuly
* @return 0 if @handle and @func are valid pointers and a thread is successfully
* created.
*/
int argon2_thread_create(argon2_thread_handle_t *handle,


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -153,4 +153,4 @@ static BLAKE2_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) {
void clear_internal_memory(void *v, size_t n);
#endif
#endif


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -78,7 +78,7 @@ int blake2b_final(blake2b_state *S, void *out, size_t outlen);
/* Simple API */
int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
const void *key, size_t keylen);
const void *key, size_t keylen);
/* Argon2 Team - Begin Code */
int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
@@ -88,4 +88,4 @@ int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
}
#endif
#endif
#endif


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -387,4 +387,4 @@ fail:
return ret;
#undef TRY
}
/* Argon2 Team - End Code */
/* Argon2 Team - End Code */


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -29,6 +29,8 @@
#include <x86intrin.h>
#endif
#if !defined(__AVX512F__)
#if !defined(__AVX2__)
#if !defined(__XOP__)
#if defined(__SSSE3__)
#define r16 \
@@ -176,5 +178,294 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#else /* __AVX2__ */
#endif
#include <immintrin.h>
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr32(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
\
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr24(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D1 = rotr32(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
\
B1 = _mm256_xor_si256(B1, C1); \
B1 = rotr24(B1); \
} while((void)0, 0);
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i ml = _mm256_mul_epu32(A0, B0); \
ml = _mm256_add_epi64(ml, ml); \
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
D0 = _mm256_xor_si256(D0, A0); \
D0 = rotr16(D0); \
\
ml = _mm256_mul_epu32(C0, D0); \
ml = _mm256_add_epi64(ml, ml); \
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
B0 = _mm256_xor_si256(B0, C0); \
B0 = rotr63(B0); \
\
ml = _mm256_mul_epu32(A1, B1); \
ml = _mm256_add_epi64(ml, ml); \
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
D1 = _mm256_xor_si256(D1, A1); \
D1 = rotr16(D1); \
\
ml = _mm256_mul_epu32(C1, D1); \
ml = _mm256_add_epi64(ml, ml); \
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
B1 = _mm256_xor_si256(B1, C1); \
B1 = rotr63(B1); \
} while((void)0, 0);
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while((void)0, 0);
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
} while(0);
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
\
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while((void)0, 0);
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
\
tmp1 = C0; \
C0 = C1; \
C1 = tmp1; \
\
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
} while((void)0, 0);
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
} while((void)0, 0);
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do{ \
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
\
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
} while((void)0, 0);
#endif /* __AVX2__ */
#else /* __AVX512F__ */
#include <immintrin.h>
#define ror64(x, n) _mm512_ror_epi64((x), (n))
static __m512i muladd(__m512i x, __m512i y)
{
__m512i z = _mm512_mul_epu32(x, y);
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
A1 = muladd(A1, B1); \
\
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ror64(D0, 32); \
D1 = ror64(D1, 32); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
\
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ror64(B0, 24); \
B1 = ror64(B1, 24); \
} while ((void)0, 0)
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
A0 = muladd(A0, B0); \
A1 = muladd(A1, B1); \
\
D0 = _mm512_xor_si512(D0, A0); \
D1 = _mm512_xor_si512(D1, A1); \
\
D0 = ror64(D0, 16); \
D1 = ror64(D1, 16); \
\
C0 = muladd(C0, D0); \
C1 = muladd(C1, D1); \
\
B0 = _mm512_xor_si512(B0, C0); \
B1 = _mm512_xor_si512(B1, C1); \
\
B0 = ror64(B0, 63); \
B1 = ror64(B1, 63); \
} while ((void)0, 0)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
} while ((void)0, 0)
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
\
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
\
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
} while ((void)0, 0)
#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
do { \
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
\
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
\
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
} while ((void)0, 0)
#define SWAP_HALVES(A0, A1) \
do { \
__m512i t0, t1; \
t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
A0 = t0; \
A1 = t1; \
} while((void)0, 0)
#define SWAP_QUARTERS(A0, A1) \
do { \
SWAP_HALVES(A0, A1); \
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
} while((void)0, 0)
#define UNSWAP_QUARTERS(A0, A1) \
do { \
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
SWAP_HALVES(A0, A1); \
} while((void)0, 0)
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
do { \
SWAP_HALVES(A0, B0); \
SWAP_HALVES(C0, D0); \
SWAP_HALVES(A1, B1); \
SWAP_HALVES(C1, D1); \
BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
SWAP_HALVES(A0, B0); \
SWAP_HALVES(C0, D0); \
SWAP_HALVES(A1, B1); \
SWAP_HALVES(C1, D1); \
} while ((void)0, 0)
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
do { \
SWAP_QUARTERS(A0, A1); \
SWAP_QUARTERS(B0, B1); \
SWAP_QUARTERS(C0, C1); \
SWAP_QUARTERS(D0, D1); \
BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
UNSWAP_QUARTERS(A0, A1); \
UNSWAP_QUARTERS(B0, B1); \
UNSWAP_QUARTERS(C0, C1); \
UNSWAP_QUARTERS(D0, D1); \
} while ((void)0, 0)
#endif /* __AVX512F__ */
#endif /* BLAKE_ROUND_MKA_OPT_H */


@@ -4,7 +4,7 @@
* Copyright 2015
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
*
* You may use this work under the terms of a Creative Commons CC0 1.0
* You may use this work under the terms of a Creative Commons CC0 1.0
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
* these licenses can be found at:
*
@@ -21,7 +21,7 @@
#include "blake2.h"
#include "blake2-impl.h"
/*designed by the Lyra PHC team */
/* designed by the Lyra PHC team */
static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
const uint64_t m = UINT64_C(0xFFFFFFFF);
const uint64_t xy = (x & m) * (y & m);
@@ -53,4 +53,4 @@ static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
G(v3, v4, v9, v14); \
} while ((void)0, 0)
#endif
#endif


@@ -325,7 +325,7 @@ int scanhash_cryptolight(int thr_id, struct work *work,
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
#ifndef NO_AES_NI
#if defined(__AES__)
do {
*nonceptr = ++n;
cryptolight_hash_ctx_aes_ni(hash, pdata, 76, ctx);


@@ -1,14 +1,11 @@
#if defined(__AES__)
#include <x86intrin.h>
#include <memory.h>
#include "cryptonight.h"
#include "miner.h"
#include "crypto/c_keccak.h"
#include <immintrin.h>
//#include "avxdefs.h"
void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
void that_fucking_loop(uint8_t a[16], uint8_t b[16], uint8_t *long_state);
static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
{
@@ -25,7 +22,6 @@ static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
{
#ifndef NO_AES_NI
__m128i tmp2, tmp4;
tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
@@ -37,14 +33,12 @@ static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
tmp4 = _mm_slli_si128(tmp4, 0x04);
*tmp3 = _mm_xor_si128(*tmp3, tmp4);
*tmp3 = _mm_xor_si128(*tmp3, tmp2);
#endif
}
// Special thanks to Intel for helping me
// with ExpandAESKey256() and its subroutines
static inline void ExpandAESKey256(char *keybuf)
{
#ifndef NO_AES_NI
__m128i tmp1, tmp2, tmp3, *keys;
keys = (__m128i *)keybuf;
@@ -91,7 +85,6 @@ static inline void ExpandAESKey256(char *keybuf)
tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
ExpandAESKey256_sub1(&tmp1, &tmp2);
keys[14] = tmp1;
#endif
}
// align to 64 byte cache line
@@ -109,13 +102,19 @@ static __thread cryptonight_ctx ctx;
void cryptonight_hash_aes( void *restrict output, const void *input, int len )
{
#ifndef NO_AES_NI
uint8_t ExpandedKey[256] __attribute__((aligned(64)));
__m128i *longoutput, *expkey, *xmminput;
size_t i, j;
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
if ( cryptonightV7 && len < 43 )
return;
const uint64_t tweak = cryptonightV7
? *((const uint64_t*) (((const uint8_t*)input) + 35))
^ ctx.state.hs.w[24] : 0;
memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
ExpandAESKey256( ExpandedKey );
memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
@@ -214,7 +213,15 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
_mm_store_si128( (__m128i*)c, c_x );
b_x = _mm_xor_si128( b_x, c_x );
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
_mm_store_si128( lsa, b_x );
_mm_store_si128( lsa, b_x );
if ( cryptonightV7 )
{
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
b[0] = nextblock[0];
b[1] = nextblock[1];
@@ -227,10 +234,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
: "cc" );
b_x = c_x;
nextblock[0] = a[0] + hi;
nextblock[1] = a[1] + lo;
a[0] = b[0] ^ nextblock[0];
a[1] = b[1] ^ nextblock[1];
a[0] += hi;
a[1] += lo;
nextblock[0] = a[0];
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
a[0] ^= b[0];
a[1] ^= b[1];
lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
a_x = _mm_load_si128( (__m128i*)a );
c_x = _mm_load_si128( lsa );
@@ -241,6 +252,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
b_x = _mm_xor_si128( b_x, c_x );
nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
_mm_store_si128( lsa, b_x );
if ( cryptonightV7 )
{
const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
b[0] = nextblock[0];
b[1] = nextblock[1];
@@ -251,8 +270,12 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
"rm" ( b[0] )
: "cc" );
nextblock[0] = a[0] + hi;
nextblock[1] = a[1] + lo;
a[0] += hi;
a[1] += lo;
nextblock[0] = a[0];
nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
a[0] ^= b[0];
a[1] ^= b[1];
memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
ExpandAESKey256( ExpandedKey );
@@ -330,5 +353,5 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
#endif
}
#endif


@@ -7,11 +7,11 @@
#include "cpuminer-config.h"
#include "algo-gate-api.h"
#ifndef NO_AES_NI
#if defined(__AES__)
#include "algo/groestl/aes_ni/hash-groestl256.h"
#endif
#else
#include "crypto/c_groestl.h"
#endif
#include "crypto/c_blake256.h"
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
@@ -30,12 +30,12 @@ void do_blake_hash(const void* input, size_t len, char* output) {
}
void do_groestl_hash(const void* input, size_t len, char* output) {
#ifdef NO_AES_NI
groestl(input, len * 8, (uint8_t*)output);
#else
#if defined(__AES__)
hashState_groestl256 ctx;
init_groestl256( &ctx, 32 );
update_and_final_groestl256( &ctx, output, input, len * 8 );
#else
groestl(input, len * 8, (uint8_t*)output);
#endif
}
@@ -52,23 +52,24 @@ void (* const extra_hashes[4])( const void *, size_t, char *) =
void cryptonight_hash( void *restrict output, const void *input, int len )
{
#ifdef NO_AES_NI
cryptonight_hash_ctx ( output, input, len );
#else
#if defined(__AES__)
cryptonight_hash_aes( output, input, len );
#else
cryptonight_hash_ctx ( output, input, len );
#endif
}
void cryptonight_hash_suw( void *restrict output, const void *input )
{
#ifdef NO_AES_NI
cryptonight_hash_ctx ( output, input, 76 );
#else
#if defined(__AES__)
cryptonight_hash_aes( output, input, 76 );
#else
cryptonight_hash_ctx ( output, input, 76 );
#endif
}
bool cryptonightV7 = false;
int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
uint64_t *hashes_done )
{
@@ -80,6 +81,11 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
const uint32_t first_nonce = n + 1;
const uint32_t Htarg = ptarget[7];
uint32_t hash[32 / 4] __attribute__((aligned(32)));
// if ( ( cryptonightV7 && ( *(uint8_t*)pdata < 7 ) )
// || ( !cryptonightV7 && ( *(uint8_t*)pdata == 7 ) ) )
// applog(LOG_WARNING,"Cryptonight variant mismatch, shares may be rejected.");
do
{
*nonceptr = ++n;
@@ -87,6 +93,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
if (unlikely( hash[7] < Htarg ))
{
*hashes_done = n - first_nonce + 1;
// work_set_target_ratio( work, hash );
return true;
}
} while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
@@ -97,6 +104,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
bool register_cryptonight_algo( algo_gate_t* gate )
{
cryptonightV7 = false;
register_json_rpc2( gate );
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_cryptonight;
@@ -106,3 +114,15 @@ bool register_cryptonight_algo( algo_gate_t* gate )
return true;
};
bool register_cryptonightv7_algo( algo_gate_t* gate )
{
cryptonightV7 = true;
register_json_rpc2( gate );
gate->optimizations = SSE2_OPT | AES_OPT;
gate->scanhash = (void*)&scanhash_cryptonight;
gate->hash = (void*)&cryptonight_hash;
gate->hash_suw = (void*)&cryptonight_hash_suw;
gate->get_max64 = (void*)&get_max64_0x40LL;
return true;
};


@@ -20,8 +20,8 @@
#include "crypto/c_jh.h"
#include "crypto/c_skein.h"
#include "crypto/int-util.h"
#include "crypto/hash-ops.h"
//#include "cryptonight.h"
//#include "crypto/hash-ops.h"
#include "cryptonight.h"
#if USE_INT128
@@ -51,6 +51,7 @@ typedef __uint128_t uint128_t;
#define INIT_SIZE_BLK 8
#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)
/*
#pragma pack(push, 1)
union cn_slow_hash_state {
union hash_state hs;
@@ -78,6 +79,7 @@ static void do_skein_hash(const void* input, size_t len, char* output) {
int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
assert(likely(SKEIN_SUCCESS == r));
}
*/
extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
@@ -120,9 +122,11 @@ static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* pro
extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
#endif
/*
static void (* const extra_hashes[4])(const void *, size_t, char *) = {
do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
};
*/
static inline size_t e2i(const uint8_t* a) {
#if !LITE
@@ -132,14 +136,16 @@ static inline size_t e2i(const uint8_t* a) {
#endif
}
static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static inline void mul_sum_xor_dst( const uint8_t* a, uint8_t* c, uint8_t* dst,
const uint64_t tweak )
{
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = cryptonightV7 ? lo ^ tweak : lo;
}
static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
@@ -174,8 +180,16 @@ static __thread cryptonight_ctx ctx;
void cryptonight_hash_ctx(void* output, const void* input, int len)
{
hash_process(&ctx.state.hs, (const uint8_t*) input, len);
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
// hash_process(&ctx.state.hs, (const uint8_t*) input, len);
keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
if ( cryptonightV7 && len < 43 )
return;
const uint64_t tweak = cryptonightV7
? *((const uint64_t*) (((const uint8_t*)input) + 35))
^ ctx.state.hs.w[24] : 0;
ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
__builtin_prefetch( ctx.text, 0, 3 );
__builtin_prefetch( ctx.text + 64, 0, 3 );
@@ -211,23 +225,44 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
xor_blocks_dst(&ctx.state.k[0], &ctx.state.k[32], ctx.a);
xor_blocks_dst(&ctx.state.k[16], &ctx.state.k[48], ctx.b);
for (i = 0; likely(i < ITER / 4); ++i) {
/* Dependency chain: address -> read value ------+
* written value <-+ hard function (AES or MUL) <+
* next address <-+
*/
/* Iteration 1 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
/* Iteration 2 */
mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)]);
/* Iteration 3 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
/* Iteration 4 */
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)]);
for (i = 0; likely(i < ITER / 4); ++i)
{
/* Dependency chain: address -> read value ------+
* written value <-+ hard function (AES or MUL) <+
* next address <-+
*/
/* Iteration 1 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
if ( cryptonightV7 )
{
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
const uint8_t tmp = lsa[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
/* Iteration 2 */
mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)], tweak );
/* Iteration 3 */
j = e2i(ctx.a);
aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
if ( cryptonightV7 )
{
uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
const uint8_t tmp = lsa[11];
const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
}
/* Iteration 4 */
mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)], tweak );
}
__builtin_prefetch( ctx.text, 0, 3 );
@@ -266,7 +301,8 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
aesb_pseudo_round_mut(&ctx.text[7 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
}
memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
hash_permutation(&ctx.state.hs);
// hash_permutation(&ctx.state.hs);
keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
/*memcpy(hash, &state, 32);*/
extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
oaes_free((OAES_CTX **) &ctx.aes_ctx);


@@ -45,5 +45,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
void cryptonight_hash_aes( void *restrict output, const void *input, int len );
extern bool cryptonightV7;
#endif


@@ -67,7 +67,7 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

@@ -67,7 +67,7 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( opt_benchmark )
ptarget[7] = 0x0000ff;
for ( int i=0; i < 19; i++ )
for ( int i=0; i < 20; i++ )
be32enc( &edata[i], pdata[i] );
mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
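The 19 → 20 change matters because an 80-byte block header is 20 big-endian 32-bit words, so the old loop left the final word unencoded. A standalone sketch of the full encode, with be32enc reimplemented here for illustration:

```c
#include <stdint.h>

/* Write x to p in big-endian byte order (stand-in for be32enc). */
static void be32enc_word( void *p, uint32_t x )
{
   uint8_t *b = (uint8_t*)p;
   b[0] = (uint8_t)(x >> 24); b[1] = (uint8_t)(x >> 16);
   b[2] = (uint8_t)(x >>  8); b[3] = (uint8_t) x;
}

/* Encode all 20 words of an 80-byte block header, as the fixed loops do. */
static void encode_header( uint32_t *edata, const uint32_t *pdata )
{
   for ( int i = 0; i < 20; i++ )
      be32enc_word( &edata[i], pdata[i] );
}
```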

@@ -60,7 +60,7 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
0xFFFF0000,
0 };
for ( int k = 0; k < 19; k++ )
for ( int k = 0; k < 20; k++ )
be32enc( &edata[k], pdata[k] );
mm256_interleave_8x32( vdata, edata, edata, edata, edata,

@@ -31,11 +31,13 @@ int nextPerm( uint8_t n[], uint32_t count )
return 0;
for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
tail = i;
if ( tail > 0 )
{
for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
evo_swap( &n[tail - 1], &n[j] );
}
for ( i = tail, j = count - 1; i<j; i++, j-- )
evo_swap( &n[i], &n[j] );

@@ -68,13 +68,7 @@ void x16r_4way_hash( void* output, const void* input )
uint32_t hash2[24] __attribute__ ((aligned (64)));
uint32_t hash3[24] __attribute__ ((aligned (64)));
uint32_t vhash[24*4] __attribute__ ((aligned (64)));
// uint32_t inp0[24] __attribute__ ((aligned (64)));
// uint32_t inp1[24] __attribute__ ((aligned (64)));
// uint32_t inp2[24] __attribute__ ((aligned (64)));
// uint32_t inp3[24] __attribute__ ((aligned (64)));
x16r_4way_ctx_holder ctx;
void *in0 = (void*) hash0;
void *in1 = (void*) hash1;
void *in2 = (void*) hash2;
@@ -290,10 +284,6 @@ void x16r_4way_hash( void* output, const void* input )
mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
break;
}
// in0 = (void*) hash0;
// in1 = (void*) hash1;
// in2 = (void*) hash2;
// in3 = (void*) hash3;
size = 64;
}
memcpy( output, hash0, 32 );
@@ -325,7 +315,7 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
if ( s_ntime != endiandata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );

@@ -1,6 +1,6 @@
#include "x16r-gate.h"
void x16r_getAlgoString( const char* prevblock, char *output )
void x16r_getAlgoString( const uint8_t* prevblock, char *output )
{
char *sptr = output;
for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ )
@@ -16,14 +16,13 @@ void x16r_getAlgoString( const char* prevblock, char *output )
*sptr = '\0';
}
void x16s_getAlgoString( const char* prevblock, char *output )
void x16s_getAlgoString( const uint8_t* prevblock, char *output )
{
uint8_t* data = (uint8_t*)prevblock;
strcpy( output, "0123456789ABCDEF" );
for ( int i = 0; i < 16; i++ )
{
uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (i & 1) ? data[b] & 0xF : data[b] >> 4;
uint8_t algoDigit = (i & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
int offset = algoDigit;
// insert the nth character at the front
char oldVal = output[offset];

@@ -29,9 +29,9 @@ enum x16r_Algo {
X16R_HASH_FUNC_COUNT
};
bool (*x16_r_s_getAlgoString) ( const char*, char* );
void x16r_getAlgoString( const char* prevblock, char *output );
void x16s_getAlgoString( const char* prevblock, char *output );
void (*x16_r_s_getAlgoString) ( const uint8_t*, char* );
void x16r_getAlgoString( const uint8_t* prevblock, char *output );
void x16s_getAlgoString( const uint8_t* prevblock, char *output );
bool register_x16r_algo( algo_gate_t* gate );
bool register_x16s_algo( algo_gate_t* gate );

@@ -205,7 +205,7 @@ int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
if ( s_ntime != pdata[17] )
{
uint32_t ntime = swab32(pdata[17]);
x16_r_s_getAlgoString( (const char*) (&endiandata[1]), hashOrder );
x16_r_s_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
s_ntime = ntime;
if ( opt_debug && !thr_id )
applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );

avxdefs.h
@@ -1,5 +1,5 @@
#ifndef AVXDEFS_H__
#define AVXDEFS_H__
#define AVXDEFS_H__ 1
// Some tools to help using SIMD vectors.
//
@@ -156,7 +156,7 @@ static inline __m128i foo()
// _mm_setzero_si128 uses the pxor instruction; it's unclear what _mm_set_epi does.
// If a pseudo constant is used repeatedly in a function it may be worthwhile
// to define a register variable to represent that constant.
// register __m128i zero = mm_zero;
// register __m128i zero = mm_setzero_si128();
// Constant zero
#define m128_zero _mm_setzero_si128()
@@ -1018,7 +1018,8 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
//////////////////////////////////////////////////////////////
#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VBMI__)
#if defined(__AVX512F__)
//#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VBMI__)
// Experimental, not tested.
@@ -1034,7 +1035,12 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
//
// Pseudo constants.
#define m512_zero _mm512_setzero_si512()
// _mm512_setzero_si512 uses xor instruction. If needed frequently
// in a function it's better to define a register variable (const?)
// initialized to zero.
// It isn't clear to me yet how set or set1 work.
#define m512_zero _mm512_setzero_si512()
#define m512_one_512 _mm512_set_epi64x( 0ULL, 0ULL, 0ULL, 0ULL, \
0ULL, 0ULL, 0ULL, 1ULL )
#define m512_one_256 _mm512_set4_epi64x( 0ULL, 0ULL, 0ULL, 1ULL )
@@ -1058,6 +1064,21 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
//
// Pointer casting
// p = any aligned pointer
// i = scaled array index
// o = scaled address offset
// returns p as pointer to vector
#define castp_m512i(p) ((__m512i*)(p))
// returns *p as vector value
#define cast_m512i(p) (*((__m512i*)(p)))
// returns p[i] as vector value
#define casti_m512i(p,i) (((__m512i*)(p))[(i)])
// returns p+o as pointer to vector
#define casto_m512i(p,o) (((__m512i*)(p))+(o))
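The same casting idiom works at any vector width; a minimal sketch with a plain two-word struct standing in for __m512i (type and macro names are illustrative):

```c
#include <stdint.h>

typedef struct { uint64_t lo, hi; } vec128_t;    /* stand-in for a SIMD type */

#define casti_vec128(p,i) (((vec128_t*)(p))[(i)])  /* p[i] as vector value   */
#define casto_vec128(p,o) (((vec128_t*)(p))+(o))   /* p+o as vector pointer  */

/* With 4 x 64-bit words viewed as 2 x vec128_t, index 1 selects words 2..3. */
static uint64_t low_word_of_second_vector( uint64_t *buf )
{
    return casti_vec128( buf, 1 ).lo;
}
```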
//
// Memory functions
@@ -1237,746 +1258,4 @@ inline __m256i mm256_aesenc_nokey_2x128_obs( __m256i x )
#endif // AVX512F
// Paired functions for interleaving and deinterleaving data for vector
// processing.
// Size is specified in bits regardless of vector size to avoid pointer
// arithmetic confusion with different size vectors and be consistent with
// the function's name.
//
// Each function has 2 implementations, an optimized version that uses
// vector indexing and a slower version that uses pointers. The optimized
// version can only be used with 64 bit elements and only supports sizes
// of 256, 512 or 640 bits, 32, 64, and 80 bytes respectively.
//
// NOTE: Contrary to GCC documentation, accessing vector elements using array
// indexes only works with 64 bit elements.
// Interleaving and deinterleaving of vectors of 32 bit elements
// must use the slower implementations that don't use vector indexing.
//
// All data must be aligned to 256 bits for AVX2, or 128 bits for AVX.
// Interleave source args and deinterleave destination args are not required
// to be contiguous in memory but it's more efficient if they are.
// Interleave source args may be the same actual arg repeated.
// 640 bit deinterleaving 4x64 using 256 bit AVX2 requires the
// destination buffers be defined with padding up to 768 bits for overrun
// space. Although overrun space use is non-destructive, it should not overlay
// useful data and should be ignored by the caller.
// SSE2 AVX
// interleave 4 arrays of 32 bit elements for 128 bit processing
// bit_len must be 256, 512 or 640 bits.
static inline void mm_interleave_4x32( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, int bit_len )
{
uint32_t *s0 = (uint32_t*)src0;
uint32_t *s1 = (uint32_t*)src1;
uint32_t *s2 = (uint32_t*)src2;
uint32_t *s3 = (uint32_t*)src3;
__m128i* d = (__m128i*)dst;
d[0] = _mm_set_epi32( s3[ 0], s2[ 0], s1[ 0], s0[ 0] );
d[1] = _mm_set_epi32( s3[ 1], s2[ 1], s1[ 1], s0[ 1] );
d[2] = _mm_set_epi32( s3[ 2], s2[ 2], s1[ 2], s0[ 2] );
d[3] = _mm_set_epi32( s3[ 3], s2[ 3], s1[ 3], s0[ 3] );
d[4] = _mm_set_epi32( s3[ 4], s2[ 4], s1[ 4], s0[ 4] );
d[5] = _mm_set_epi32( s3[ 5], s2[ 5], s1[ 5], s0[ 5] );
d[6] = _mm_set_epi32( s3[ 6], s2[ 6], s1[ 6], s0[ 6] );
d[7] = _mm_set_epi32( s3[ 7], s2[ 7], s1[ 7], s0[ 7] );
if ( bit_len <= 256 ) return;
d[ 8] = _mm_set_epi32( s3[ 8], s2[ 8], s1[ 8], s0[ 8] );
d[ 9] = _mm_set_epi32( s3[ 9], s2[ 9], s1[ 9], s0[ 9] );
d[10] = _mm_set_epi32( s3[10], s2[10], s1[10], s0[10] );
d[11] = _mm_set_epi32( s3[11], s2[11], s1[11], s0[11] );
d[12] = _mm_set_epi32( s3[12], s2[12], s1[12], s0[12] );
d[13] = _mm_set_epi32( s3[13], s2[13], s1[13], s0[13] );
d[14] = _mm_set_epi32( s3[14], s2[14], s1[14], s0[14] );
d[15] = _mm_set_epi32( s3[15], s2[15], s1[15], s0[15] );
if ( bit_len <= 512 ) return;
d[16] = _mm_set_epi32( s3[16], s2[16], s1[16], s0[16] );
d[17] = _mm_set_epi32( s3[17], s2[17], s1[17], s0[17] );
d[18] = _mm_set_epi32( s3[18], s2[18], s1[18], s0[18] );
d[19] = _mm_set_epi32( s3[19], s2[19], s1[19], s0[19] );
if ( bit_len <= 640 ) return;
d[20] = _mm_set_epi32( s3[20], s2[20], s1[20], s0[20] );
d[21] = _mm_set_epi32( s3[21], s2[21], s1[21], s0[21] );
d[22] = _mm_set_epi32( s3[22], s2[22], s1[22], s0[22] );
d[23] = _mm_set_epi32( s3[23], s2[23], s1[23], s0[23] );
d[24] = _mm_set_epi32( s3[24], s2[24], s1[24], s0[24] );
d[25] = _mm_set_epi32( s3[25], s2[25], s1[25], s0[25] );
d[26] = _mm_set_epi32( s3[26], s2[26], s1[26], s0[26] );
d[27] = _mm_set_epi32( s3[27], s2[27], s1[27], s0[27] );
d[28] = _mm_set_epi32( s3[28], s2[28], s1[28], s0[28] );
d[29] = _mm_set_epi32( s3[29], s2[29], s1[29], s0[29] );
d[30] = _mm_set_epi32( s3[30], s2[30], s1[30], s0[30] );
d[31] = _mm_set_epi32( s3[31], s2[31], s1[31], s0[31] );
// bit_len == 1024
}
// bit_len must be multiple of 32
static inline void mm_interleave_4x32x( void *dst, void *src0, void *src1,
void *src2, void *src3, int bit_len )
{
uint32_t *d = (uint32_t*)dst;
uint32_t *s0 = (uint32_t*)src0;
uint32_t *s1 = (uint32_t*)src1;
uint32_t *s2 = (uint32_t*)src2;
uint32_t *s3 = (uint32_t*)src3;
for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
{
*d = *(s0+i);
*(d+1) = *(s1+i);
*(d+2) = *(s2+i);
*(d+3) = *(s3+i);
}
}
static inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint32_t *s = (uint32_t*)src;
__m128i* d0 = (__m128i*)dst0;
__m128i* d1 = (__m128i*)dst1;
__m128i* d2 = (__m128i*)dst2;
__m128i* d3 = (__m128i*)dst3;
d0[0] = _mm_set_epi32( s[12], s[ 8], s[ 4], s[ 0] );
d1[0] = _mm_set_epi32( s[13], s[ 9], s[ 5], s[ 1] );
d2[0] = _mm_set_epi32( s[14], s[10], s[ 6], s[ 2] );
d3[0] = _mm_set_epi32( s[15], s[11], s[ 7], s[ 3] );
d0[1] = _mm_set_epi32( s[28], s[24], s[20], s[16] );
d1[1] = _mm_set_epi32( s[29], s[25], s[21], s[17] );
d2[1] = _mm_set_epi32( s[30], s[26], s[22], s[18] );
d3[1] = _mm_set_epi32( s[31], s[27], s[23], s[19] );
if ( bit_len <= 256 ) return;
d0[2] = _mm_set_epi32( s[44], s[40], s[36], s[32] );
d1[2] = _mm_set_epi32( s[45], s[41], s[37], s[33] );
d2[2] = _mm_set_epi32( s[46], s[42], s[38], s[34] );
d3[2] = _mm_set_epi32( s[47], s[43], s[39], s[35] );
d0[3] = _mm_set_epi32( s[60], s[56], s[52], s[48] );
d1[3] = _mm_set_epi32( s[61], s[57], s[53], s[49] );
d2[3] = _mm_set_epi32( s[62], s[58], s[54], s[50] );
d3[3] = _mm_set_epi32( s[63], s[59], s[55], s[51] );
if ( bit_len <= 512 ) return;
d0[4] = _mm_set_epi32( s[76], s[72], s[68], s[64] );
d1[4] = _mm_set_epi32( s[77], s[73], s[69], s[65] );
d2[4] = _mm_set_epi32( s[78], s[74], s[70], s[66] );
d3[4] = _mm_set_epi32( s[79], s[75], s[71], s[67] );
if ( bit_len <= 640 ) return;
d0[5] = _mm_set_epi32( s[92], s[88], s[84], s[80] );
d1[5] = _mm_set_epi32( s[93], s[89], s[85], s[81] );
d2[5] = _mm_set_epi32( s[94], s[90], s[86], s[82] );
d3[5] = _mm_set_epi32( s[95], s[91], s[87], s[83] );
d0[6] = _mm_set_epi32( s[108], s[104], s[100], s[ 96] );
d1[6] = _mm_set_epi32( s[109], s[105], s[101], s[ 97] );
d2[6] = _mm_set_epi32( s[110], s[106], s[102], s[ 98] );
d3[6] = _mm_set_epi32( s[111], s[107], s[103], s[ 99] );
d0[7] = _mm_set_epi32( s[124], s[120], s[116], s[112] );
d1[7] = _mm_set_epi32( s[125], s[121], s[117], s[113] );
d2[7] = _mm_set_epi32( s[126], s[122], s[118], s[114] );
d3[7] = _mm_set_epi32( s[127], s[123], s[119], s[115] );
// bit_len == 1024
}
// deinterleave 4 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
static inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
uint32_t *s = (uint32_t*)src;
uint32_t *d0 = (uint32_t*)dst0;
uint32_t *d1 = (uint32_t*)dst1;
uint32_t *d2 = (uint32_t*)dst2;
uint32_t *d3 = (uint32_t*)dst3;
for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
{
*(d0+i) = *s;
*(d1+i) = *(s+1);
*(d2+i) = *(s+2);
*(d3+i) = *(s+3);
}
}
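The scalar paths above are easy to sanity-check in isolation. A self-contained round trip of the 4x32 layout, with the loops reimplemented here so the sketch compiles without the rest of the header:

```c
#include <stdint.h>

/* Scalar 4-way 32-bit interleave: source words are stored round-robin,
   one word per lane, matching the layout of mm_interleave_4x32x above. */
static void interleave_4x32( uint32_t *d, const uint32_t *s0,
   const uint32_t *s1, const uint32_t *s2, const uint32_t *s3, int bit_len )
{
   for ( int i = 0; i < bit_len >> 5; i++, d += 4 )
   { d[0] = s0[i]; d[1] = s1[i]; d[2] = s2[i]; d[3] = s3[i]; }
}

/* Inverse: gather every 4th word back into its lane buffer. */
static void deinterleave_4x32( uint32_t *d0, uint32_t *d1, uint32_t *d2,
   uint32_t *d3, const uint32_t *s, int bit_len )
{
   for ( int i = 0; i < bit_len >> 5; i++, s += 4 )
   { d0[i] = s[0]; d1[i] = s[1]; d2[i] = s[2]; d3[i] = s[3]; }
}
```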
#if defined (__AVX2__)
// Interleave 4 source buffers containing 64 bit data into the destination
// buffer. Only bit_len 256, 512, 640 & 1024 are supported.
static inline void mm256_interleave_4x64( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
uint64_t *s2 = (uint64_t*)src2;
uint64_t *s3 = (uint64_t*)src3;
d[0] = _mm256_set_epi64x( s3[0], s2[0], s1[0], s0[0] );
d[1] = _mm256_set_epi64x( s3[1], s2[1], s1[1], s0[1] );
d[2] = _mm256_set_epi64x( s3[2], s2[2], s1[2], s0[2] );
d[3] = _mm256_set_epi64x( s3[3], s2[3], s1[3], s0[3] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi64x( s3[4], s2[4], s1[4], s0[4] );
d[5] = _mm256_set_epi64x( s3[5], s2[5], s1[5], s0[5] );
d[6] = _mm256_set_epi64x( s3[6], s2[6], s1[6], s0[6] );
d[7] = _mm256_set_epi64x( s3[7], s2[7], s1[7], s0[7] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi64x( s3[8], s2[8], s1[8], s0[8] );
d[9] = _mm256_set_epi64x( s3[9], s2[9], s1[9], s0[9] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi64x( s3[10], s2[10], s1[10], s0[10] );
d[11] = _mm256_set_epi64x( s3[11], s2[11], s1[11], s0[11] );
d[12] = _mm256_set_epi64x( s3[12], s2[12], s1[12], s0[12] );
d[13] = _mm256_set_epi64x( s3[13], s2[13], s1[13], s0[13] );
d[14] = _mm256_set_epi64x( s3[14], s2[14], s1[14], s0[14] );
d[15] = _mm256_set_epi64x( s3[15], s2[15], s1[15], s0[15] );
// bit_len == 1024
}
// Slower version
// bit_len must be multiple of 64
static inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1,
void *src2, void *src3, int bit_len )
{
uint64_t *d = (uint64_t*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
uint64_t *s2 = (uint64_t*)src2;
uint64_t *s3 = (uint64_t*)src3;
for ( int i = 0; i < bit_len>>6; i++, d += 4 )
{
*d = *(s0+i);
*(d+1) = *(s1+i);
*(d+2) = *(s2+i);
*(d+3) = *(s3+i);
}
}
// Deinterleave 4 buffers of 64 bit data from the source buffer.
// bit_len must be 256, 512, 640 or 1024 bits.
// Requires overrun padding for 640 bit len.
static inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2,
void *dst3, const void *src, int bit_len )
{
__m256i* d0 = (__m256i*)dst0;
__m256i* d1 = (__m256i*)dst1;
__m256i* d2 = (__m256i*)dst2;
__m256i* d3 = (__m256i*)dst3;
uint64_t* s = (uint64_t*)src;
d0[0] = _mm256_set_epi64x( s[12], s[ 8], s[ 4], s[ 0] );
d1[0] = _mm256_set_epi64x( s[13], s[ 9], s[ 5], s[ 1] );
d2[0] = _mm256_set_epi64x( s[14], s[10], s[ 6], s[ 2] );
d3[0] = _mm256_set_epi64x( s[15], s[11], s[ 7], s[ 3] );
if ( bit_len <= 256 ) return;
d0[1] = _mm256_set_epi64x( s[28], s[24], s[20], s[16] );
d1[1] = _mm256_set_epi64x( s[29], s[25], s[21], s[17] );
d2[1] = _mm256_set_epi64x( s[30], s[26], s[22], s[18] );
d3[1] = _mm256_set_epi64x( s[31], s[27], s[23], s[19] );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
// null change to overrun area
d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[36], s[32] );
d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[37], s[33] );
d2[2] = _mm256_set_epi64x( d2[2][3], d2[2][2], s[38], s[34] );
d3[2] = _mm256_set_epi64x( d3[2][3], d3[2][2], s[39], s[35] );
return;
}
d0[2] = _mm256_set_epi64x( s[44], s[40], s[36], s[32] );
d1[2] = _mm256_set_epi64x( s[45], s[41], s[37], s[33] );
d2[2] = _mm256_set_epi64x( s[46], s[42], s[38], s[34] );
d3[2] = _mm256_set_epi64x( s[47], s[43], s[39], s[35] );
d0[3] = _mm256_set_epi64x( s[60], s[56], s[52], s[48] );
d1[3] = _mm256_set_epi64x( s[61], s[57], s[53], s[49] );
d2[3] = _mm256_set_epi64x( s[62], s[58], s[54], s[50] );
d3[3] = _mm256_set_epi64x( s[63], s[59], s[55], s[51] );
// bit_len == 1024
}
// Slower version
// bit_len must be multiple of 64
static inline void mm256_deinterleave_4x64x( void *dst0, void *dst1,
void *dst2, void *dst3, void *src, int bit_len )
{
uint64_t *s = (uint64_t*)src;
uint64_t *d0 = (uint64_t*)dst0;
uint64_t *d1 = (uint64_t*)dst1;
uint64_t *d2 = (uint64_t*)dst2;
uint64_t *d3 = (uint64_t*)dst3;
for ( int i = 0; i < bit_len>>6; i++, s += 4 )
{
*(d0+i) = *s;
*(d1+i) = *(s+1);
*(d2+i) = *(s+2);
*(d3+i) = *(s+3);
}
}
// Interleave 8 source buffers containing 32 bit data into the destination
// vector
static inline void mm256_interleave_8x32( void *dst, const void *src0,
const void *src1, const void *src2, const void *src3, const void *src4,
const void *src5, const void *src6, const void *src7, int bit_len )
{
uint32_t *s0 = (uint32_t*)src0;
uint32_t *s1 = (uint32_t*)src1;
uint32_t *s2 = (uint32_t*)src2;
uint32_t *s3 = (uint32_t*)src3;
uint32_t *s4 = (uint32_t*)src4;
uint32_t *s5 = (uint32_t*)src5;
uint32_t *s6 = (uint32_t*)src6;
uint32_t *s7 = (uint32_t*)src7;
__m256i *d = (__m256i*)dst;
d[ 0] = _mm256_set_epi32( s7[0], s6[0], s5[0], s4[0],
s3[0], s2[0], s1[0], s0[0] );
d[ 1] = _mm256_set_epi32( s7[1], s6[1], s5[1], s4[1],
s3[1], s2[1], s1[1], s0[1] );
d[ 2] = _mm256_set_epi32( s7[2], s6[2], s5[2], s4[2],
s3[2], s2[2], s1[2], s0[2] );
d[ 3] = _mm256_set_epi32( s7[3], s6[3], s5[3], s4[3],
s3[3], s2[3], s1[3], s0[3] );
d[ 4] = _mm256_set_epi32( s7[4], s6[4], s5[4], s4[4],
s3[4], s2[4], s1[4], s0[4] );
d[ 5] = _mm256_set_epi32( s7[5], s6[5], s5[5], s4[5],
s3[5], s2[5], s1[5], s0[5] );
d[ 6] = _mm256_set_epi32( s7[6], s6[6], s5[6], s4[6],
s3[6], s2[6], s1[6], s0[6] );
d[ 7] = _mm256_set_epi32( s7[7], s6[7], s5[7], s4[7],
s3[7], s2[7], s1[7], s0[7] );
if ( bit_len <= 256 ) return;
d[ 8] = _mm256_set_epi32( s7[ 8], s6[ 8], s5[ 8], s4[ 8],
s3[ 8], s2[ 8], s1[ 8], s0[ 8] );
d[ 9] = _mm256_set_epi32( s7[ 9], s6[ 9], s5[ 9], s4[ 9],
s3[ 9], s2[ 9], s1[ 9], s0[ 9] );
d[10] = _mm256_set_epi32( s7[10], s6[10], s5[10], s4[10],
s3[10], s2[10], s1[10], s0[10] );
d[11] = _mm256_set_epi32( s7[11], s6[11], s5[11], s4[11],
s3[11], s2[11], s1[11], s0[11] );
d[12] = _mm256_set_epi32( s7[12], s6[12], s5[12], s4[12],
s3[12], s2[12], s1[12], s0[12] );
d[13] = _mm256_set_epi32( s7[13], s6[13], s5[13], s4[13],
s3[13], s2[13], s1[13], s0[13] );
d[14] = _mm256_set_epi32( s7[14], s6[14], s5[14], s4[14],
s3[14], s2[14], s1[14], s0[14] );
d[15] = _mm256_set_epi32( s7[15], s6[15], s5[15], s4[15],
s3[15], s2[15], s1[15], s0[15] );
if ( bit_len <= 512 ) return;
d[16] = _mm256_set_epi32( s7[16], s6[16], s5[16], s4[16],
s3[16], s2[16], s1[16], s0[16] );
d[17] = _mm256_set_epi32( s7[17], s6[17], s5[17], s4[17],
s3[17], s2[17], s1[17], s0[17] );
d[18] = _mm256_set_epi32( s7[18], s6[18], s5[18], s4[18],
s3[18], s2[18], s1[18], s0[18] );
d[19] = _mm256_set_epi32( s7[19], s6[19], s5[19], s4[19],
s3[19], s2[19], s1[19], s0[19] );
if ( bit_len <= 640 ) return;
d[20] = _mm256_set_epi32( s7[20], s6[20], s5[20], s4[20],
s3[20], s2[20], s1[20], s0[20] );
d[21] = _mm256_set_epi32( s7[21], s6[21], s5[21], s4[21],
s3[21], s2[21], s1[21], s0[21] );
d[22] = _mm256_set_epi32( s7[22], s6[22], s5[22], s4[22],
s3[22], s2[22], s1[22], s0[22] );
d[23] = _mm256_set_epi32( s7[23], s6[23], s5[23], s4[23],
s3[23], s2[23], s1[23], s0[23] );
if ( bit_len <= 768 ) return;
d[24] = _mm256_set_epi32( s7[24], s6[24], s5[24], s4[24],
s3[24], s2[24], s1[24], s0[24] );
d[25] = _mm256_set_epi32( s7[25], s6[25], s5[25], s4[25],
s3[25], s2[25], s1[25], s0[25] );
d[26] = _mm256_set_epi32( s7[26], s6[26], s5[26], s4[26],
s3[26], s2[26], s1[26], s0[26] );
d[27] = _mm256_set_epi32( s7[27], s6[27], s5[27], s4[27],
s3[27], s2[27], s1[27], s0[27] );
d[28] = _mm256_set_epi32( s7[28], s6[28], s5[28], s4[28],
s3[28], s2[28], s1[28], s0[28] );
d[29] = _mm256_set_epi32( s7[29], s6[29], s5[29], s4[29],
s3[29], s2[29], s1[29], s0[29] );
d[30] = _mm256_set_epi32( s7[30], s6[30], s5[30], s4[30],
s3[30], s2[30], s1[30], s0[30] );
d[31] = _mm256_set_epi32( s7[31], s6[31], s5[31], s4[31],
s3[31], s2[31], s1[31], s0[31] );
// bit_len == 1024
}
// Slower but it works with 32 bit data
// bit_len must be multiple of 32
static inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0,
uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4,
uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len )
{
uint32_t *d = dst;
for ( int i = 0; i < bit_len>>5; i++, d += 8 )
{
*d = *(src0+i);
*(d+1) = *(src1+i);
*(d+2) = *(src2+i);
*(d+3) = *(src3+i);
*(d+4) = *(src4+i);
*(d+5) = *(src5+i);
*(d+6) = *(src6+i);
*(d+7) = *(src7+i);
}
}
// Deinterleave 8 buffers of 32 bit data from the source buffer.
static inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2,
void *dst3, void *dst4, void *dst5, void *dst6, void *dst7,
const void *src, int bit_len )
{
uint32_t *s = (uint32_t*)src;
__m256i* d0 = (__m256i*)dst0;
__m256i* d1 = (__m256i*)dst1;
__m256i* d2 = (__m256i*)dst2;
__m256i* d3 = (__m256i*)dst3;
__m256i* d4 = (__m256i*)dst4;
__m256i* d5 = (__m256i*)dst5;
__m256i* d6 = (__m256i*)dst6;
__m256i* d7 = (__m256i*)dst7;
d0[0] = _mm256_set_epi32( s[ 56], s[ 48], s[ 40], s[ 32],
s[ 24], s[ 16], s[ 8], s[ 0] );
d1[0] = _mm256_set_epi32( s[ 57], s[ 49], s[ 41], s[ 33],
s[ 25], s[ 17], s[ 9], s[ 1] );
d2[0] = _mm256_set_epi32( s[ 58], s[ 50], s[ 42], s[ 34],
s[ 26], s[ 18], s[ 10], s[ 2] );
d3[0] = _mm256_set_epi32( s[ 59], s[ 51], s[ 43], s[ 35],
s[ 27], s[ 19], s[ 11], s[ 3] );
d4[0] = _mm256_set_epi32( s[ 60], s[ 52], s[ 44], s[ 36],
s[ 28], s[ 20], s[ 12], s[ 4] );
d5[0] = _mm256_set_epi32( s[ 61], s[ 53], s[ 45], s[ 37],
s[ 29], s[ 21], s[ 13], s[ 5] );
d6[0] = _mm256_set_epi32( s[ 62], s[ 54], s[ 46], s[ 38],
s[ 30], s[ 22], s[ 14], s[ 6] );
d7[0] = _mm256_set_epi32( s[ 63], s[ 55], s[ 47], s[ 39],
s[ 31], s[ 23], s[ 15], s[ 7] );
if ( bit_len <= 256 ) return;
d0[1] = _mm256_set_epi32( s[120], s[112], s[104], s[ 96],
s[ 88], s[ 80], s[ 72], s[ 64] );
d1[1] = _mm256_set_epi32( s[121], s[113], s[105], s[ 97],
s[ 89], s[ 81], s[ 73], s[ 65] );
d2[1] = _mm256_set_epi32( s[122], s[114], s[106], s[ 98],
s[ 90], s[ 82], s[ 74], s[ 66]);
d3[1] = _mm256_set_epi32( s[123], s[115], s[107], s[ 99],
s[ 91], s[ 83], s[ 75], s[ 67] );
d4[1] = _mm256_set_epi32( s[124], s[116], s[108], s[100],
s[ 92], s[ 84], s[ 76], s[ 68] );
d5[1] = _mm256_set_epi32( s[125], s[117], s[109], s[101],
s[ 93], s[ 85], s[ 77], s[ 69] );
d6[1] = _mm256_set_epi32( s[126], s[118], s[110], s[102],
s[ 94], s[ 86], s[ 78], s[ 70] );
d7[1] = _mm256_set_epi32( s[127], s[119], s[111], s[103],
s[ 95], s[ 87], s[ 79], s[ 71] );
if ( bit_len <= 512 ) return;
// null change for overrun space, vector indexing doesn't work for
// 32 bit data
if ( bit_len <= 640 )
{
uint32_t *d = ((uint32_t*)d0) + 8;
d0[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[152], s[144], s[136], s[128] );
d = ((uint32_t*)d1) + 8;
d1[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[153], s[145], s[137], s[129] );
d = ((uint32_t*)d2) + 8;
d2[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[154], s[146], s[138], s[130]);
d = ((uint32_t*)d3) + 8;
d3[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[155], s[147], s[139], s[131] );
d = ((uint32_t*)d4) + 8;
d4[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[156], s[148], s[140], s[132] );
d = ((uint32_t*)d5) + 8;
d5[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[157], s[149], s[141], s[133] );
d = ((uint32_t*)d6) + 8;
d6[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[158], s[150], s[142], s[134] );
d = ((uint32_t*)d7) + 8;
d7[2] = _mm256_set_epi32( *(d+7), *(d+6), *(d+5), *(d+4),
s[159], s[151], s[143], s[135] );
return;
}
d0[2] = _mm256_set_epi32( s[184], s[176], s[168], s[160],
s[152], s[144], s[136], s[128] );
d1[2] = _mm256_set_epi32( s[185], s[177], s[169], s[161],
s[153], s[145], s[137], s[129] );
d2[2] = _mm256_set_epi32( s[186], s[178], s[170], s[162],
s[154], s[146], s[138], s[130] );
d3[2] = _mm256_set_epi32( s[187], s[179], s[171], s[163],
s[155], s[147], s[139], s[131] );
d4[2] = _mm256_set_epi32( s[188], s[180], s[172], s[164],
s[156], s[148], s[140], s[132] );
d5[2] = _mm256_set_epi32( s[189], s[181], s[173], s[165],
s[157], s[149], s[141], s[133] );
d6[2] = _mm256_set_epi32( s[190], s[182], s[174], s[166],
s[158], s[150], s[142], s[134] );
d7[2] = _mm256_set_epi32( s[191], s[183], s[175], s[167],
s[159], s[151], s[143], s[135] );
if ( bit_len <= 768 ) return;
d0[3] = _mm256_set_epi32( s[248], s[240], s[232], s[224],
s[216], s[208], s[200], s[192] );
d1[3] = _mm256_set_epi32( s[249], s[241], s[233], s[225],
s[217], s[209], s[201], s[193] );
d2[3] = _mm256_set_epi32( s[250], s[242], s[234], s[226],
s[218], s[210], s[202], s[194] );
d3[3] = _mm256_set_epi32( s[251], s[243], s[235], s[227],
s[219], s[211], s[203], s[195] );
d4[3] = _mm256_set_epi32( s[252], s[244], s[236], s[228],
s[220], s[212], s[204], s[196] );
d5[3] = _mm256_set_epi32( s[253], s[245], s[237], s[229],
s[221], s[213], s[205], s[197] );
d6[3] = _mm256_set_epi32( s[254], s[246], s[238], s[230],
s[222], s[214], s[206], s[198] );
d7[3] = _mm256_set_epi32( s[255], s[247], s[239], s[231],
s[223], s[215], s[207], s[199] );
// bit_len == 1024
}
// Deinterleave 8 arrays into individual buffers for scalar processing
// bit_len must be multiple of 32
static inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1,
uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5,
uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len )
{
uint32_t *s = src;
for ( int i = 0; i < bit_len>>5; i++, s += 8 )
{
*(dst0+i) = *( s );
*(dst1+i) = *( s + 1 );
*(dst2+i) = *( s + 2 );
*(dst3+i) = *( s + 3 );
*(dst4+i) = *( s + 4 );
*(dst5+i) = *( s + 5 );
*(dst6+i) = *( s + 6 );
*(dst7+i) = *( s + 7 );
}
}
// Convert from 4x32 AVX interleaving to 4x64 AVX2.
// Can't do it in place
static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len )
{
__m256i* d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[7], s[3], s[6], s[2], s[5], s[1], s[4], s[0] );
d[1] = _mm256_set_epi32( s[15],s[11],s[14],s[10],s[13],s[9],s[12], s[8] );
d[2] = _mm256_set_epi32( s[23],s[19],s[22],s[18],s[21],s[17],s[20],s[16] );
d[3] = _mm256_set_epi32( s[31],s[27],s[30],s[26],s[29],s[25],s[28],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[35],s[38],s[34],s[37],s[33],s[36],s[32] );
d[5] = _mm256_set_epi32( s[47],s[43],s[46],s[42],s[45],s[41],s[44],s[40] );
d[6] = _mm256_set_epi32( s[55],s[51],s[54],s[50],s[53],s[49],s[52],s[48] );
d[7] = _mm256_set_epi32( s[63],s[59],s[62],s[58],s[61],s[57],s[60],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[67],s[70],s[66],s[69],s[65],s[68],s[64] );
d[9] = _mm256_set_epi32( s[79],s[75],s[78],s[74],s[77],s[73],s[76],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32(s[87],s[83],s[86],s[82],s[85],s[81],s[84],s[80]);
d[11] = _mm256_set_epi32(s[95],s[91],s[94],s[90],s[93],s[89],s[92],s[88]);
d[12] = _mm256_set_epi32(s[103],s[99],s[102],s[98],s[101],s[97],s[100],s[96]);
d[13] = _mm256_set_epi32(s[111],s[107],s[110],s[106],s[109],s[105],s[108],s[104]);
d[14] = _mm256_set_epi32(s[119],s[115],s[118],s[114],s[117],s[113],s[116],s[112]);
d[15] = _mm256_set_epi32(s[127],s[123],s[126],s[122],s[125],s[121],s[124],s[120]);
// bit_len == 1024
}
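The remap done by mm256_reinterleave_4x64 can be modelled per 8-word group in plain C; this sketch encodes the same index mapping (d = s0, s4, s1, s5, s2, s6, s3, s7 within each group) so the layout can be checked without AVX2:

```c
#include <stdint.h>

/* Scalar model of the 4x32 -> 4x64 reinterleave: within each group of 8
   words, the two 32-bit halves of each lane's 64-bit element become
   adjacent, i.e. d = { s0, s4, s1, s5, s2, s6, s3, s7 }. */
static void reinterleave_4x32_to_4x64( uint32_t *d, const uint32_t *s,
                                       int bit_len )
{
   for ( int i = 0; i < bit_len >> 5; i += 8 )
   {
      d[i  ] = s[i  ];  d[i+1] = s[i+4];
      d[i+2] = s[i+1];  d[i+3] = s[i+5];
      d[i+4] = s[i+2];  d[i+5] = s[i+6];
      d[i+6] = s[i+3];  d[i+7] = s[i+7];
   }
}
```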
// likely of no use.
// convert 4x32 (128 bit) vectors to 4x64 (256 bit) vectors for AVX2
// bit_len must be multiple of 64
// broken
static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src,
int bit_len )
{
uint32_t *d = (uint32_t*)dst;
uint32_t *s = (uint32_t*)src;
for ( int i = 0; i < bit_len >> 5; i += 8 )
{
*( d + i ) = *( s + i ); // 0 <- 0 8 <- 8
*( d + i + 1 ) = *( s + i + 4 ); // 1 <- 4 9 <- 12
*( d + i + 2 ) = *( s + i + 1 ); // 2 <- 1 10 <- 9
*( d + i + 3 ) = *( s + i + 5 ); // 3 <- 5 11 <- 13
*( d + i + 4 ) = *( s + i + 2 ); // 4 <- 2 12 <- 10
*( d + i + 5 ) = *( s + i + 6 ); // 5 <- 6 13 <- 14
*( d + i + 6 ) = *( s + i + 3 ); // 6 <- 3 14 <- 11
*( d + i + 7 ) = *( s + i + 7 ); // 7 <- 7 15 <- 15
}
}
// Convert 4x64 (256 bit) vectors to 4x32 (128 bit) vectors for AVX
// bit_len must be multiple of 64
static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len )
{
__m256i *d = (__m256i*)dst;
uint32_t *s = (uint32_t*)src;
d[0] = _mm256_set_epi32( s[ 7],s[ 5],s[ 3],s[ 1],s[ 6],s[ 4],s[ 2],s[ 0] );
d[1] = _mm256_set_epi32( s[15],s[13],s[11],s[ 9],s[14],s[12],s[10],s[ 8] );
d[2] = _mm256_set_epi32( s[23],s[21],s[19],s[17],s[22],s[20],s[18],s[16] );
d[3] = _mm256_set_epi32( s[31],s[29],s[27],s[25],s[30],s[28],s[26],s[24] );
if ( bit_len <= 256 ) return;
d[4] = _mm256_set_epi32( s[39],s[37],s[35],s[33],s[38],s[36],s[34],s[32] );
d[5] = _mm256_set_epi32( s[47],s[45],s[43],s[41],s[46],s[44],s[42],s[40] );
d[6] = _mm256_set_epi32( s[55],s[53],s[51],s[49],s[54],s[52],s[50],s[48] );
d[7] = _mm256_set_epi32( s[63],s[61],s[59],s[57],s[62],s[60],s[58],s[56] );
if ( bit_len <= 512 ) return;
d[8] = _mm256_set_epi32( s[71],s[69],s[67],s[65],s[70],s[68],s[66],s[64] );
d[9] = _mm256_set_epi32( s[79],s[77],s[75],s[73],s[78],s[76],s[74],s[72] );
if ( bit_len <= 640 ) return;
d[10] = _mm256_set_epi32( s[87],s[85],s[83],s[81],s[86],s[84],s[82],s[80] );
d[11] = _mm256_set_epi32( s[95],s[93],s[91],s[89],s[94],s[92],s[90],s[88] );
d[12] = _mm256_set_epi32( s[103],s[101],s[99],s[97],s[102],s[100],s[98],s[96] );
d[13] = _mm256_set_epi32( s[111],s[109],s[107],s[105],s[110],s[108],s[106],s[104] );
d[14] = _mm256_set_epi32( s[119],s[117],s[115],s[113],s[118],s[116],s[114],s[112] );
d[15] = _mm256_set_epi32( s[127],s[125],s[123],s[121],s[126],s[124],s[122],s[120] );
// bit_len == 1024
}
static inline void mm256_interleave_2x128( void *dst, void *src0, void *src1,
int bit_len )
{
__m256i *d = (__m256i*)dst;
uint64_t *s0 = (uint64_t*)src0;
uint64_t *s1 = (uint64_t*)src1;
d[0] = _mm256_set_epi64x( s1[ 1], s1[ 0], s0[ 1], s0[ 0] );
d[1] = _mm256_set_epi64x( s1[ 3], s1[ 2], s0[ 3], s0[ 2] );
if ( bit_len <= 256 ) return;
d[2] = _mm256_set_epi64x( s1[ 5], s1[ 4], s0[ 5], s0[ 4] );
d[3] = _mm256_set_epi64x( s1[ 7], s1[ 6], s0[ 7], s0[ 6] );
if ( bit_len <= 512 ) return;
d[4] = _mm256_set_epi64x( s1[ 9], s1[ 8], s0[ 9], s0[ 8] );
if ( bit_len <= 640 ) return;
d[5] = _mm256_set_epi64x( s1[11], s1[10], s0[11], s0[10] );
d[6] = _mm256_set_epi64x( s1[13], s1[12], s0[13], s0[12] );
d[7] = _mm256_set_epi64x( s1[15], s1[14], s0[15], s0[14] );
// bit_len == 1024
}
static inline void mm256_deinterleave_2x128( void *dst0, void *dst1, void *src,
int bit_len )
{
uint64_t *s = (uint64_t*)src;
__m256i *d0 = (__m256i*)dst0;
__m256i *d1 = (__m256i*)dst1;
d0[0] = _mm256_set_epi64x( s[ 5], s[4], s[ 1], s[ 0] );
d1[0] = _mm256_set_epi64x( s[ 7], s[6], s[ 3], s[ 2] );
if ( bit_len <= 256 ) return;
d0[1] = _mm256_set_epi64x( s[13], s[12], s[ 9], s[ 8] );
d1[1] = _mm256_set_epi64x( s[15], s[14], s[11], s[10] );
if ( bit_len <= 512 ) return;
if ( bit_len <= 640 )
{
d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[17], s[16] );
d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[19], s[18] );
return;
}
d0[2] = _mm256_set_epi64x( s[21], s[20], s[17], s[16] );
d1[2] = _mm256_set_epi64x( s[23], s[22], s[19], s[18] );
d0[3] = _mm256_set_epi64x( s[29], s[28], s[25], s[24] );
d1[3] = _mm256_set_epi64x( s[31], s[30], s[27], s[26] );
// bit_len == 1024
}
// not used
static inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len )
{
uint32_t *d = (uint32_t*)dst;
uint32_t *s = (uint32_t*)src;
for ( int i = 0; i < bit_len >> 5; i +=8 )
{
*( d + i ) = *( s + i );
*( d + i + 1 ) = *( s + i + 2 );
*( d + i + 2 ) = *( s + i + 4 );
*( d + i + 3 ) = *( s + i + 6 );
*( d + i + 4 ) = *( s + i + 1 );
*( d + i + 5 ) = *( s + i + 3 );
*( d + i + 6 ) = *( s + i + 5 );
*( d + i + 7 ) = *( s + i + 7 );
}
}
#endif // __AVX2__
#endif // AVXDEFS_H__


@@ -3,10 +3,30 @@
make distclean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -msha -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-avx2-sha.exe
strip -s cpuminer
mv cpuminer cpuminer-avx2-sha
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=skylake-avx512 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-avx512.exe
strip -s cpuminer
mv cpuminer cpuminer-avx512
make clean || echo clean
rm -f config.status
./autogen.sh || echo done
CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl
make -j 4
strip -s cpuminer.exe
mv cpuminer.exe cpuminer-aes-avx2.exe
mv cpuminer.exe cpuminer-avx2.exe
strip -s cpuminer
mv cpuminer cpuminer-avx2
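The script above now emits one binary per instruction set (`cpuminer-avx2-sha`, `cpuminer-avx512`, `cpuminer-avx2`). A hedged sketch of how a launcher could pick the most capable build from the CPU's flag list — `pick_miner` is a hypothetical helper, not part of the repo; flag names follow Linux `/proc/cpuinfo` conventions:

```shell
#!/bin/sh
# Hypothetical dispatcher: echo the most capable build the CPU supports,
# given the space-separated flags string from /proc/cpuinfo.
pick_miner() {
    flags=" $1 "
    case $flags in *" avx512f "*) echo cpuminer-avx512; return ;; esac
    case $flags in
        *" avx2 "*)
            # sha_ni is the cpuinfo name for the SHA extensions (-msha build)
            case $flags in *" sha_ni "*) echo cpuminer-avx2-sha; return ;; esac
            echo cpuminer-avx2; return ;;
    esac
    echo cpuminer
}

pick_miner "fpu sse2 avx avx2"   # prints cpuminer-avx2
```

On a real system the argument would come from `grep -m1 '^flags' /proc/cpuinfo`.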

configure

@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.6.
# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.8.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='cpuminer-opt'
PACKAGE_TARNAME='cpuminer-opt'
PACKAGE_VERSION='3.8.6'
PACKAGE_STRING='cpuminer-opt 3.8.6'
PACKAGE_VERSION='3.8.8.1'
PACKAGE_STRING='cpuminer-opt 3.8.8.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -738,6 +738,7 @@ infodir
docdir
oldincludedir
includedir
runstatedir
localstatedir
sharedstatedir
sysconfdir
@@ -819,6 +820,7 @@ datadir='${datarootdir}'
sysconfdir='${prefix}/etc'
sharedstatedir='${prefix}/com'
localstatedir='${prefix}/var'
runstatedir='${localstatedir}/run'
includedir='${prefix}/include'
oldincludedir='/usr/include'
docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -1071,6 +1073,15 @@ do
| -silent | --silent | --silen | --sile | --sil)
silent=yes ;;
-runstatedir | --runstatedir | --runstatedi | --runstated \
| --runstate | --runstat | --runsta | --runst | --runs \
| --run | --ru | --r)
ac_prev=runstatedir ;;
-runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
| --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
| --run=* | --ru=* | --r=*)
runstatedir=$ac_optarg ;;
-sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
ac_prev=sbindir ;;
-sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1208,7 +1219,7 @@ fi
for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
libdir localedir mandir
libdir localedir mandir runstatedir
do
eval ac_val=\$$ac_var
# Remove trailing slashes.
@@ -1321,7 +1332,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures cpuminer-opt 3.8.6 to adapt to many kinds of systems.
\`configure' configures cpuminer-opt 3.8.8.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1361,6 +1372,7 @@ Fine tuning of the installation directories:
--sysconfdir=DIR read-only single-machine data [PREFIX/etc]
--sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
--localstatedir=DIR modifiable single-machine data [PREFIX/var]
--runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run]
--libdir=DIR object code libraries [EPREFIX/lib]
--includedir=DIR C header files [PREFIX/include]
--oldincludedir=DIR C header files for non-gcc [/usr/include]
@@ -1392,7 +1404,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of cpuminer-opt 3.8.6:";;
short | recursive ) echo "Configuration of cpuminer-opt 3.8.8.1:";;
esac
cat <<\_ACEOF
@@ -1497,7 +1509,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
cpuminer-opt configure 3.8.6
cpuminer-opt configure 3.8.8.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2000,7 +2012,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by cpuminer-opt $as_me 3.8.6, which was
It was created by cpuminer-opt $as_me 3.8.8.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2495,7 +2507,7 @@ ac_config_headers="$ac_config_headers cpuminer-config.h"
am__api_version='1.14'
am__api_version='1.15'
# Find a good install program. We prefer a C program (faster),
# so one script is as good as another. But avoid the broken or
@@ -2667,8 +2679,8 @@ test "$program_suffix" != NONE &&
ac_script='s/[\\$]/&&/g;s/;s,x,x,$//'
program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"`
# expand $ac_aux_dir to an absolute path
am_aux_dir=`cd $ac_aux_dir && pwd`
# Expand $ac_aux_dir to an absolute path.
am_aux_dir=`cd "$ac_aux_dir" && pwd`
if test x"${MISSING+set}" != xset; then
case $am_aux_dir in
@@ -2687,7 +2699,7 @@ else
$as_echo "$as_me: WARNING: 'missing' script is too old or missing" >&2;}
fi
if test x"${install_sh}" != xset; then
if test x"${install_sh+set}" != xset; then
case $am_aux_dir in
*\ * | *\ *)
install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -2981,7 +2993,7 @@ fi
# Define the identity of the package.
PACKAGE='cpuminer-opt'
VERSION='3.8.6'
VERSION='3.8.8.1'
cat >>confdefs.h <<_ACEOF
@@ -3015,8 +3027,8 @@ MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"}
# <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
mkdir_p='$(MKDIR_P)'
# We need awk for the "check" target. The system "awk" is bad on
# some platforms.
# We need awk for the "check" target (and possibly the TAP driver). The
# system "awk" is bad on some platforms.
# Always define AMTAR for backward compatibility. Yes, it's still used
# in the wild :-( We should find a proper way to deprecate it ...
AMTAR='$${TAR-tar}'
@@ -3075,6 +3087,7 @@ END
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5
$as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; }
# Check whether --enable-maintainer-mode was given.
@@ -6677,7 +6690,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by cpuminer-opt $as_me 3.8.6, which was
This file was extended by cpuminer-opt $as_me 3.8.8.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -6743,7 +6756,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
cpuminer-opt config.status 3.8.6
cpuminer-opt config.status 3.8.8.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"


@@ -1,4 +1,4 @@
AC_INIT([cpuminer-opt], [3.8.6])
AC_INIT([cpuminer-opt], [3.8.8.1])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM


@@ -98,7 +98,7 @@ static int opt_fail_pause = 10;
static int opt_time_limit = 0;
int opt_timeout = 300;
static int opt_scantime = 5;
static const bool opt_time = true;
//static const bool opt_time = true;
enum algos opt_algo = ALGO_NULL;
int opt_scrypt_n = 0;
int opt_pluck_n = 128;
@@ -1988,7 +1988,9 @@ static void *miner_thread( void *userdata )
}
}
// Display benchmark total
if ( opt_benchmark && thr_id == opt_n_threads - 1 )
// Update hashrate for API if no shares accepted yet.
if ( ( opt_benchmark || !accepted_count )
&& thr_id == opt_n_threads - 1 )
{
double hashrate = 0.;
double hashcount = 0.;
@@ -1999,27 +2001,30 @@ static void *miner_thread( void *userdata )
}
if ( hashcount )
{
char hc[16];
char hc_units[2] = {0,0};
char hr[16];
char hr_units[2] = {0,0};
global_hashcount = hashcount;
global_hashrate = hashrate;
scale_hash_for_display( &hashcount, hc_units );
scale_hash_for_display( &hashrate, hr_units );
if ( hc_units[0] )
sprintf( hc, "%.2f", hashcount );
else // no fractions of a hash
sprintf( hc, "%.0f", hashcount );
sprintf( hr, "%.2f", hashrate );
if ( opt_benchmark )
{
char hc[16];
char hc_units[2] = {0,0};
char hr[16];
char hr_units[2] = {0,0};
scale_hash_for_display( &hashcount, hc_units );
scale_hash_for_display( &hashrate, hr_units );
if ( hc_units[0] )
sprintf( hc, "%.2f", hashcount );
else // no fractions of a hash
sprintf( hc, "%.0f", hashcount );
sprintf( hr, "%.2f", hashrate );
#if ((defined(_WIN64) || defined(__WINDOWS__)))
applog( LOG_NOTICE, "Total: %s %sH, %s %sH/s",
applog( LOG_NOTICE, "Total: %s %sH, %s %sH/s",
hc, hc_units, hr, hr_units );
#else
applog( LOG_NOTICE, "Total: %s %sH, %s %sH/s, %dC",
applog( LOG_NOTICE, "Total: %s %sH, %s %sH/s, %dC",
hc, hc_units, hr, hr_units, (uint32_t)cpu_temp(0) );
#endif
}
}
}
}
} // miner_thread loop
@@ -2999,45 +3004,37 @@ static void show_credits()
bool check_cpu_capability ()
{
char cpu_brand[0x40];
// there is no CPU related feature specific to 4way, just AVX2 and AES
bool cpu_has_sse2 = has_sse2();
bool cpu_has_aes = has_aes_ni();
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx1();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_sha = has_sha();
// no need to check if sw has sse2,
// the code won't compile without it.
// bool sw_has_sse2 = false;
bool sw_has_aes = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_sha = false;
// bool sw_has_4way = false;
bool cpu_has_sse2 = has_sse2();
bool cpu_has_aes = has_aes_ni();
bool cpu_has_sse42 = has_sse42();
bool cpu_has_avx = has_avx1();
bool cpu_has_avx2 = has_avx2();
bool cpu_has_sha = has_sha();
bool cpu_has_avx512 = has_avx512f();
bool sw_has_aes = false;
bool sw_has_sse42 = false;
bool sw_has_avx = false;
bool sw_has_avx2 = false;
bool sw_has_avx512 = false;
bool sw_has_sha = false;
set_t algo_features = algo_gate.optimizations;
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx = set_incl( AVX_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
// bool algo_has_4way = set_incl( FOUR_WAY_OPT, algo_features );
bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features );
bool algo_has_aes = set_incl( AES_OPT, algo_features );
bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features );
bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features );
bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features );
bool algo_has_sha = set_incl( SHA_OPT, algo_features );
bool use_aes;
bool use_sse2;
bool use_sse42;
bool use_avx;
bool use_avx2;
bool use_avx512;
bool use_sha;
// bool use_4way;
bool use_none;
#ifdef __AES__
sw_has_aes = true;
#endif
// #ifdef __SSE2__
// sw_has_sse2 = true;
// #endif
#ifdef __SSE4_2__
sw_has_sse42 = true;
#endif
@@ -3047,12 +3044,12 @@ bool check_cpu_capability ()
#ifdef __AVX2__
sw_has_avx2 = true;
#endif
#ifdef __AVX512F__
sw_has_avx512 = true;
#endif
#ifdef __SHA__
sw_has_sha = true;
#endif
// #ifdef HASH_4WAY
// sw_has_4way = true;
// #endif
#if !((__AES__) || (__SSE2__))
printf("Neither __AES__ nor __SSE2__ defined.\n");
@@ -3072,33 +3069,33 @@ bool check_cpu_capability ()
#endif
printf("CPU features:");
if ( cpu_has_sse2 ) printf( " SSE2" );
if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sse42 ) printf( " SSE4.2" );
if ( cpu_has_avx ) printf( " AVX" );
if ( cpu_has_avx2 ) printf( " AVX2" );
if ( cpu_has_sha ) printf( " SHA" );
if ( cpu_has_sse2 ) printf( " SSE2" );
if ( cpu_has_aes ) printf( " AES" );
if ( cpu_has_sse42 ) printf( " SSE4.2" );
if ( cpu_has_avx ) printf( " AVX" );
if ( cpu_has_avx2 ) printf( " AVX2" );
if ( cpu_has_avx512 ) printf( " AVX512" );
if ( cpu_has_sha ) printf( " SHA" );
printf(".\nSW features: SSE2");
if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sse42 ) printf( " SSE4.2" );
if ( sw_has_avx ) printf( " AVX" );
if ( sw_has_avx2 ) printf( " AVX2" );
// if ( sw_has_4way ) printf( " 4WAY" );
if ( sw_has_sha ) printf( " SHA" );
if ( sw_has_aes ) printf( " AES" );
if ( sw_has_sse42 ) printf( " SSE4.2" );
if ( sw_has_avx ) printf( " AVX" );
if ( sw_has_avx2 ) printf( " AVX2" );
if ( sw_has_avx512 ) printf( " AVX512" );
if ( sw_has_sha ) printf( " SHA" );
printf(".\nAlgo features:");
if ( algo_features == EMPTY_SET ) printf( " None" );
else
{
if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sse42 ) printf( " SSE4.2" );
if ( algo_has_avx ) printf( " AVX" );
if ( algo_has_avx2 ) printf( " AVX2" );
// if ( algo_has_4way ) printf( " 4WAY" );
if ( algo_has_sha ) printf( " SHA" );
if ( algo_has_sse2 ) printf( " SSE2" );
if ( algo_has_aes ) printf( " AES" );
if ( algo_has_sse42 ) printf( " SSE4.2" );
if ( algo_has_avx2 ) printf( " AVX2" );
if ( algo_has_avx512 ) printf( " AVX512" );
if ( algo_has_sha ) printf( " SHA" );
}
printf(".\n");
@@ -3118,11 +3115,6 @@ bool check_cpu_capability ()
printf( "The SW build requires a CPU with SSE4.2!\n" );
return false;
}
if ( sw_has_avx && !cpu_has_avx )
{
printf( "The SW build requires a CPU with AVX!\n" );
return false;
}
if ( sw_has_aes && !cpu_has_aes )
{
printf( "The SW build requires a CPU with AES!\n" );
@@ -3135,13 +3127,13 @@ bool check_cpu_capability ()
}
// Determine mining options
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_sse2 = cpu_has_sse2 && algo_has_sse2;
use_aes = cpu_has_aes && sw_has_aes && algo_has_aes;
use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42;
use_avx = cpu_has_avx && sw_has_avx && algo_has_avx;
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx || use_avx2 ||
use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2;
use_avx512 = cpu_has_avx512 && sw_has_avx512 && algo_has_avx512;
use_sha = cpu_has_sha && sw_has_sha && algo_has_sha;
use_none = !( use_sse2 || use_aes || use_sse42 || use_avx512 || use_avx2 ||
use_sha );
// Display best options
@@ -3149,12 +3141,12 @@ bool check_cpu_capability ()
if ( use_none ) printf( " no optimizations" );
else
{
if ( use_aes ) printf( " AES" );
if ( use_avx2 ) printf( " AVX2" );
else if ( use_avx ) printf( " AVX" );
if ( use_aes ) printf( " AES" );
if ( use_avx512 ) printf( " AVX512" );
else if ( use_avx2 ) printf( " AVX2" );
else if ( use_sse42 ) printf( " SSE4.2" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_sha ) printf( " SHA" );
else if ( use_sse2 ) printf( " SSE2" );
if ( use_sha ) printf( " SHA" );
}
printf( ".\n\n" );

interleave.h (new file)

File diff suppressed because it is too large

miner.h

@@ -333,6 +333,7 @@ bool has_sha();
bool has_aes_ni();
bool has_avx1();
bool has_avx2();
bool has_avx512f();
bool has_sse2();
bool has_xop();
bool has_fma3();
@@ -485,8 +486,9 @@ enum algos {
ALGO_ALLIUM,
ALGO_ANIME,
ALGO_ARGON2,
ALGO_ARGON2DCRDS,
ALGO_ARGON2DDYN,
ALGO_ARGON2D250,
ALGO_ARGON2D500,
ALGO_ARGON2D4096,
ALGO_AXIOM,
ALGO_BASTION,
ALGO_BLAKE,
@@ -496,7 +498,8 @@ enum algos {
ALGO_BMW,
ALGO_C11,
ALGO_CRYPTOLIGHT,
ALGO_CRYPTONIGHT,
ALGO_CRYPTONIGHT,
ALGO_CRYPTONIGHTV7,
ALGO_DECRED,
ALGO_DEEP,
ALGO_DMD_GR,
@@ -565,8 +568,9 @@ static const char* const algo_names[] = {
"allium",
"anime",
"argon2",
"argon2d-crds",
"argon2d-dyn",
"argon2d250",
"argon2d500",
"argon2d4096",
"axiom",
"bastion",
"blake",
@@ -577,6 +581,7 @@ static const char* const algo_names[] = {
"c11",
"cryptolight",
"cryptonight",
"cryptonightv7",
"decred",
"deep",
"dmd-gr",
@@ -701,82 +706,84 @@ static char const usage[] = "\
Usage: " PACKAGE_NAME " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2 Argon2 Coin (AR2)\n\
argon2d-crds Credits (CRDS)\n\
argon2d-dyn Dynamic (DYN)\n\
axiom Shabal-256 MemoHash\n\
allium Garlicoin (GRLC)\n\
anime Animecoin (ANI)\n\
argon2 Argon2 Coin (AR2)\n\
argon2d250 argon2d-crds, Credits (CRDS)\n\
argon2d500 argon2d-dyn, Dynamic (DYN)\n\
argon2d4096 argon2d-uis, Unitus (UIS)\n\
axiom Shabal-256 MemoHash\n\
bastion\n\
blake blake256r14 (SFR)\n\
blakecoin blake256r8\n\
blake2s Blake-2 S\n\
bmw BMW 256\n\
c11 Chaincoin\n\
cryptolight Cryptonight-light\n\
cryptonight cryptonote, Monero (XMR)\n\
decred Blake256r14dcr\n\
deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\
drop Dropcoin\n\
fresh Fresh\n\
groestl Groestl coin\n\
heavy Heavy\n\
hmq1725 Espers\n\
hodl Hodlcoin\n\
jha jackppot (Jackpotcoin)\n\
keccak Maxcoin\n\
keccakc Creative Coin\n\
lbry LBC, LBRY Credits\n\
luffa Luffa\n\
lyra2h Hppcoin\n\
lyra2re lyra2\n\
lyra2rev2 lyrav2, Vertcoin\n\
lyra2z Zcoin (XZC)\n\
lyra2z330 Lyra2 330 rows, Zoin (ZOI)\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\
phi1612 phi, LUX coin\n\
pluck Pluck:128 (Supcoin)\n\
blake blake256r14 (SFR)\n\
blakecoin blake256r8\n\
blake2s Blake-2 S\n\
bmw BMW 256\n\
c11 Chaincoin\n\
cryptolight Cryptonight-light\n\
cryptonight Cryptonote legacy\n\
cryptonightv7 variant 7, Monero (XMR)\n\
decred Blake256r14dcr\n\
deep Deepcoin (DCN)\n\
dmd-gr Diamond\n\
drop Dropcoin\n\
fresh Fresh\n\
groestl Groestl coin\n\
heavy Heavy\n\
hmq1725 Espers\n\
hodl Hodlcoin\n\
jha jackppot (Jackpotcoin)\n\
keccak Maxcoin\n\
keccakc Creative Coin\n\
lbry LBC, LBRY Credits\n\
luffa Luffa\n\
lyra2h Hppcoin\n\
lyra2re lyra2\n\
lyra2rev2 lyrav2, Vertcoin\n\
lyra2z Zcoin (XZC)\n\
lyra2z330 Lyra2 330 rows, Zoin (ZOI)\n\
m7m Magi (XMG)\n\
myr-gr Myriad-Groestl\n\
neoscrypt NeoScrypt(128, 2, 1)\n\
nist5 Nist5\n\
pentablake 5 x blake512\n\
phi1612 phi, LUX coin\n\
pluck Pluck:128 (Supcoin)\n\
polytimos\n\
quark Quark\n\
qubit Qubit\n\
scrypt scrypt(1024, 1, 1) (default)\n\
scrypt:N scrypt(N, 1, 1)\n\
quark Quark\n\
qubit Qubit\n\
scrypt scrypt(1024, 1, 1) (default)\n\
scrypt:N scrypt(N, 1, 1)\n\
scryptjane:nf\n\
sha256d Double SHA-256\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\
timetravel timeravel8, Machinecoin (MAC)\n\
timetravel10 Bitcore (BTX)\n\
tribus Denarius (DNR)\n\
vanilla blake256r8vnl (VCash)\n\
sha256d Double SHA-256\n\
sha256t Triple SHA-256, Onecoin (OC)\n\
shavite3 Shavite3\n\
skein Skein+Sha (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Signatum (SIGT)\n\
timetravel timeravel8, Machinecoin (MAC)\n\
timetravel10 Bitcore (BTX)\n\
tribus Denarius (DNR)\n\
vanilla blake256r8vnl (VCash)\n\
veltor\n\
whirlpool\n\
whirlpoolx\n\
x11 Dash\n\
x11evo Revolvercoin (XRE)\n\
x11gost sib (SibCoin)\n\
x12 Galaxie Cash (GCH)\n\
x13 X13\n\
x13sm3 hsr (Hshare)\n\
x14 X14\n\
x15 X15\n\
x16r Ravencoin (RVN)\n\
x16s Pigeoncoin (PGN)\n\
x11 Dash\n\
x11evo Revolvercoin (XRE)\n\
x11gost sib (SibCoin)\n\
x12 Galaxie Cash (GCH)\n\
x13 X13\n\
x13sm3 hsr (Hshare)\n\
x14 X14\n\
x15 X15\n\
x16r Ravencoin (RVN)\n\
x16s Pigeoncoin (PGN)\n\
x17\n\
xevan Bitsend (BSD)\n\
yescrypt Globlboost-Y (BSTY)\n\
yescryptr8 BitZeny (ZNY)\n\
yescryptr16 Yenten (YTN)\n\
yescryptr32 WAVI\n\
zr5 Ziftr\n\
xevan Bitsend (BSD)\n\
yescrypt Globlboost-Y (BSTY)\n\
yescryptr8 BitZeny (ZNY)\n\
yescryptr16 Yenten (YTN)\n\
yescryptr32 WAVI\n\
zr5 Ziftr\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
-u, --user=USERNAME username for mining server\n\


@@ -274,6 +274,7 @@ void cpu_getmodelid(char *outbuf, size_t maxsz)
#define SSE2_Flag (1<<26)
#define AVX2_Flag (1<< 5) // ADV EBX
#define AVX512F_Flag (1<<16)
#define SHA_Flag (1<<29)
// Use this to detect presence of feature
@@ -350,6 +351,21 @@ static inline bool has_avx2_()
bool has_avx2() { return has_avx2_(); }
static inline bool has_avx512f_()
{
#ifdef __arm__
return false;
#else
int cpu_info[4] = { 0 };
cpuid( EXTENDED_FEATURES, cpu_info );
return cpu_info[ EBX_Reg ] & AVX512F_Flag;
#endif
}
bool has_avx512f() { return has_avx512f_(); }
// AMD only
static inline bool has_xop_()
{
#ifdef __arm__


@@ -51,11 +51,13 @@ rm -f config.status
CFLAGS="-O3 -march=corei7-avx -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-avx.exe
mv cpuminer.exe release/cpuminer-avx.exe
# -march=westmere is supported in gcc5
make clean || echo clean
rm -f config.status
CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
CFLAGS="-O3 -march=westmere -Wall" ./configure $F
#CFLAGS="-O3 -maes -msse4.2 -Wall" ./configure $F
make
strip -s cpuminer.exe
mv cpuminer.exe release/cpuminer-aes-sse42.exe