v3.9.0

v3.8.8.1
v3.8.7.2
2025-09-17 23:44:27 +00:00 · 2019-05-19 13:39:45 -04:00 · 2018-05-11 11:52:36 -04:00 · 2018-04-11 13:44:26 -04:00 · 2018-04-10 21:49:06 -04:00 · 2018-04-09 19:14:38 -04:00
156 changed files with 12038 additions and 4462 deletions
--- a/4
+++ b/4
@@ -29,3 +29,7 @@ Wolf0
 Optiminer

 Jay D Dee
+
+xcouiz@gmail.com
+
+Cryply
--- a/Makefile.am
+++ b/Makefile.am
@@ -74,7 +74,7 @@ cpuminer_SOURCES = \
  algo/cryptonight/cryptonight-aesni.c\
  algo/cryptonight/cryptonight.c\
  algo/cubehash/sph_cubehash.c \
-  algo/cubehash/sse2/cubehash_sse2.c\
+  algo/cubehash/cubehash_sse2.c\
  algo/cubehash/cube-hash-2way.c \
  algo/echo/sph_echo.c \
  algo/echo/aes_ni/hash.c\
@@ -116,20 +116,20 @@ cpuminer_SOURCES = \
  algo/luffa/luffa-hash-2way.c \
  algo/lyra2/lyra2.c \
  algo/lyra2/sponge.c \
-  algo/lyra2/lyra2rev2-gate.c \
+  algo/lyra2/lyra2-gate.c \
  algo/lyra2/lyra2rev2.c \
  algo/lyra2/lyra2rev2-4way.c \
+  algo/lyra2/lyra2rev3.c \
+  algo/lyra2/lyra2rev3-4way.c \
  algo/lyra2/lyra2re.c \
-  algo/lyra2/lyra2z-gate.c \
  algo/lyra2/lyra2z.c \
  algo/lyra2/lyra2z-4way.c \
  algo/lyra2/lyra2z330.c \
-  algo/lyra2/lyra2h-gate.c \
  algo/lyra2/lyra2h.c \
  algo/lyra2/lyra2h-4way.c \
-  algo/lyra2/allium-gate.c \
  algo/lyra2/allium-4way.c \
  algo/lyra2/allium.c \
+  algo/lyra2/phi2.c \
  algo/m7m.c \
  algo/neoscrypt/neoscrypt.c \
  algo/nist5/nist5-gate.c \
@@ -252,7 +252,10 @@ cpuminer_SOURCES = \
  algo/x17/hmq1725.c \
  algo/yescrypt/yescrypt.c \
  algo/yescrypt/sha256_Y.c \
-  algo/yescrypt/yescrypt-best.c
+  algo/yescrypt/yescrypt-best.c \
+  algo/yespower/yespower.c \
+  algo/yespower/sha256.c \
+  algo/yespower/yespower-opt.c

 disable_flags =

--- a/README.md
+++ b/README.md
@@ -45,82 +45,87 @@ MacOS, OSx and Android are not supported.
 Supported Algorithms
 --------------------

-                          allium       Garlicoin
-                          anime        Animecoin
-                          argon2       Argon2 coin (AR2)
-                          argon2d-crds Credits (CRDS)
-                          argon2d-dyn  Dynamic (DYN)
-                          axiom        Shabal-256 MemoHash
+                          allium        Garlicoin
+                          anime         Animecoin
+                          argon2        Argon2 coin (AR2)
+                          argon2d250    argon2d-crds, Credits (CRDS)
+                          argon2d500    argon2d-dyn,  Dynamic (DYN)
+                          argon2d4096   argon2d-uis, Unitus (UIS)
+                          axiom         Shabal-256 MemoHash
                          bastion
-                          blake        Blake-256 (SFR)
-                          blakecoin    blake256r8
-                          blake2s      Blake-2 S
-                          bmw          BMW 256
-                          c11          Chaincoin
-                          cryptolight  Cryptonight-light
-                          cryptonight  cryptonote, Monero (XMR)
+                          blake         Blake-256 (SFR)
+                          blakecoin     blake256r8
+                          blake2s       Blake-2 S
+                          bmw           BMW 256
+                          c11           Chaincoin
+                          cryptolight   Cryptonight-light
+                          cryptonight  
+                          cryptonightv7 Monero (XMR)
                          decred
-                          deep         Deepcoin (DCN)
-                          dmd-gr       Diamond-Groestl
-                          drop         Dropcoin
-                          fresh        Fresh
-                          groestl      Groestl coin
-                          heavy        Heavy
-                          hmq1725      Espers
-                          hodl         Hodlcoin
-                          jha          Jackpotcoin
-                          keccak       Maxcoin
-                          keccakc      Creative coin
-                          lbry         LBC, LBRY Credits
-                          luffa        Luffa
-                          lyra2h       Hppcoin
-                          lyra2re      lyra2
-                          lyra2rev2    lyra2v2, Vertcoin
-                          lyra2z       Zcoin (XZC)
-                          lyra2z330    Lyra2 330 rows, Zoin (ZOI)
-                          m7m          Magi (XMG)
-                          myr-gr       Myriad-Groestl
-                          neoscrypt    NeoScrypt(128, 2, 1)
-                          nist5        Nist5
-                          pentablake   Pentablake
-                          phi1612      phi, LUX coin
-                          pluck        Pluck:128 (Supcoin)
-                          polytimos    Ninja
-                          quark        Quark
-                          qubit        Qubit
-                          scrypt       scrypt(1024, 1, 1) (default)
-                          scrypt:N     scrypt(N, 1, 1)
+                          deep          Deepcoin (DCN)
+                          dmd-gr        Diamond-Groestl
+                          drop          Dropcoin
+                          fresh         Fresh
+                          groestl       Groestl coin
+                          heavy         Heavy
+                          hmq1725       Espers
+                          hodl          Hodlcoin
+                          jha           Jackpotcoin
+                          keccak        Maxcoin
+                          keccakc       Creative coin
+                          lbry          LBC, LBRY Credits
+                          luffa         Luffa
+                          lyra2h        Hppcoin
+                          lyra2re       lyra2
+                          lyra2rev2     lyra2v2, Vertcoin
+                          lyra2z        Zcoin (XZC)
+                          lyra2z330     Lyra2 330 rows, Zoin (ZOI)
+                          m7m           Magi (XMG)
+                          myr-gr        Myriad-Groestl
+                          neoscrypt     NeoScrypt(128, 2, 1)
+                          nist5         Nist5
+                          pentablake    Pentablake
+                          phi1612       phi, LUX coin (original algo)
+                          phi2          LUX coin (new algo)
+                          pluck         Pluck:128 (Supcoin)
+                          polytimos     Ninja
+                          quark         Quark
+                          qubit         Qubit
+                          scrypt        scrypt(1024, 1, 1) (default)
+                          scrypt:N      scrypt(N, 1, 1)
                          scryptjane:nf
-                          sha256d      Double SHA-256
-                          sha256t      Triple SHA-256, Onecoin (OC)
-                          shavite3     Shavite3
-                          skein        Skein+Sha (Skeincoin)
-                          skein2       Double Skein (Woodcoin)
-                          skunk        Signatum (SIGT)
-                          timetravel   Machinecoin (MAC)
-                          timetravel10 Bitcore
-                          tribus       Denarius (DNR)
-                          vanilla      blake256r8vnl (VCash)
-                          veltor       (VLT)
+                          sha256d       Double SHA-256
+                          sha256t       Triple SHA-256, Onecoin (OC)
+                          shavite3      Shavite3
+                          skein         Skein+Sha (Skeincoin)
+                          skein2        Double Skein (Woodcoin)
+                          skunk         Signatum (SIGT)
+                          timetravel    Machinecoin (MAC)
+                          timetravel10  Bitcore
+                          tribus        Denarius (DNR)
+                          vanilla       blake256r8vnl (VCash)
+                          veltor        (VLT)
                          whirlpool
                          whirlpoolx
-                          x11          Dash
-                          x11evo       Revolvercoin
-                          x11gost      sib (SibCoin)
-                          x12          Galaxie Cash (GCH)
-                          x13          X13
-                          x13sm3       hsr (Hshare)
-                          x14          X14
-                          x15          X15
-                          x16r         Ravencoin (RVN)
-                          x16s         pigeoncoin (PGN)
+                          x11           Dash
+                          x11evo        Revolvercoin
+                          x11gost       sib (SibCoin)
+                          x12           Galaxie Cash (GCH)
+                          x13           X13
+                          x13sm3        hsr (Hshare)
+                          x14           X14
+                          x15           X15
+                          x16r          Ravencoin (RVN)
+                          x16s          pigeoncoin (PGN)
                          x17
-                          xevan        Bitsend (BSD)
-                          yescrypt     Globalboost-Y (BSTY)
-                          yescryptr8   BitZeny (ZNY)
-                          yescryptr16  Yenten (YTN)
-                          yescryptr32  WAVI
-                          zr5          Ziftr
+                          xevan         Bitsend (BSD)
+                          yescrypt      Globalboost-Y (BSTY)
+                          yescryptr8    BitZeny (ZNY)
+                          yescryptr16   Eli
+                          yescryptr32   WAVI
+                          yespower      Cryply
+                          yespowerr16   Yenten (YTN)
+                          zr5           Ziftr

 Errata
 ------
--- a/README.txt
+++ b/README.txt
@@ -21,14 +21,16 @@ AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not
 supported by cpuminer-opt due to an incompatible implementation of SSE2 on
 these CPUs. Some algos may crash the miner with an invalid instruction.
 Users are recommended to use an unoptimized miner such as cpuminer-multi.
+Changes in v3.8.4 may have improved compatibility with some of these CPUs.

-Exe name                Compile flags              Arch name

-cpuminer-sse2.exe      "-msse2"                    Core2, Nehalem   
-cpuminer-aes-sse42.exe "-maes -msse4.2"            Westmere
-cpuminer-aes-avx.exe   "-march=corei7-avx"         Sandybridge, Ivybridge
-cpuminer-avx2.exe      "-march=core-avx2"          Haswell...
-cpuminer-avx2-sha.exe  "-march=core-avx2 -msha"    Ryzen
+Exe name                Compile flags            Arch name
+
+cpuminer-sse2.exe      "-msse2"                  Core2, Nehalem   
+cpuminer-aes-sse42.exe "-march=westmere"         Westmere, Sandy-Ivybridge
+cpuminer-avx.exe       "-march=corei7-avx"       Sandy-Ivybridge
+cpuminer-avx2.exe      "-march=core-avx2"        Haswell, Sky-Kaby-Coffeelake
+cpuminer-avx2-sha.exe  "-march=core-avx2 -msha"  Ryzen

 If you like this software feel free to donate:

--- a/45
+++ b/45
@@ -50,8 +50,7 @@ will give a clue as to the missing package.
 The following command should install everything you need on Debian based
 distributions such as Ubuntu:

-sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake
-
+sudo apt-get install build-essential libssl-dev libcurl4-openssl-dev libjansson-dev libgmp-dev automake zlib1g-dev

 build-essential  (for Ubuntu, Development Tools package group on Fedora)
 automake
@@ -81,12 +80,12 @@ cd cpuminer-opt-x.y.z
 Run ./build.sh to build on Linux or execute the following commands.

 ./autogen.sh
-CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl
+CFLAGS="-O3 -march=native -Wall" ./configure --with-curl
 make

 Additional optional compile flags, add the following to CFLAGS to activate:

-DUSE_SPH_SHA
+-DUSE_SPH_SHA (deprecated)

 SPH may give slightly better performance on algos that use sha256 when using
 openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform
@@ -160,6 +159,44 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble.
 Change Log
 ----------

+v3.9.0
+
+Added support for Windows CPU groups.
+Fixed BIP34 coinbase height.
+Prep work for AVX512.
+Added lyra2rev3 for the vertcoin algo change.
+Added yespower, yespowerr16 (Yenten).
+Added phi2 algo for LUX.
+
+v3.8.8.1
+
+Fixed x16r.
+Removed cryptonight variant check due to false positives.
+API displays hashrate before shares are submitted.
+
+v3.8.8
+
+Added cryptonightv7 for Monero.
+
+v3.8.7.2
+
+Fixed argon2d-dyn regression in v3.8.7.1.
+Changed compile options for aes-sse42 Windows build to -march=westmere.
+
+v3.8.7.1
+
+Fixed argon2d-uis low difficulty rejects.
+Fixed argon2d aliases.
+
+v3.8.7
+
+Added argon2d4096 (alias argon2d-uis) for Unitus (UIS).
+argon2d-crds and argon2d-dyn renamed to argon2d250 and argon2d500 respectively.
+  The old names are recognized as aliases.
+AVX512 is now supported for argon2d algos, Linux only.
+AVX is no longer a reported feature and an AVX Windows binary is no longer
+  provided. Use AES-SSE42 build instead.
+
 v3.8.6.1

 Faster argon2d* AVX2.
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.14.1 -*- Autoconf -*-
+# generated automatically by aclocal 1.15.1 -*- Autoconf -*-

-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2017 Free Software Foundation, Inc.

 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -20,7 +20,7 @@ You have another version of autoconf.  It may work, but is not guaranteed to.
 If you have problems, you may need to regenerate the build system entirely.
 To do so, use the procedure documented by the package, typically 'autoreconf'.])])

-# Copyright (C) 2002-2013 Free Software Foundation, Inc.
+# Copyright (C) 2002-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -32,10 +32,10 @@ To do so, use the procedure documented by the package, typically 'autoreconf'.])
 # generated from the m4 files accompanying Automake X.Y.
 # (This private macro should not be called outside this file.)
 AC_DEFUN([AM_AUTOMAKE_VERSION],
-[am__api_version='1.14'
+[am__api_version='1.15'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.14.1], [],
+m4_if([$1], [1.15.1], [],
      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])

@@ -51,14 +51,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.14.1])dnl
+[AM_AUTOMAKE_VERSION([1.15.1])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])

 # Figure out how to run the assembler.                      -*- Autoconf -*-

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -78,7 +78,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl

 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -123,15 +123,14 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl
 # configured tree to be moved without reconfiguration.

 AC_DEFUN([AM_AUX_DIR_EXPAND],
-[dnl Rely on autoconf to set up CDPATH properly.
-AC_PREREQ([2.50])dnl
-# expand $ac_aux_dir to an absolute path
-am_aux_dir=`cd $ac_aux_dir && pwd`
+[AC_REQUIRE([AC_CONFIG_AUX_DIR_DEFAULT])dnl
+# Expand $ac_aux_dir to an absolute path.
+am_aux_dir=`cd "$ac_aux_dir" && pwd`
 ])

 # AM_CONDITIONAL                                            -*- Autoconf -*-

-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -162,7 +161,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])

-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -353,7 +352,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl

 # Generate code to set up dependency tracking.              -*- Autoconf -*-

-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -429,7 +428,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],

 # Do all the work for Automake.                             -*- Autoconf -*-

-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -519,8 +518,8 @@ AC_REQUIRE([AC_PROG_MKDIR_P])dnl
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00001.html>
 # <http://lists.gnu.org/archive/html/automake/2012-07/msg00014.html>
 AC_SUBST([mkdir_p], ['$(MKDIR_P)'])
-# We need awk for the "check" target.  The system "awk" is bad on
-# some platforms.
+# We need awk for the "check" target (and possibly the TAP driver).  The
+# system "awk" is bad on some platforms.
 AC_REQUIRE([AC_PROG_AWK])dnl
 AC_REQUIRE([AC_PROG_MAKE_SET])dnl
 AC_REQUIRE([AM_SET_LEADING_DOT])dnl
@@ -593,7 +592,11 @@ to "yes", and re-run configure.
 END
    AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
  fi
-fi])
+fi
+dnl The trailing newline in this macro's definition is deliberate, for
+dnl backward compatibility and to allow trailing 'dnl'-style comments
+dnl after the AM_INIT_AUTOMAKE invocation. See automake bug#16841.
+])

 dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
 dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@@ -622,7 +625,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -633,7 +636,7 @@ echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_co
 # Define $install_sh.
 AC_DEFUN([AM_PROG_INSTALL_SH],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
-if test x"${install_sh}" != xset; then
+if test x"${install_sh+set}" != xset; then
  case $am_aux_dir in
  *\ * | *\	*)
    install_sh="\${SHELL} '$am_aux_dir/install-sh'" ;;
@@ -643,7 +646,7 @@ if test x"${install_sh}" != xset; then
 fi
 AC_SUBST([install_sh])])

-# Copyright (C) 2003-2013 Free Software Foundation, Inc.
+# Copyright (C) 2003-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -665,7 +668,7 @@ AC_SUBST([am__leading_dot])])
 # Add --enable-maintainer-mode option to configure.         -*- Autoconf -*-
 # From Jim Meyering

-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -700,7 +703,7 @@ AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])

 # Check to see how 'make' treats includes.	            -*- Autoconf -*-

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -750,7 +753,7 @@ rm -f confinc confmf

 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-

-# Copyright (C) 1997-2013 Free Software Foundation, Inc.
+# Copyright (C) 1997-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -789,7 +792,7 @@ fi

 # Helper functions for option handling.                     -*- Autoconf -*-

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -818,7 +821,7 @@ AC_DEFUN([_AM_SET_OPTIONS],
 AC_DEFUN([_AM_IF_OPTION],
 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])

-# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+# Copyright (C) 1999-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -865,7 +868,7 @@ AC_LANG_POP([C])])
 # For backward compatibility.
 AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -884,7 +887,7 @@ AC_DEFUN([AM_RUN_LOG],

 # Check to make sure that the build environment is sane.    -*- Autoconf -*-

-# Copyright (C) 1996-2013 Free Software Foundation, Inc.
+# Copyright (C) 1996-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -965,7 +968,7 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])

-# Copyright (C) 2009-2013 Free Software Foundation, Inc.
+# Copyright (C) 2009-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1025,7 +1028,7 @@ AC_SUBST([AM_BACKSLASH])dnl
 _AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
 ])

-# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+# Copyright (C) 2001-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1053,7 +1056,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])

-# Copyright (C) 2006-2013 Free Software Foundation, Inc.
+# Copyright (C) 2006-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@@ -1072,7 +1075,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])

 # Check how to create a tarball.                            -*- Autoconf -*-

-# Copyright (C) 2004-2013 Free Software Foundation, Inc.
+# Copyright (C) 2004-2017 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
--- a/algo-gate-api.c
+++ b/algo-gate-api.c
@@ -69,6 +69,8 @@ void do_nothing   () {}
 bool return_true  () { return true;  }
 bool return_false () { return false; }
 void *return_null () { return NULL;  }
+void call_error   () { printf("ERR: Uninitialized function pointer\n"); }
+

 void algo_not_tested()
 {
@@ -113,7 +115,8 @@ void init_algo_gate( algo_gate_t* gate )
   gate->hash_suw                = (void*)&null_hash_suw;
   gate->get_new_work            = (void*)&std_get_new_work;
   gate->get_nonceptr            = (void*)&std_get_nonceptr;
-   gate->display_extra_data      = (void*)&do_nothing;
+   gate->work_decode             = (void*)&std_le_work_decode;
+   gate->decode_extra_data       = (void*)&do_nothing;
   gate->wait_for_diff           = (void*)&std_wait_for_diff;
   gate->get_max64               = (void*)&get_max64_0x1fffffLL;
   gate->gen_merkle_root         = (void*)&sha256d_gen_merkle_root;
@@ -121,7 +124,6 @@ void init_algo_gate( algo_gate_t* gate )
   gate->build_stratum_request   = (void*)&std_le_build_stratum_request;
   gate->malloc_txs_request      = (void*)&std_malloc_txs_request;
   gate->set_target              = (void*)&std_set_target;
-   gate->work_decode             = (void*)&std_le_work_decode;
   gate->submit_getwork_result   = (void*)&std_le_submit_getwork_result;
   gate->build_block_header      = (void*)&std_build_block_header;
   gate->build_extraheader       = (void*)&std_build_extraheader;
@@ -132,11 +134,11 @@ void init_algo_gate( algo_gate_t* gate )
   gate->do_this_thread          = (void*)&return_true;
   gate->longpoll_rpc_call       = (void*)&std_longpoll_rpc_call;
   gate->stratum_handle_response = (void*)&std_stratum_handle_response;
+   gate->get_work_data_size      = (void*)&std_get_work_data_size;
   gate->optimizations           = EMPTY_SET;
   gate->ntime_index             = STD_NTIME_INDEX;
   gate->nbits_index             = STD_NBITS_INDEX;
   gate->nonce_index             = STD_NONCE_INDEX;
-   gate->work_data_size          = STD_WORK_DATA_SIZE;
   gate->work_cmp_size           = STD_WORK_CMP_SIZE;
 }

@@ -157,81 +159,93 @@ bool register_algo_gate( int algo, algo_gate_t *gate )

   switch (algo)
   {
-     case ALGO_ALLIUM:       register_allium_algo      ( gate ); break;
-     case ALGO_ANIME:        register_anime_algo       ( gate ); break;
-     case ALGO_ARGON2:       register_argon2_algo      ( gate ); break;
-     case ALGO_ARGON2DCRDS:  register_argon2d_crds_algo( gate ); break;
-     case ALGO_ARGON2DDYN:   register_argon2d_dyn_algo ( gate ); break;
-     case ALGO_AXIOM:        register_axiom_algo       ( gate ); break;
-     case ALGO_BASTION:      register_bastion_algo     ( gate ); break;
-     case ALGO_BLAKE:        register_blake_algo       ( gate ); break;
-     case ALGO_BLAKECOIN:    register_blakecoin_algo   ( gate ); break;
+     case ALGO_ALLIUM:       register_allium_algo       ( gate ); break;
+     case ALGO_ANIME:        register_anime_algo        ( gate ); break;
+     case ALGO_ARGON2:       register_argon2_algo       ( gate ); break;
+     case ALGO_ARGON2D250:   register_argon2d_crds_algo ( gate ); break;
+     case ALGO_ARGON2D500:   register_argon2d_dyn_algo  ( gate ); break;
+     case ALGO_ARGON2D4096:  register_argon2d4096_algo  ( gate ); break;
+     case ALGO_AXIOM:        register_axiom_algo        ( gate ); break;
+     case ALGO_BASTION:      register_bastion_algo      ( gate ); break;
+     case ALGO_BLAKE:        register_blake_algo        ( gate ); break;
+     case ALGO_BLAKECOIN:    register_blakecoin_algo    ( gate ); break;
 //     case ALGO_BLAKE2B:      register_blake2b_algo    ( gate ); break;
-     case ALGO_BLAKE2S:      register_blake2s_algo     ( gate ); break;
-     case ALGO_C11:          register_c11_algo         ( gate ); break;
-     case ALGO_CRYPTOLIGHT:  register_cryptolight_algo ( gate ); break;
-     case ALGO_CRYPTONIGHT:  register_cryptonight_algo ( gate ); break;
-     case ALGO_DECRED:       register_decred_algo      ( gate ); break;
-     case ALGO_DEEP:         register_deep_algo        ( gate ); break;
-     case ALGO_DMD_GR:       register_dmd_gr_algo      ( gate ); break;
-     case ALGO_DROP:         register_drop_algo        ( gate ); break;
-     case ALGO_FRESH:        register_fresh_algo       ( gate ); break;
-     case ALGO_GROESTL:      register_groestl_algo     ( gate ); break;
-     case ALGO_HEAVY:        register_heavy_algo       ( gate ); break;
-     case ALGO_HMQ1725:      register_hmq1725_algo     ( gate ); break;
-     case ALGO_HODL:         register_hodl_algo        ( gate ); break;
-     case ALGO_JHA:          register_jha_algo         ( gate ); break;
-     case ALGO_KECCAK:       register_keccak_algo      ( gate ); break;
-     case ALGO_KECCAKC:      register_keccakc_algo     ( gate ); break;
-     case ALGO_LBRY:         register_lbry_algo        ( gate ); break;
-     case ALGO_LUFFA:        register_luffa_algo       ( gate ); break;
-     case ALGO_LYRA2H:       register_lyra2h_algo      ( gate ); break;
-     case ALGO_LYRA2RE:      register_lyra2re_algo     ( gate ); break;
-     case ALGO_LYRA2REV2:    register_lyra2rev2_algo   ( gate ); break;
-     case ALGO_LYRA2Z:       register_lyra2z_algo      ( gate ); break;
-     case ALGO_LYRA2Z330:    register_lyra2z330_algo   ( gate ); break;
-     case ALGO_M7M:          register_m7m_algo         ( gate ); break;
-     case ALGO_MYR_GR:       register_myriad_algo      ( gate ); break;
-     case ALGO_NEOSCRYPT:    register_neoscrypt_algo   ( gate ); break;
-     case ALGO_NIST5:        register_nist5_algo       ( gate ); break;
-     case ALGO_PENTABLAKE:   register_pentablake_algo  ( gate ); break;
-     case ALGO_PHI1612:      register_phi1612_algo     ( gate ); break;
-     case ALGO_PLUCK:        register_pluck_algo       ( gate ); break;
-     case ALGO_POLYTIMOS:    register_polytimos_algo   ( gate ); break;
-     case ALGO_QUARK:        register_quark_algo       ( gate ); break;
-     case ALGO_QUBIT:        register_qubit_algo       ( gate ); break;
-     case ALGO_SCRYPT:       register_scrypt_algo      ( gate ); break;
-     case ALGO_SCRYPTJANE:   register_scryptjane_algo  ( gate ); break;
-     case ALGO_SHA256D:      register_sha256d_algo     ( gate ); break;
-     case ALGO_SHA256T:      register_sha256t_algo     ( gate ); break;
-     case ALGO_SHAVITE3:     register_shavite_algo     ( gate ); break;
-     case ALGO_SKEIN:        register_skein_algo       ( gate ); break;
-     case ALGO_SKEIN2:       register_skein2_algo      ( gate ); break;
-     case ALGO_SKUNK:        register_skunk_algo       ( gate ); break;
-     case ALGO_TIMETRAVEL:   register_timetravel_algo  ( gate ); break;
-     case ALGO_TIMETRAVEL10: register_timetravel10_algo( gate ); break;
-     case ALGO_TRIBUS:       register_tribus_algo      ( gate ); break;
-     case ALGO_VANILLA:      register_vanilla_algo     ( gate ); break;
-     case ALGO_VELTOR:       register_veltor_algo      ( gate ); break;
-     case ALGO_WHIRLPOOL:    register_whirlpool_algo   ( gate ); break;
-     case ALGO_WHIRLPOOLX:   register_whirlpoolx_algo  ( gate ); break;
-     case ALGO_X11:          register_x11_algo         ( gate ); break;
-     case ALGO_X11EVO:       register_x11evo_algo      ( gate ); break;
-     case ALGO_X11GOST:      register_x11gost_algo     ( gate ); break;
-     case ALGO_X12:          register_x12_algo         ( gate ); break;
-     case ALGO_X13:          register_x13_algo         ( gate ); break;
-     case ALGO_X13SM3:       register_x13sm3_algo      ( gate ); break;
-     case ALGO_X14:          register_x14_algo         ( gate ); break;
-     case ALGO_X15:          register_x15_algo         ( gate ); break;
-     case ALGO_X16R:         register_x16r_algo        ( gate ); break;
-     case ALGO_X16S:         register_x16s_algo        ( gate ); break;
-     case ALGO_X17:          register_x17_algo         ( gate ); break;
-     case ALGO_XEVAN:        register_xevan_algo       ( gate ); break;
-     case ALGO_YESCRYPT:     register_yescrypt_algo    ( gate ); break;
-     case ALGO_YESCRYPTR8:   register_yescryptr8_algo  ( gate ); break;
-     case ALGO_YESCRYPTR16:  register_yescryptr16_algo ( gate ); break;
-     case ALGO_YESCRYPTR32:  register_yescryptr32_algo ( gate ); break;
-     case ALGO_ZR5:          register_zr5_algo         ( gate ); break;
+     case ALGO_BLAKE2S:      register_blake2s_algo      ( gate ); break;
+     case ALGO_C11:          register_c11_algo          ( gate ); break;
+     case ALGO_CRYPTOLIGHT:  register_cryptolight_algo  ( gate ); break;
+     case ALGO_CRYPTONIGHT:  register_cryptonight_algo  ( gate ); break;
+     case ALGO_CRYPTONIGHTV7:register_cryptonightv7_algo( gate ); break;
+     case ALGO_DECRED:       register_decred_algo       ( gate ); break;
+     case ALGO_DEEP:         register_deep_algo         ( gate ); break;
+     case ALGO_DMD_GR:       register_dmd_gr_algo       ( gate ); break;
+     case ALGO_DROP:         register_drop_algo         ( gate ); break;
+     case ALGO_FRESH:        register_fresh_algo        ( gate ); break;
+     case ALGO_GROESTL:      register_groestl_algo      ( gate ); break;
+     case ALGO_HEAVY:        register_heavy_algo        ( gate ); break;
+     case ALGO_HMQ1725:      register_hmq1725_algo      ( gate ); break;
+     case ALGO_HODL:         register_hodl_algo         ( gate ); break;
+     case ALGO_JHA:          register_jha_algo          ( gate ); break;
+     case ALGO_KECCAK:       register_keccak_algo       ( gate ); break;
+     case ALGO_KECCAKC:      register_keccakc_algo      ( gate ); break;
+     case ALGO_LBRY:         register_lbry_algo         ( gate ); break;
+     case ALGO_LUFFA:        register_luffa_algo        ( gate ); break;
+     case ALGO_LYRA2H:       register_lyra2h_algo       ( gate ); break;
+     case ALGO_LYRA2RE:      register_lyra2re_algo      ( gate ); break;
+     case ALGO_LYRA2REV2:    register_lyra2rev2_algo    ( gate ); break;
+     case ALGO_LYRA2REV3:    register_lyra2rev3_algo    ( gate ); break;
+     case ALGO_LYRA2Z:       register_lyra2z_algo       ( gate ); break;
+     case ALGO_LYRA2Z330:    register_lyra2z330_algo    ( gate ); break;
+     case ALGO_M7M:          register_m7m_algo          ( gate ); break;
+     case ALGO_MYR_GR:       register_myriad_algo       ( gate ); break;
+     case ALGO_NEOSCRYPT:    register_neoscrypt_algo    ( gate ); break;
+     case ALGO_NIST5:        register_nist5_algo        ( gate ); break;
+     case ALGO_PENTABLAKE:   register_pentablake_algo   ( gate ); break;
+     case ALGO_PHI1612:      register_phi1612_algo      ( gate ); break;
+     case ALGO_PHI2:         register_phi2_algo         ( gate ); break;
+     case ALGO_PLUCK:        register_pluck_algo        ( gate ); break;
+     case ALGO_POLYTIMOS:    register_polytimos_algo    ( gate ); break;
+     case ALGO_QUARK:        register_quark_algo        ( gate ); break;
+     case ALGO_QUBIT:        register_qubit_algo        ( gate ); break;
+     case ALGO_SCRYPT:       register_scrypt_algo       ( gate ); break;
+     case ALGO_SCRYPTJANE:   register_scryptjane_algo   ( gate ); break;
+     case ALGO_SHA256D:      register_sha256d_algo      ( gate ); break;
+     case ALGO_SHA256T:      register_sha256t_algo      ( gate ); break;
+     case ALGO_SHAVITE3:     register_shavite_algo      ( gate ); break;
+     case ALGO_SKEIN:        register_skein_algo        ( gate ); break;
+     case ALGO_SKEIN2:       register_skein2_algo       ( gate ); break;
+     case ALGO_SKUNK:        register_skunk_algo        ( gate ); break;
+     case ALGO_TIMETRAVEL:   register_timetravel_algo   ( gate ); break;
+     case ALGO_TIMETRAVEL10: register_timetravel10_algo ( gate ); break;
+     case ALGO_TRIBUS:       register_tribus_algo       ( gate ); break;
+     case ALGO_VANILLA:      register_vanilla_algo      ( gate ); break;
+     case ALGO_VELTOR:       register_veltor_algo       ( gate ); break;
+     case ALGO_WHIRLPOOL:    register_whirlpool_algo    ( gate ); break;
+     case ALGO_WHIRLPOOLX:   register_whirlpoolx_algo   ( gate ); break;
+     case ALGO_X11:          register_x11_algo          ( gate ); break;
+     case ALGO_X11EVO:       register_x11evo_algo       ( gate ); break;
+     case ALGO_X11GOST:      register_x11gost_algo      ( gate ); break;
+     case ALGO_X12:          register_x12_algo          ( gate ); break;
+     case ALGO_X13:          register_x13_algo          ( gate ); break;
+     case ALGO_X13SM3:       register_x13sm3_algo       ( gate ); break;
+     case ALGO_X14:          register_x14_algo          ( gate ); break;
+     case ALGO_X15:          register_x15_algo          ( gate ); break;
+     case ALGO_X16R:         register_x16r_algo         ( gate ); break;
+     case ALGO_X16S:         register_x16s_algo         ( gate ); break;
+     case ALGO_X17:          register_x17_algo          ( gate ); break;
+     case ALGO_XEVAN:        register_xevan_algo        ( gate ); break;
+/*    case ALGO_YESCRYPT:     register_yescrypt_05_algo     ( gate ); break;
+     case ALGO_YESCRYPTR8:   register_yescryptr8_05_algo   ( gate ); break;
+     case ALGO_YESCRYPTR16:  register_yescryptr16_05_algo  ( gate ); break;
+     case ALGO_YESCRYPTR32:  register_yescryptr32_05_algo  ( gate ); break;
+*/
+     case ALGO_YESCRYPT:     register_yescrypt_algo     ( gate ); break;
+     case ALGO_YESCRYPTR8:   register_yescryptr8_algo   ( gate ); break;
+     case ALGO_YESCRYPTR16:  register_yescryptr16_algo  ( gate ); break;
+     case ALGO_YESCRYPTR32:  register_yescryptr32_algo  ( gate ); break;
+
+     case ALGO_YESPOWER:     register_yespower_algo     ( gate ); break;
+     case ALGO_YESPOWERR16:  register_yespowerr16_algo  ( gate ); break;
+     case ALGO_ZR5:          register_zr5_algo          ( gate ); break;
    default:
        applog(LOG_ERR,"FAIL: algo_gate registration failed, unknown algo %s.\n", algo_names[opt_algo] );
        return false;
@@ -288,6 +302,9 @@ void exec_hash_function( int algo, void *output, const void *pdata )
 const char* const algo_alias_map[][2] =
 {
 //   alias                proper
+  { "argon2d-crds",      "argon2d250"   },
+  { "argon2d-dyn",       "argon2d500"   },
+  { "argon2d-uis",       "argon2d4096"  },
  { "bitcore",           "timetravel10" },
  { "bitzeny",           "yescryptr8"   },
  { "blake256r8",        "blakecoin"    },
@@ -305,6 +322,7 @@ const char* const algo_alias_map[][2] =
  { "jane",              "scryptjane"   }, 
  { "lyra2",             "lyra2re"      },
  { "lyra2v2",           "lyra2rev2"    },
+  { "lyra2v3",           "lyra2rev3"    },
  { "lyra2zoin",         "lyra2z330"    },
  { "myrgr",             "myr-gr"       },
  { "myriad",            "myr-gr"       },
--- a/algo-gate-api.h
+++ b/algo-gate-api.h
@@ -2,6 +2,8 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include "miner.h"
+#include "avxdefs.h"
+#include "interleave.h"

 /////////////////////////////
 ////
@@ -91,6 +93,7 @@ typedef  uint32_t set_t;
 #define AVX_OPT         8
 #define AVX2_OPT     0x10
 #define SHA_OPT      0x20
+#define AVX512_OPT   0x40

 // return set containing all elements from sets a & b
 inline set_t set_union ( set_t a, set_t b ) { return a | b; }
@@ -119,7 +122,7 @@ void ( *stratum_gen_work )       ( struct stratum_ctx*, struct work* );
 void ( *get_new_work )           ( struct work*, struct work*, int, uint32_t*,
                                   bool );
 uint32_t *( *get_nonceptr )      ( uint32_t* );
-void ( *display_extra_data )     ( struct work*, uint64_t* );
+void ( *decode_extra_data )      ( struct work*, uint64_t* );
 void ( *wait_for_diff )          ( struct stratum_ctx* );
 int64_t ( *get_max64 )           ();
 bool ( *work_decode )            ( const json_t*, struct work* );
@@ -128,7 +131,7 @@ bool ( *submit_getwork_result )  ( CURL*, struct work* );
 void ( *gen_merkle_root )        ( char*, struct stratum_ctx* );
 void ( *build_extraheader )      ( struct work*, struct stratum_ctx* );
 void ( *build_block_header )     ( struct work*, uint32_t, uint32_t*,
-                                   uint32_t*, uint32_t, uint32_t );
+	                           uint32_t*, uint32_t, uint32_t );
 void ( *build_stratum_request )  ( char*, struct work*, struct stratum_ctx* );
 char* ( *malloc_txs_request )    ( struct work* );
 void ( *set_work_data_endian )   ( struct work* );
@@ -139,10 +142,10 @@ bool ( *do_this_thread )         ( int );
 json_t* (*longpoll_rpc_call)     ( CURL*, int*, char* );
 bool ( *stratum_handle_response )( json_t* );
 set_t optimizations;
+int  ( *get_work_data_size )     ();
 int  ntime_index;
 int  nbits_index;
 int  nonce_index;            // use with caution, see warning below
-int  work_data_size;
 int  work_cmp_size;

 } algo_gate_t;
@@ -239,8 +242,8 @@ void set_work_data_big_endian( struct work *work );
 double std_calc_network_diff( struct work *work );

 void std_build_block_header( struct work* g_work, uint32_t version,
-                             uint32_t *prevhash, uint32_t *merkle_root,
-                             uint32_t ntime, uint32_t nbits );
+	                     uint32_t *prevhash,  uint32_t *merkle_root,
+   	                     uint32_t ntime, uint32_t nbits );

 void std_build_extraheader( struct work *work, struct stratum_ctx *sctx );

@@ -253,6 +256,8 @@ bool jr2_stratum_handle_response( json_t *val );
 bool std_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
                        int thr_id );

+int std_get_work_data_size();
+
 // Gate admin functions

 // Called from main to initialize all gate functions and algo-specific data
--- a/algo/argon2/argon2d/argon2d-gate.c
+++ b/algo/argon2/argon2d/argon2d-gate.c
@@ -28,6 +28,7 @@ void argon2d_crds_hash( void *output, const void *input )
 	context.lanes = 4;    // Degree of Parallelism
 	context.threads = 1;  // Threads
 	context.t_cost = 1;   // Iterations
+        context.version = ARGON2_VERSION_10;

 	argon2_ctx( &context, Argon2_d );
 }
@@ -70,7 +71,8 @@ bool register_argon2d_crds_algo( algo_gate_t* gate )
        gate->scanhash = (void*)&scanhash_argon2d_crds;
        gate->hash = (void*)&argon2d_crds_hash;
        gate->set_target = (void*)&scrypt_set_target;
-        gate->optimizations = SSE2_OPT | AVX2_OPT;
+        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        return true;
 }

 // Dynamic
@@ -96,6 +98,7 @@ void argon2d_dyn_hash( void *output, const void *input )
    context.lanes = 8;     // Degree of Parallelism
    context.threads = 1;   // Threads
    context.t_cost = 2;    // Iterations
+    context.version = ARGON2_VERSION_10;

    argon2_ctx( &context, Argon2_d );
 }
@@ -138,6 +141,58 @@ bool register_argon2d_dyn_algo( algo_gate_t* gate )
        gate->scanhash = (void*)&scanhash_argon2d_dyn;
        gate->hash = (void*)&argon2d_dyn_hash;
        gate->set_target = (void*)&scrypt_set_target;
-        gate->optimizations = SSE2_OPT | AVX2_OPT;
+        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        return true;
+}
+
+// Unitus: scan nonces from pdata[19] using Argon2d (t_cost 1, m_cost 4096,
+// parallelism 1, version 0x13). Returns 1 when a share is found, else 0.
+int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
+                           uint64_t *hashes_done)
+{
+   uint32_t _ALIGN(64) vhash[8];
+   uint32_t _ALIGN(64) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   // Argon2d cost parameters handed to argon2d_hash_raw().
+   uint32_t t_cost = 1; // 1 iteration
+   uint32_t m_cost = 4096; // memory cost in KiB (4 MiB)
+   uint32_t parallelism = 1; // parallelism degree of 1
+   // Words 0..18 are fixed for this work item; convert them once.
+   for ( int i = 0; i < 19; i++ )
+      be32enc( &endiandata[i], pdata[i] );
+   // Only the nonce (word 19) changes per iteration.
+   do {
+      be32enc( &endiandata[19], n );
+      argon2d_hash_raw( t_cost, m_cost, parallelism, (char*) endiandata, 80,
+                 (char*) endiandata, 80, (char*) vhash, 32, ARGON2_VERSION_13 );
+      if ( vhash[7] < Htarg && fulltest( vhash, ptarget ) )
+      {
+         *hashes_done = n - first_nonce + 1;
+         pdata[19] = n;
+         return 1;
+      }
+      n++;
+      // n now refers to the next untested nonce.
+   } while (n < max_nonce && !work_restart[thr_id].restart);
+   // n was advanced past the last nonce hashed, so n - first_nonce is exact.
+   *hashes_done = n - first_nonce;
+   pdata[19] = n;
+
+   return 0;
+}
+// Returns the constant 0x1ff; installed as gate->get_max64 for argon2d4096.
+int64_t get_max64_0x1ff() { return 0x1ff; }
+// Unitus gate setup: overrides scanhash, set_target, get_max64, optimizations.
+bool register_argon2d4096_algo( algo_gate_t* gate )
+{
+        gate->scanhash = (void*)&scanhash_argon2d4096; // gate->hash is not assigned here
+        gate->set_target = (void*)&scrypt_set_target;
+        gate->get_max64  = (void*)&get_max64_0x1ff;
+        gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT;
+        return true;
 }

--- a/algo/argon2/argon2d/argon2d-gate.h
+++ b/algo/argon2/argon2d/argon2d-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include <stdint.h>

-// Credits
+// Credits: version = 0x10, m_cost = 250.
 bool register_argon2d_crds_algo( algo_gate_t* gate );

 void argon2d_crds_hash( void *state, const void *input );
@@ -12,7 +12,7 @@ void argon2d_crds_hash( void *state, const void *input );
 int scanhash_argon2d_crds( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );

-// Dynamic
+// Dynamic: version = 0x10, m_cost = 500.
 bool register_argon2d_dyn_algo( algo_gate_t* gate );

 void argon2d_dyn_hash( void *state, const void *input );
@@ -21,5 +21,11 @@ int scanhash_argon2d_dyn( int thr_id, struct work *work, uint32_t max_nonce,
                    uint64_t *hashes_done );


+// Unitus: version = 0x13, m_cost = 4096.
+bool register_argon2d4096_algo( algo_gate_t* gate );
+
+int scanhash_argon2d4096( int thr_id, struct work *work, uint32_t max_nonce,
+                    uint64_t *hashes_done );
+
 #endif

--- a/algo/argon2/argon2d/argon2d/argon2.c
+++ b/algo/argon2/argon2d/argon2d/argon2.c
@@ -180,60 +180,65 @@ int argon2i_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
                         const uint32_t parallelism, const void *pwd,
                         const size_t pwdlen, const void *salt,
                         const size_t saltlen, const size_t hashlen,
-                         char *encoded, const size_t encodedlen) {
+                         char *encoded, const size_t encodedlen,
+                         const uint32_t version) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
                       NULL, hashlen, encoded, encodedlen, Argon2_i,
-                       ARGON2_VERSION_NUMBER);
+                       version );
 }

 int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                     const uint32_t parallelism, const void *pwd,
                     const size_t pwdlen, const void *salt,
-                     const size_t saltlen, void *hash, const size_t hashlen) {
+                     const size_t saltlen, void *hash, const size_t hashlen,
+                     const uint32_t version ) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
-                       hash, hashlen, NULL, 0, Argon2_i, ARGON2_VERSION_NUMBER);
+                       hash, hashlen, NULL, 0, Argon2_i, version );
 }

 int argon2d_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
                         const uint32_t parallelism, const void *pwd,
                         const size_t pwdlen, const void *salt,
                         const size_t saltlen, const size_t hashlen,
-                         char *encoded, const size_t encodedlen) {
+                         char *encoded, const size_t encodedlen,
+                         const uint32_t version ) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
                       NULL, hashlen, encoded, encodedlen, Argon2_d,
-                       ARGON2_VERSION_NUMBER);
+                       version );
 }

 int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                     const uint32_t parallelism, const void *pwd,
                     const size_t pwdlen, const void *salt,
-                     const size_t saltlen, void *hash, const size_t hashlen) {
+                     const size_t saltlen, void *hash, const size_t hashlen,
+                     const uint32_t version ) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
-                       hash, hashlen, NULL, 0, Argon2_d, ARGON2_VERSION_NUMBER);
+                       hash, hashlen, NULL, 0, Argon2_d, version );
 }

 int argon2id_hash_encoded(const uint32_t t_cost, const uint32_t m_cost,
                          const uint32_t parallelism, const void *pwd,
                          const size_t pwdlen, const void *salt,
                          const size_t saltlen, const size_t hashlen,
-                          char *encoded, const size_t encodedlen) {
+                          char *encoded, const size_t encodedlen,
+                          const uint32_t version ) {

    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
                       NULL, hashlen, encoded, encodedlen, Argon2_id,
-                       ARGON2_VERSION_NUMBER);
+                       version);
 }

 int argon2id_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                      const uint32_t parallelism, const void *pwd,
                      const size_t pwdlen, const void *salt,
-                      const size_t saltlen, void *hash, const size_t hashlen) {
+                      const size_t saltlen, void *hash, const size_t hashlen,
+                      const uint32_t version ) {
    return argon2_hash(t_cost, m_cost, parallelism, pwd, pwdlen, salt, saltlen,
-                       hash, hashlen, NULL, 0, Argon2_id,
-                       ARGON2_VERSION_NUMBER);
+                       hash, hashlen, NULL, 0, Argon2_id, version );
 }

 static int argon2_compare(const uint8_t *b1, const uint8_t *b2, size_t len) {
@@ -443,10 +448,11 @@ const char *argon2_error_message(int error_code) {
        return "Unknown error code";
    }
 }
-
+/*
 size_t argon2_encodedlen(uint32_t t_cost, uint32_t m_cost, uint32_t parallelism,
                         uint32_t saltlen, uint32_t hashlen, argon2_type type) {
  return strlen("$$v=$m=,t=,p=$$") + strlen(argon2_type2string(type, 0)) +
         numlen(t_cost) + numlen(m_cost) + numlen(parallelism) +
         b64len(saltlen) + b64len(hashlen) + numlen(ARGON2_VERSION_NUMBER) + 1;
 }
+*/
--- a/algo/argon2/argon2d/argon2d/argon2.h
+++ b/algo/argon2/argon2d/argon2d/argon2.h
@@ -225,11 +225,8 @@ typedef enum Argon2_type {
 } argon2_type;

 /* Version of the algorithm */
-typedef enum Argon2_version {
-    ARGON2_VERSION_10 = 0x10,
-    ARGON2_VERSION_13 = 0x13,
-    ARGON2_VERSION_NUMBER = ARGON2_VERSION_10
-} argon2_version;
+#define ARGON2_VERSION_10 0x10
+#define ARGON2_VERSION_13 0x13

 /*
 * Function that gives the string representation of an argon2_type.
@@ -267,7 +264,8 @@ ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost,
                                       const void *pwd, const size_t pwdlen,
                                       const void *salt, const size_t saltlen,
                                       const size_t hashlen, char *encoded,
-                                       const size_t encodedlen);
+                                       const size_t encodedlen,
+                                       const uint32_t version );

 /**
 * Hashes a password with Argon2i, producing a raw hash at @hash
@@ -287,7 +285,8 @@ ARGON2_PUBLIC int argon2i_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                                   const uint32_t parallelism, const void *pwd,
                                   const size_t pwdlen, const void *salt,
                                   const size_t saltlen, void *hash,
-                                   const size_t hashlen);
+                                   const size_t hashlen,
+                                   const uint32_t version );

 ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
                                       const uint32_t m_cost,
@@ -295,13 +294,15 @@ ARGON2_PUBLIC int argon2d_hash_encoded(const uint32_t t_cost,
                                       const void *pwd, const size_t pwdlen,
                                       const void *salt, const size_t saltlen,
                                       const size_t hashlen, char *encoded,
-                                       const size_t encodedlen);
+                                       const size_t encodedlen,
+                                       const uint32_t version );

 ARGON2_PUBLIC int argon2d_hash_raw(const uint32_t t_cost, const uint32_t m_cost,
                                   const uint32_t parallelism, const void *pwd,
                                   const size_t pwdlen, const void *salt,
                                   const size_t saltlen, void *hash,
-                                   const size_t hashlen);
+                                   const size_t hashlen,
+                                   const uint32_t version );

 ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
                                        const uint32_t m_cost,
@@ -309,14 +310,16 @@ ARGON2_PUBLIC int argon2id_hash_encoded(const uint32_t t_cost,
                                        const void *pwd, const size_t pwdlen,
                                        const void *salt, const size_t saltlen,
                                        const size_t hashlen, char *encoded,
-                                        const size_t encodedlen);
+                                        const size_t encodedlen,
+                                        const uint32_t version );

 ARGON2_PUBLIC int argon2id_hash_raw(const uint32_t t_cost,
                                    const uint32_t m_cost,
                                    const uint32_t parallelism, const void *pwd,
                                    const size_t pwdlen, const void *salt,
                                    const size_t saltlen, void *hash,
-                                    const size_t hashlen);
+                                    const size_t hashlen,
+                                    const uint32_t version );

 /* generic function underlying the above ones */
 ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
@@ -325,7 +328,7 @@ ARGON2_PUBLIC int argon2_hash(const uint32_t t_cost, const uint32_t m_cost,
                              const size_t saltlen, void *hash,
                              const size_t hashlen, char *encoded,
                              const size_t encodedlen, argon2_type type,
-                              const uint32_t version);
+                              const uint32_t version );

 /**
 * Verifies a password against an encoded string
--- a/algo/argon2/argon2d/argon2d/core.c
+++ b/algo/argon2/argon2d/argon2d/core.c
@@ -544,7 +544,8 @@ void initial_hash(uint8_t *blockhash, argon2_context *context,
    store32(&value, context->t_cost);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

-    store32(&value, ARGON2_VERSION_NUMBER);
+    // Hash the version selected in the context instead of a fixed constant.
+    store32(&value, context->version);
    blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value));

    store32(&value, (uint32_t)type);
--- a/algo/argon2/argon2d/argon2d/opt.c
+++ b/algo/argon2/argon2d/argon2d/opt.c
@@ -345,15 +345,15 @@ void fill_segment(const argon2_instance_t *instance,
        ref_block =
            instance->memory + instance->lane_length * ref_lane + ref_index;
        curr_block = instance->memory + curr_offset;
-        // if (ARGON2_VERSION_10 == instance->version) {
-        //     /* version 1.2.1 and earlier: overwrite, not XOR */
-        //     fill_block(state, ref_block, curr_block, 0);
-        // } else {
-        //     if(0 == position.pass) {
+         if (ARGON2_VERSION_10 == instance->version) {
+             /* version 1.2.1 and earlier: overwrite, not XOR */
+             fill_block(state, ref_block, curr_block, 0);
+         } else {
+             if(0 == position.pass) {
                fill_block(state, ref_block, curr_block, 0);
-        //     } else {
-        //         fill_block(state, ref_block, curr_block, 1);
-        //     }
-        // }
+             } else {
+                 fill_block(state, ref_block, curr_block, 1);
+             }
+         }
    }
 }
--- a/algo/blake/blake-4way.c
+++ b/algo/blake/blake-4way.c
@@ -15,7 +15,7 @@ void blakehash_4way(void *state, const void *input)
     memcpy( &ctx, &blake_4w_ctx, sizeof ctx );
     blake256r14_4way( &ctx, input + (64<<2), 16 );
     blake256r14_4way_close( &ctx, vhash );
-     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -37,7 +37,7 @@ int scanhash_blake_4way( int thr_id, struct work *work, uint32_t max_nonce,

   // we need big endian data...
   swab32_array( edata, pdata, 20 );
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   blake256r14_4way_init( &blake_4w_ctx );
   blake256r14_4way( &blake_4w_ctx, vdata, 64 );

--- a/algo/blake/blake-hash-4way.c
+++ b/algo/blake/blake-hash-4way.c
@@ -363,14 +363,14 @@ static const sph_u64 CB[16] = {
 do { \
   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
                 _mm_set_epi32( c1, c1, c1, c1 ), m0 ), b ), a ); \
-   d = mm_ror_32( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi32( c, d ); \
-   b = mm_ror_32( _mm_xor_si128( b, c ), 12 ); \
+   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
   a = _mm_add_epi32( _mm_add_epi32( _mm_xor_si128( \
                 _mm_set_epi32( c0, c0, c0, c0 ), m1 ), b ), a ); \
-   d = mm_ror_32( _mm_xor_si128( d, a ), 8 ); \
+   d = mm128_ror_32( _mm_xor_si128( d, a ), 8 ); \
   c = _mm_add_epi32( c, d ); \
-   b = mm_ror_32( _mm_xor_si128( b, c ), 7 ); \
+   b = mm128_ror_32( _mm_xor_si128( b, c ), 7 ); \
 } while (0)

 #if SPH_COMPACT_BLAKE_32
@@ -562,22 +562,22 @@ do { \
                          , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \
        VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \
                            _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \
-	M[0x0] = mm_bswap_32( *(buf +  0) ); \
-	M[0x1] = mm_bswap_32( *(buf +  1) ); \
-	M[0x2] = mm_bswap_32( *(buf +  2) ); \
-	M[0x3] = mm_bswap_32( *(buf +  3) ); \
-	M[0x4] = mm_bswap_32( *(buf +  4) ); \
-	M[0x5] = mm_bswap_32( *(buf +  5) ); \
-	M[0x6] = mm_bswap_32( *(buf +  6) ); \
-	M[0x7] = mm_bswap_32( *(buf +  7) ); \
-	M[0x8] = mm_bswap_32( *(buf +  8) ); \
-	M[0x9] = mm_bswap_32( *(buf +  9) ); \
-	M[0xA] = mm_bswap_32( *(buf + 10) ); \
-	M[0xB] = mm_bswap_32( *(buf + 11) ); \
-	M[0xC] = mm_bswap_32( *(buf + 12) ); \
-	M[0xD] = mm_bswap_32( *(buf + 13) ); \
-	M[0xE] = mm_bswap_32( *(buf + 14) ); \
-	M[0xF] = mm_bswap_32( *(buf + 15) ); \
+	M[0x0] = mm128_bswap_32( *(buf +  0) ); \
+	M[0x1] = mm128_bswap_32( *(buf +  1) ); \
+	M[0x2] = mm128_bswap_32( *(buf +  2) ); \
+	M[0x3] = mm128_bswap_32( *(buf +  3) ); \
+	M[0x4] = mm128_bswap_32( *(buf +  4) ); \
+	M[0x5] = mm128_bswap_32( *(buf +  5) ); \
+	M[0x6] = mm128_bswap_32( *(buf +  6) ); \
+	M[0x7] = mm128_bswap_32( *(buf +  7) ); \
+	M[0x8] = mm128_bswap_32( *(buf +  8) ); \
+	M[0x9] = mm128_bswap_32( *(buf +  9) ); \
+	M[0xA] = mm128_bswap_32( *(buf + 10) ); \
+	M[0xB] = mm128_bswap_32( *(buf + 11) ); \
+	M[0xC] = mm128_bswap_32( *(buf + 12) ); \
+	M[0xD] = mm128_bswap_32( *(buf + 13) ); \
+	M[0xE] = mm128_bswap_32( *(buf + 14) ); \
+	M[0xF] = mm128_bswap_32( *(buf + 15) ); \
 	for (r = 0; r < rounds; r ++) \
 		ROUND_S_4WAY(r); \
        H0 = _mm_xor_si128( _mm_xor_si128( \
@@ -624,22 +624,22 @@ do { \
   VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \
   VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \
   VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \
-   M0 = mm_bswap_32( * buf ); \
-   M1 = mm_bswap_32( *(buf+1) ); \
-   M2 = mm_bswap_32( *(buf+2) ); \
-   M3 = mm_bswap_32( *(buf+3) ); \
-   M4 = mm_bswap_32( *(buf+4) ); \
-   M5 = mm_bswap_32( *(buf+5) ); \
-   M6 = mm_bswap_32( *(buf+6) ); \
-   M7 = mm_bswap_32( *(buf+7) ); \
-   M8 = mm_bswap_32( *(buf+8) ); \
-   M9 = mm_bswap_32( *(buf+9) ); \
-   MA = mm_bswap_32( *(buf+10) ); \
-   MB = mm_bswap_32( *(buf+11) ); \
-   MC = mm_bswap_32( *(buf+12) ); \
-   MD = mm_bswap_32( *(buf+13) ); \
-   ME = mm_bswap_32( *(buf+14) ); \
-   MF = mm_bswap_32( *(buf+15) ); \
+   M0 = mm128_bswap_32( * buf ); \
+   M1 = mm128_bswap_32( *(buf+1) ); \
+   M2 = mm128_bswap_32( *(buf+2) ); \
+   M3 = mm128_bswap_32( *(buf+3) ); \
+   M4 = mm128_bswap_32( *(buf+4) ); \
+   M5 = mm128_bswap_32( *(buf+5) ); \
+   M6 = mm128_bswap_32( *(buf+6) ); \
+   M7 = mm128_bswap_32( *(buf+7) ); \
+   M8 = mm128_bswap_32( *(buf+8) ); \
+   M9 = mm128_bswap_32( *(buf+9) ); \
+   MA = mm128_bswap_32( *(buf+10) ); \
+   MB = mm128_bswap_32( *(buf+11) ); \
+   MC = mm128_bswap_32( *(buf+12) ); \
+   MD = mm128_bswap_32( *(buf+13) ); \
+   ME = mm128_bswap_32( *(buf+14) ); \
+   MF = mm128_bswap_32( *(buf+15) ); \
   ROUND_S_4WAY(0); \
   ROUND_S_4WAY(1); \
   ROUND_S_4WAY(2); \
@@ -1073,8 +1073,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
       if (out_size_w32 == 8)
           buf[52>>2] = _mm_or_si128( buf[52>>2],
                                        _mm_set1_epi32( 0x01000000UL ) );
-       *(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
-       *(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
+       *(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
+       *(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
       blake32_4way( sc, buf + (ptr>>2), 64 - ptr );
   }
   else
@@ -1086,13 +1086,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n,
 	memset_zero_128( buf, 56>>2 );
       if (out_size_w32 == 8)
           buf[52>>2] = _mm_set1_epi32( 0x01000000UL );
-        *(buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) );
-        *(buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) );
+        *(buf+(56>>2)) = mm128_bswap_32( _mm_set1_epi32( th ) );
+        *(buf+(60>>2)) = mm128_bswap_32( _mm_set1_epi32( tl ) );
 	blake32_4way( sc, buf, 64 );
   }
   out = (__m128i*)dst;
   for ( k = 0; k < out_size_w32; k++ )
-        out[k] = mm_bswap_32( sc->H[k] );
+        out[k] = mm128_bswap_32( sc->H[k] );
 }

 #if defined (__AVX2__)
--- a/algo/blake/blake2s-4way.c
+++ b/algo/blake/blake2s-4way.c
@@ -85,7 +85,8 @@ void blake2s_4way_hash( void *output, const void *input )
   blake2s_4way_update( &ctx, input + (64<<2), 16 );
   blake2s_4way_final( &ctx, vhash, BLAKE2S_OUTBYTES );

-   mm_deinterleave_4x32( output, output+32, output+64, output+96, vhash, 256 );
+   mm128_deinterleave_4x32( output, output+32, output+64, output+96,
+		            vhash, 256 );
 }

 int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -104,7 +105,7 @@ int scanhash_blake2s_4way( int thr_id, struct work *work, uint32_t max_nonce,
   uint32_t *noncep = vdata + 76;   // 19*4

   swab32_array( edata, pdata, 20 );
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   blake2s_4way_init( &blake2s_4w_ctx, BLAKE2S_OUTBYTES );
   blake2s_4way_update( &blake2s_4w_ctx, vdata, 64 );

--- a/algo/blake/blake2s-hash-4way.c
+++ b/algo/blake/blake2s-hash-4way.c
@@ -92,13 +92,13 @@ int blake2s_4way_compress( blake2s_4way_state *S, const __m128i* block )
 #define G4W(r,i,a,b,c,d) \
 do { \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+0] ] ); \
-   d = mm_ror_32( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_ror_32( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi32( c, d ); \
-   b = mm_ror_32( _mm_xor_si128( b, c ), 12 ); \
+   b = mm128_ror_32( _mm_xor_si128( b, c ), 12 ); \
   a = _mm_add_epi32( _mm_add_epi32( a, b ), m[ blake2s_sigma[r][2*i+1] ] ); \
-   d = mm_ror_32( _mm_xor_si128( d, a ),  8 ); \
+   d = mm128_ror_32( _mm_xor_si128( d, a ),  8 ); \
   c = _mm_add_epi32( c, d ); \
-   b = mm_ror_32( _mm_xor_si128( b, c ),  7 ); \
+   b = mm128_ror_32( _mm_xor_si128( b, c ),  7 ); \
 } while(0)

 #define ROUND4W(r)  \
--- a/algo/blake/blakecoin-4way.c
+++ b/algo/blake/blakecoin-4way.c
@@ -17,7 +17,7 @@ void blakecoin_4way_hash(void *state, const void *input)
     blake256r8_4way( &ctx, input + (64<<2), 16 );
     blake256r8_4way_close( &ctx, vhash );

-     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -37,7 +37,7 @@ int scanhash_blakecoin_4way( int thr_id, struct work *work, uint32_t max_nonce,
      HTarget = 0x7f;

   swab32_array( edata, pdata, 20 );
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   blake256r8_4way_init( &blakecoin_4w_ctx );
   blake256r8_4way( &blakecoin_4w_ctx, vdata, 64 );

--- a/algo/blake/decred-4way.c
+++ b/algo/blake/decred-4way.c
@@ -23,7 +23,7 @@ void decred_hash_4way( void *state, const void *input )
     memcpy( &ctx, &blake_mid, sizeof(blake_mid) );
     blake256_4way( &ctx, tail, tail_len );
     blake256_4way_close( &ctx, vhash );
-     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+     mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -44,7 +44,7 @@ int scanhash_decred_4way( int thr_id, struct work *work, uint32_t max_nonce,
   memcpy( edata, pdata, 180 );

   // use the old way until  new way updated for size.
-   mm_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );
+   mm128_interleave_4x32x( vdata, edata, edata, edata, edata, 180*8 );

   blake256_4way_init( &blake_mid );
   blake256_4way( &blake_mid, vdata, DECRED_MIDSTATE_LEN );
--- a/algo/blake/decred-gate.c
+++ b/algo/blake/decred-gate.c
@@ -140,6 +140,7 @@ bool decred_ready_to_mine( struct work* work, struct stratum_ctx* stratum,
   return true;
 }

+int decred_get_work_data_size() { return DECRED_DATA_SIZE; }

 bool register_decred_algo( algo_gate_t* gate )
 {
@@ -154,7 +155,7 @@ bool register_decred_algo( algo_gate_t* gate )
  gate->optimizations = AVX2_OPT;
  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
-  gate->display_extra_data    = (void*)&decred_decode_extradata;
+  gate->decode_extra_data     = (void*)&decred_decode_extradata;
  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
  gate->work_decode           = (void*)&std_be_work_decode;
  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
@@ -163,7 +164,7 @@ bool register_decred_algo( algo_gate_t* gate )
  gate->nbits_index           = DECRED_NBITS_INDEX;
  gate->ntime_index           = DECRED_NTIME_INDEX;
  gate->nonce_index           = DECRED_NONCE_INDEX;
-  gate->work_data_size        = DECRED_DATA_SIZE;
+  gate->get_work_data_size    = (void*)&decred_get_work_data_size;
  gate->work_cmp_size         = DECRED_WORK_COMPARE_SIZE;
  allow_mininginfo            = false;
  have_gbt                    = false;
--- a/algo/blake/decred.c
+++ b/algo/blake/decred.c
@@ -268,7 +268,7 @@ bool register_decred_algo( algo_gate_t* gate )
  gate->hash                  = (void*)&decred_hash;
  gate->get_nonceptr          = (void*)&decred_get_nonceptr;
  gate->get_max64             = (void*)&get_max64_0x3fffffLL;
-  gate->display_extra_data    = (void*)&decred_decode_extradata;
+  gate->decode_extra_data     = (void*)&decred_decode_extradata;
  gate->build_stratum_request = (void*)&decred_be_build_stratum_request;
  gate->work_decode           = (void*)&std_be_work_decode;
  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
--- a/algo/bmw/bmw-hash-4way.c
+++ b/algo/bmw/bmw-hash-4way.c
@@ -77,26 +77,26 @@ static const sph_u64 IV512[] = {
 #define ss0(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
                                 _mm_slli_epi32( (x), 3) ), \
-                  _mm_xor_si128( mm_rol_32( (x),  4), \
-                                 mm_rol_32( (x), 19) ) )
+                  _mm_xor_si128( mm128_rol_32( (x),  4), \
+                                 mm128_rol_32( (x), 19) ) )

 #define ss1(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 1), \
                                 _mm_slli_epi32( (x), 2) ), \
-                  _mm_xor_si128( mm_rol_32( (x),  8), \
-                                 mm_rol_32( (x), 23) ) )
+                  _mm_xor_si128( mm128_rol_32( (x),  8), \
+                                 mm128_rol_32( (x), 23) ) )

 #define ss2(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
                                 _mm_slli_epi32( (x), 1) ), \
-                  _mm_xor_si128( mm_rol_32( (x), 12), \
-                                 mm_rol_32( (x), 25) ) )
+                  _mm_xor_si128( mm128_rol_32( (x), 12), \
+                                 mm128_rol_32( (x), 25) ) )

 #define ss3(x) \
   _mm_xor_si128( _mm_xor_si128( _mm_srli_epi32( (x), 2), \
                                 _mm_slli_epi32( (x), 2) ), \
-                  _mm_xor_si128( mm_rol_32( (x), 15), \
-                                 mm_rol_32( (x), 29) ) )
+                  _mm_xor_si128( mm128_rol_32( (x), 15), \
+                                 mm128_rol_32( (x), 29) ) )

 #define ss4(x) \
  _mm_xor_si128( (x), _mm_srli_epi32( (x), 1 ) )
@@ -104,16 +104,16 @@ static const sph_u64 IV512[] = {
 #define ss5(x) \
  _mm_xor_si128( (x), _mm_srli_epi32( (x), 2 ) )

-#define rs1(x)    mm_rol_32( x,  3 ) 
-#define rs2(x)    mm_rol_32( x,  7 ) 
-#define rs3(x)    mm_rol_32( x, 13 ) 
-#define rs4(x)    mm_rol_32( x, 16 ) 
-#define rs5(x)    mm_rol_32( x, 19 ) 
-#define rs6(x)    mm_rol_32( x, 23 ) 
-#define rs7(x)    mm_rol_32( x, 27 ) 
+#define rs1(x)    mm128_rol_32( x,  3 ) 
+#define rs2(x)    mm128_rol_32( x,  7 ) 
+#define rs3(x)    mm128_rol_32( x, 13 ) 
+#define rs4(x)    mm128_rol_32( x, 16 ) 
+#define rs5(x)    mm128_rol_32( x, 19 ) 
+#define rs6(x)    mm128_rol_32( x, 23 ) 
+#define rs7(x)    mm128_rol_32( x, 27 ) 

 #define rol_off_32( M, j, off ) \
-   mm_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
+   mm128_rol_32( M[ ( (j) + (off) ) & 0xF ] , \
                ( ( (j) + (off) ) & 0xF ) + 1 )

 #define add_elt_s( M, H, j ) \
@@ -526,42 +526,42 @@ void compress_small( const __m128i *M, const __m128i H[16], __m128i dH[16] )
                                     _mm_slli_epi32( qt[23], 2 ) ) ),
                 _mm_xor_si128( _mm_xor_si128( xl, qt[31] ), qt[ 7] ));
   dH[ 8] = _mm_add_epi32( _mm_add_epi32(
-                 mm_rol_32( dH[4], 9 ),
+                 mm128_rol_32( dH[4], 9 ),
                 _mm_xor_si128( _mm_xor_si128( xh, qt[24] ), M[ 8] )),
                 _mm_xor_si128( _mm_slli_epi32( xl, 8 ),
                                _mm_xor_si128( qt[23], qt[ 8] ) ) );
   dH[ 9] = _mm_add_epi32( _mm_add_epi32(
-                 mm_rol_32( dH[5], 10 ),
+                 mm128_rol_32( dH[5], 10 ),
                 _mm_xor_si128( _mm_xor_si128( xh, qt[25] ), M[ 9] )),
                 _mm_xor_si128( _mm_srli_epi32( xl, 6 ),
                                _mm_xor_si128( qt[16], qt[ 9] ) ) );
   dH[10] = _mm_add_epi32( _mm_add_epi32(
-                 mm_rol_32( dH[6], 11 ),
+                 mm128_rol_32( dH[6], 11 ),
                 _mm_xor_si128( _mm_xor_si128( xh, qt[26] ), M[10] )),
                 _mm_xor_si128( _mm_slli_epi32( xl, 6 ),
                                _mm_xor_si128( qt[17], qt[10] ) ) );
   dH[11] = _mm_add_epi32( _mm_add_epi32(
-                 mm_rol_32( dH[7], 12 ),
+                 mm128_rol_32( dH[7], 12 ),
                 _mm_xor_si128( _mm_xor_si128( xh, qt[27] ), M[11] )),
                 _mm_xor_si128( _mm_slli_epi32( xl, 4 ),
                                _mm_xor_si128( qt[18], qt[11] ) ) );
   dH[12] = _mm_add_epi32( _mm_add_epi32(
-                 mm_rol_32( dH[0], 13 ),
+                 mm128_rol_32( dH[0], 13 ),
                 _mm_xor_si128( _mm_xor_si128( xh, qt[28] ), M[12] )),
                 _mm_xor_si128( _mm_srli_epi32( xl, 3 ),
                                _mm_xor_si128( qt[19], qt[12] ) ) );
   dH[13] = _mm_add_epi32( _mm_add_epi32(
-                 mm_rol_32( dH[1], 14 ),
+                 mm128_rol_32( dH[1], 14 ),
                 _mm_xor_si128( _mm_xor_si128( xh, qt[29] ), M[13] )),
                 _mm_xor_si128( _mm_srli_epi32( xl, 4 ),
                                _mm_xor_si128( qt[20], qt[13] ) ) );
   dH[14] = _mm_add_epi32( _mm_add_epi32(
-                 mm_rol_32( dH[2], 15 ),
+                 mm128_rol_32( dH[2], 15 ),
                 _mm_xor_si128( _mm_xor_si128( xh, qt[30] ), M[14] )),
                 _mm_xor_si128( _mm_srli_epi32( xl, 7 ),
                                _mm_xor_si128( qt[21], qt[14] ) ) );
   dH[15] = _mm_add_epi32( _mm_add_epi32(
-                 mm_rol_32( dH[3], 16 ),
+                 mm128_rol_32( dH[3], 16 ),
                 _mm_xor_si128( _mm_xor_si128( xh, qt[31] ), M[15] )),
                 _mm_xor_si128( _mm_srli_epi32( xl, 2 ),
                                _mm_xor_si128( qt[22], qt[15] ) ) );
--- a/algo/cryptonight/cryptolight.c
+++ b/algo/cryptonight/cryptolight.c
@@ -325,7 +325,7 @@ int scanhash_cryptolight(int thr_id, struct work *work,

 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));

-#ifndef NO_AES_NI
+#if defined(__AES__)
 		do {
 			*nonceptr = ++n;
 			cryptolight_hash_ctx_aes_ni(hash, pdata, 76, ctx);
--- a/algo/cryptonight/cryptonight-aesni.c
+++ b/algo/cryptonight/cryptonight-aesni.c
@@ -1,14 +1,11 @@
+#if defined(__AES__)
+
 #include <x86intrin.h>
 #include <memory.h>
 #include "cryptonight.h"
 #include "miner.h"
 #include "crypto/c_keccak.h"
 #include <immintrin.h>
-//#include "avxdefs.h"
-
-void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey);
-void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state);
-void that_fucking_loop(uint8_t a[16], uint8_t b[16], uint8_t *long_state);

 static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)
 {
@@ -25,7 +22,6 @@ static inline void ExpandAESKey256_sub1(__m128i *tmp1, __m128i *tmp2)

 static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
 {
-#ifndef NO_AES_NI
 	__m128i tmp2, tmp4;
 	
 	tmp4 = _mm_aeskeygenassist_si128(*tmp1, 0x00);
@@ -37,14 +33,12 @@ static inline void ExpandAESKey256_sub2(__m128i *tmp1, __m128i *tmp3)
 	tmp4 = _mm_slli_si128(tmp4, 0x04);
 	*tmp3 = _mm_xor_si128(*tmp3, tmp4);
 	*tmp3 = _mm_xor_si128(*tmp3, tmp2);
-#endif
 }

 // Special thanks to Intel for helping me
 // with ExpandAESKey256() and its subroutines
 static inline void ExpandAESKey256(char *keybuf)
 {
-#ifndef NO_AES_NI
 	__m128i tmp1, tmp2, tmp3, *keys;
 	
 	keys = (__m128i *)keybuf;
@@ -91,7 +85,6 @@ static inline void ExpandAESKey256(char *keybuf)
 	tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
 	ExpandAESKey256_sub1(&tmp1, &tmp2);
 	keys[14] = tmp1;
-#endif
 }

 // align to 64 byte cache line
@@ -109,13 +102,19 @@ static __thread cryptonight_ctx ctx;

 void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 {
-#ifndef NO_AES_NI
-
    uint8_t ExpandedKey[256] __attribute__((aligned(64)));
    __m128i *longoutput, *expkey, *xmminput;
    size_t i, j;
    
    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+
+    if ( cryptonightV7 && len < 43 )
+      return;
+
+    const uint64_t tweak = cryptonightV7 
+                         ? *((const uint64_t*) (((const uint8_t*)input) + 35))
+                           ^ ctx.state.hs.w[24] : 0; 
+
    memcpy( ExpandedKey, ctx.state.hs.b, AES_KEY_SIZE );
    ExpandAESKey256( ExpandedKey );
    memcpy( ctx.text, ctx.state.init, INIT_SIZE_BYTE );
@@ -214,7 +213,15 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 	_mm_store_si128( (__m128i*)c, c_x );
        b_x = _mm_xor_si128( b_x, c_x );
        nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
-	_mm_store_si128( lsa, b_x );
+        _mm_store_si128( lsa, b_x );
+
+        if ( cryptonightV7 )
+        {
+           const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
+           const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
+           ((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
+        } 
+
 	b[0] = nextblock[0];
 	b[1] = nextblock[1];

@@ -227,10 +234,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
 		 : "cc" );

        b_x = c_x;
-        nextblock[0] = a[0] + hi;
-        nextblock[1] = a[1] + lo;
-        a[0] = b[0] ^ nextblock[0];
-        a[1] = b[1] ^ nextblock[1];
+
+        a[0] += hi;
+        a[1] += lo;
+        nextblock[0] = a[0];
+        nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
+        a[0] ^= b[0];
+        a[1] ^= b[1];
+
        lsa = (__m128i*)&ctx.long_state[ a[0] & 0x1FFFF0 ];
        a_x = _mm_load_si128( (__m128i*)a );
        c_x = _mm_load_si128( lsa );
@@ -241,6 +252,14 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
    b_x = _mm_xor_si128( b_x, c_x );
    nextblock = (uint64_t *)&ctx.long_state[c[0] & 0x1FFFF0];
    _mm_store_si128( lsa, b_x );
+
+    if ( cryptonightV7 )
+    {
+       const uint8_t tmp = ( (const uint8_t*)(lsa) )[11];
+       const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
+       ((uint8_t*)(lsa))[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
+    }
+
    b[0] = nextblock[0];
    b[1] = nextblock[1];

@@ -251,8 +270,12 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
               "rm" ( b[0] )
             : "cc" );

-    nextblock[0] = a[0] + hi;
-    nextblock[1] = a[1] + lo;
+    a[0] += hi;
+    a[1] += lo;
+    nextblock[0] = a[0];
+    nextblock[1] = cryptonightV7 ? a[1] ^ tweak : a[1];
+    a[0] ^= b[0];
+    a[1] ^= b[1];

    memcpy( ExpandedKey, &ctx.state.hs.b[32], AES_KEY_SIZE );
    ExpandAESKey256( ExpandedKey );
@@ -330,5 +353,5 @@ void cryptonight_hash_aes( void *restrict output, const void *input, int len )
    keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
    extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);

-#endif
 }
+#endif
--- a/algo/cryptonight/cryptonight-common.c
+++ b/algo/cryptonight/cryptonight-common.c
@@ -7,11 +7,11 @@
 #include "cpuminer-config.h"
 #include "algo-gate-api.h"

-#ifndef NO_AES_NI
+#if defined(__AES__)
  #include "algo/groestl/aes_ni/hash-groestl256.h"
-#endif
-
+#else
 #include "crypto/c_groestl.h"
+#endif
 #include "crypto/c_blake256.h"
 #include "crypto/c_jh.h"
 #include "crypto/c_skein.h"
@@ -30,12 +30,12 @@ void do_blake_hash(const void* input, size_t len, char* output) {
 }

 void do_groestl_hash(const void* input, size_t len, char* output) {
-#ifdef NO_AES_NI
-    groestl(input, len * 8, (uint8_t*)output);
-#else
+#if defined(__AES__)
    hashState_groestl256 ctx;
    init_groestl256( &ctx, 32 );
    update_and_final_groestl256( &ctx, output, input, len * 8 );
+#else
+    groestl(input, len * 8, (uint8_t*)output);
 #endif
 }

@@ -52,23 +52,24 @@ void (* const extra_hashes[4])( const void *, size_t, char *) =

 void cryptonight_hash( void *restrict output, const void *input, int len )
 {
-
-#ifdef NO_AES_NI
-  cryptonight_hash_ctx ( output, input, len );
-#else 
+#if defined(__AES__)
  cryptonight_hash_aes( output, input, len );
+#else
+  cryptonight_hash_ctx ( output, input, len );
 #endif
 }

 void cryptonight_hash_suw( void *restrict output, const void *input )
 {
-#ifdef NO_AES_NI
-  cryptonight_hash_ctx ( output, input, 76 );
-#else
+#if defined(__AES__)
  cryptonight_hash_aes( output, input, 76 );
+#else
+  cryptonight_hash_ctx ( output, input, 76 );
 #endif
 }

+bool cryptonightV7 = false;
+
 int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
                   uint64_t *hashes_done )
 {
@@ -80,6 +81,11 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
    const uint32_t first_nonce = n + 1;
    const uint32_t Htarg = ptarget[7];
    uint32_t hash[32 / 4] __attribute__((aligned(32)));
+
+//    if (  (  cryptonightV7 && ( *(uint8_t*)pdata <  7 ) )
+//       || ( !cryptonightV7 && ( *(uint8_t*)pdata == 7 ) ) )
+//          applog(LOG_WARNING,"Cryptonight variant mismatch, shares may be rejected.");
+
    do
    {
       *nonceptr = ++n;
@@ -87,6 +93,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,
       if (unlikely( hash[7] < Htarg ))
       {
           *hashes_done = n - first_nonce + 1;
+//           work_set_target_ratio( work, hash );
 	   return true;
       }
    } while (likely((n <= max_nonce && !work_restart[thr_id].restart)));
@@ -97,6 +104,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,

 bool register_cryptonight_algo( algo_gate_t* gate )
 {
+  cryptonightV7 = false;
  register_json_rpc2( gate );
  gate->optimizations = SSE2_OPT | AES_OPT;
  gate->scanhash         = (void*)&scanhash_cryptonight;
@@ -106,3 +114,15 @@ bool register_cryptonight_algo( algo_gate_t* gate )
  return true;
 };

+bool register_cryptonightv7_algo( algo_gate_t* gate )
+{
+  cryptonightV7 = true;
+  register_json_rpc2( gate );
+  gate->optimizations = SSE2_OPT | AES_OPT;
+  gate->scanhash      = (void*)&scanhash_cryptonight;
+  gate->hash          = (void*)&cryptonight_hash;
+  gate->hash_suw      = (void*)&cryptonight_hash_suw;
+  gate->get_max64     = (void*)&get_max64_0x40LL;
+  return true;
+};
+
--- a/algo/cryptonight/cryptonight.c
+++ b/algo/cryptonight/cryptonight.c
@@ -20,8 +20,8 @@
 #include "crypto/c_jh.h"
 #include "crypto/c_skein.h"
 #include "crypto/int-util.h"
-#include "crypto/hash-ops.h"
-//#include "cryptonight.h"
+//#include "crypto/hash-ops.h"
+#include "cryptonight.h"

 #if USE_INT128

@@ -51,6 +51,7 @@ typedef __uint128_t uint128_t;
 #define INIT_SIZE_BLK   8
 #define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE)

+/*
 #pragma pack(push, 1)
 union cn_slow_hash_state {
 	union hash_state hs;
@@ -78,6 +79,7 @@ static void do_skein_hash(const void* input, size_t len, char* output) {
 	int r = skein_hash(8 * HASH_SIZE, input, 8 * len, (uint8_t*)output);
 	assert(likely(SKEIN_SUCCESS == r));
 }
+*/

 extern int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey);
 extern int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey);
@@ -120,9 +122,11 @@ static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* pro
 extern uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi);
 #endif

+/*
 static void (* const extra_hashes[4])(const void *, size_t, char *) = {
 		do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash
 };
+*/

 static inline size_t e2i(const uint8_t* a) {
 #if !LITE
@@ -132,14 +136,16 @@ static inline size_t e2i(const uint8_t* a) {
 #endif
 }

-static inline void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+static inline void mul_sum_xor_dst( const uint8_t* a, uint8_t* c, uint8_t* dst, 
+         const uint64_t tweak )
+{
 	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
 	hi += ((uint64_t*) c)[0];

 	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
 	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
 	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
+	((uint64_t*) dst)[1] = cryptonightV7 ? lo ^ tweak : lo;
 }

 static inline void xor_blocks(uint8_t* a, const uint8_t* b) {
@@ -174,8 +180,16 @@ static __thread cryptonight_ctx ctx;

 void cryptonight_hash_ctx(void* output, const void* input, int len)
 {
-	hash_process(&ctx.state.hs, (const uint8_t*) input, len);
-	ctx.aes_ctx = (oaes_ctx*) oaes_alloc();
+//    hash_process(&ctx.state.hs, (const uint8_t*) input, len);
+    keccak( (const uint8_t*)input, 76, (char*)&ctx.state.hs.b, 200 );
+
+    if ( cryptonightV7 && len < 43 )
+      return;
+    const uint64_t tweak = cryptonightV7
+                         ? *((const uint64_t*) (((const uint8_t*)input) + 35))
+                           ^ ctx.state.hs.w[24] : 0;
+
+    ctx.aes_ctx = (oaes_ctx*) oaes_alloc();

    __builtin_prefetch( ctx.text,             0, 3 );
    __builtin_prefetch( ctx.text       +  64, 0, 3 );
@@ -211,23 +225,44 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
 	xor_blocks_dst(&ctx.state.k[0], &ctx.state.k[32], ctx.a);
 	xor_blocks_dst(&ctx.state.k[16], &ctx.state.k[48], ctx.b);

-	for (i = 0; likely(i < ITER / 4); ++i) {
-		/* Dependency chain: address -> read value ------+
-		 * written value <-+ hard function (AES or MUL) <+
-		 * next address  <-+
-		 */
-		/* Iteration 1 */
-		j = e2i(ctx.a);
-		aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
-		xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
-		/* Iteration 2 */
-		mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)]);
-		/* Iteration 3 */
-		j = e2i(ctx.a);
-		aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
-		xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
-		/* Iteration 4 */
-		mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)]);
+	for (i = 0; likely(i < ITER / 4); ++i)
+        {
+           /* Dependency chain: address -> read value ------+
+            * written value <-+ hard function (AES or MUL) <+
+            * next address  <-+
+            */
+           /* Iteration 1 */
+           j = e2i(ctx.a);
+           aesb_single_round(&ctx.long_state[j], ctx.c, ctx.a);
+           xor_blocks_dst(ctx.c, ctx.b, &ctx.long_state[j]);
+
+           if ( cryptonightV7 )
+           {
+              uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
+              const uint8_t tmp = lsa[11];
+              const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
+              lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
+           }
+
+           /* Iteration 2 */
+           mul_sum_xor_dst(ctx.c, ctx.a, &ctx.long_state[e2i(ctx.c)], tweak );
+
+           /* Iteration 3 */
+           j = e2i(ctx.a);
+           aesb_single_round(&ctx.long_state[j], ctx.b, ctx.a);
+           xor_blocks_dst(ctx.b, ctx.c, &ctx.long_state[j]);
+
+           if ( cryptonightV7 )
+           {
+              uint8_t *lsa = (uint8_t*)&ctx.long_state[((uint64_t *)(ctx.a))[0] & 0x1FFFF0];
+              const uint8_t tmp = lsa[11];
+              const uint8_t index = ( ( (tmp >> 3) & 6 ) | (tmp & 1) ) << 1;
+              lsa[11] = tmp ^ ( ( 0x75310 >> index) & 0x30 );
+           }
+
+           /* Iteration 4 */
+           mul_sum_xor_dst(ctx.b, ctx.a, &ctx.long_state[e2i(ctx.b)], tweak );
+
 	}

    __builtin_prefetch( ctx.text,             0, 3 );
@@ -266,7 +301,8 @@ void cryptonight_hash_ctx(void* output, const void* input, int len)
 		aesb_pseudo_round_mut(&ctx.text[7 * AES_BLOCK_SIZE], ctx.aes_ctx->key->exp_data);
 	}
 	memcpy(ctx.state.init, ctx.text, INIT_SIZE_BYTE);
-	hash_permutation(&ctx.state.hs);
+//	hash_permutation(&ctx.state.hs);
+        keccakf( (uint64_t*)&ctx.state.hs.w, 24 );
 	/*memcpy(hash, &state, 32);*/
 	extra_hashes[ctx.state.hs.b[0] & 3](&ctx.state, 200, output);
 	oaes_free((OAES_CTX **) &ctx.aes_ctx);
--- a/algo/cryptonight/cryptonight.h
+++ b/algo/cryptonight/cryptonight.h
@@ -45,5 +45,7 @@ int scanhash_cryptonight( int thr_id, struct work *work, uint32_t max_nonce,

 void cryptonight_hash_aes( void *restrict output, const void *input, int len );

+extern bool cryptonightV7;
+
 #endif

--- a/algo/cubehash/cube-hash-2way.c
+++ b/algo/cubehash/cube-hash-2way.c
@@ -92,7 +92,6 @@ int cube_2way_reinit( cube_2way_context *sp )
 {
   memcpy( sp, &cube_2way_ctx_cache, sizeof(cube_2way_context) );
   return 0;
-
 }

 int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,
@@ -123,7 +122,7 @@ int cube_2way_init( cube_2way_context *sp, int hashbitlen, int rounds,

 int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
 {
-    const int len = size / 16;
+    const int len = size >> 4;
    const __m256i *in = (__m256i*)data;
    int i;

@@ -140,7 +139,6 @@ int cube_2way_update( cube_2way_context *sp, const void *data, size_t size )
           sp->pos = 0;
        }
    }
-
    return 0;
 }

@@ -151,25 +149,22 @@ int cube_2way_close( cube_2way_context *sp, void *output )

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
-                    _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
-                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+                                _mm256_set_epi32( 0,0,0,0x80,  0,0,0,0x80 ) );
    transform_2way( sp );

-    sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
-                                                             1,0,0,0 ) );
-    for ( i = 0; i < 10; ++i )
-       transform_2way( &cube_2way_ctx_cache );
+    sp->h[7] = _mm256_xor_si256( sp->h[7],
+		                 _mm256_set_epi32( 1,0,0,0,  1,0,0,0 ) );

-    for ( i = 0; i < sp->hashlen; i++ )
-       hash[i] = sp->h[i];
+    for ( i = 0; i < 10; ++i )           transform_2way( sp );

+    for ( i = 0; i < sp->hashlen; i++ )  hash[i] = sp->h[i];
    return 0;
 }

 int cube_2way_update_close( cube_2way_context *sp, void *output,
                               const void *data, size_t size )
 {
-    const int len = size / 16;
+    const int len = size >> 4;
    const __m256i *in = (__m256i*)data;
    __m256i *hash = (__m256i*)output;
    int i;
@@ -187,18 +182,15 @@ int cube_2way_update_close( cube_2way_context *sp, void *output,

    // pos is zero for 64 byte data, 1 for 80 byte data.
    sp->h[ sp->pos ] = _mm256_xor_si256( sp->h[ sp->pos ],
-                    _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80,
-                                     0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x80 ) );
+                    _mm256_set_epi32( 0,0,0,0x80,  0,0,0,0x80 ) );
    transform_2way( sp );

    sp->h[7] = _mm256_xor_si256( sp->h[7], _mm256_set_epi32( 1,0,0,0,
                                                             1,0,0,0 ) );
-    for ( i = 0; i < 10; ++i )
-       transform_2way( &cube_2way_ctx_cache );

-    for ( i = 0; i < sp->hashlen; i++ )
-       hash[i] = sp->h[i];
+    for ( i = 0; i < 10; ++i )            transform_2way( sp );

+    for ( i = 0; i < sp->hashlen; i++ )   hash[i] = sp->h[i];
    return 0;
 }

--- a/algo/cubehash/cube-hash-2way.h
+++ b/algo/cubehash/cube-hash-2way.h
@@ -10,12 +10,12 @@

 struct _cube_2way_context
 {
+    __m256i h[8];
    int hashlen;           // __m128i
    int rounds;
    int blocksize;         // __m128i
    int pos;               // number of __m128i read into x from current block
-    __m256i h[8] __attribute__ ((aligned (64)));
-};
+} __attribute__ ((aligned (64)));

 typedef struct _cube_2way_context cube_2way_context;

--- a/algo/cubehash/sse2/cubehash_sse2.c
+++ b/algo/cubehash/sse2/cubehash_sse2.c
@@ -254,6 +254,7 @@ int cubehashUpdateDigest( cubehashParam *sp, byte *digest,
    transform( sp );

    sp->x[7] = _mm_xor_si128( sp->x[7], _mm_set_epi32( 1,0,0,0 ) );
+
    transform( sp );
    transform( sp );
    transform( sp );
--- a/algo/cubehash/sse2/cubehash_sse2.h
+++ b/algo/cubehash/sse2/cubehash_sse2.h
--- a/algo/echo/aes_ni/hash.c
+++ b/algo/echo/aes_ni/hash.c
@@ -60,336 +60,174 @@ MYALIGN const unsigned int	zero[]			= {0x00000000, 0x00000000, 0x00000000, 0x000
 MYALIGN const unsigned int	mul2ipt[]		= {0x728efc00, 0x6894e61a, 0x3fc3b14d, 0x25d9ab57, 0xfd5ba600, 0x2a8c71d7, 0x1eb845e3, 0xc96f9234};


-//#include "crypto_hash.h"
-
- int crypto_hash(
-   unsigned char *out,
-   const unsigned char *in,
-   unsigned long long inlen
- )
- {
-
-	 if(hash_echo(512, in, inlen * 8, out) == SUCCESS) 
-		 return 0;
-	 
-	 return -1;
- }
-
-/*
-int main()
-{
-	return 0;
-}
-*/
-
-#if 0
-void DumpState(__m128i *ps)
-{
-	int i, j, k;
-	unsigned int ucol;
-
-	for(j = 0; j < 4; j++)
-	{
-		for(i = 0; i < 4; i++)
-		{
-			printf("row %d,col %d : ", i, j);
-			for(k = 0; k < 4; k++)
-			{
-				ucol = *((int*)ps + 16 * i + 4 * j + k);
-				printf("%02x%02x%02x%02x ", (ucol >> 0) & 0xff, (ucol >> 8) & 0xff, (ucol >> 16) & 0xff, (ucol >> 24) & 0xff);
-			}
-
-			printf("\n");
-		}
-	}
-
-	printf("\n");
-}
-#endif
-
-
-
-
-#ifndef NO_AES_NI
 #define ECHO_SUBBYTES(state, i, j) \
-				state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
-				state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
-				k1 = _mm_add_epi32(k1, M128(const1))
-#else
-#define ECHO_SUBBYTES(state, i, j) \
-				AES_ROUND_VPERM(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
-				state[i][j] = _mm_xor_si128(state[i][j], k1);\
-				AES_ROUND_VPERM(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
-				k1 = _mm_add_epi32(k1, M128(const1))
-
-#define ECHO_SUB_AND_MIX(state, i, j, state2, c, r1, r2, r3, r4) \
-				AES_ROUND_VPERM_CORE(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
-				ktemp = k1;\
-				TRANSFORM(ktemp, _k_ipt, t1, t4);\
-				state[i][j] = _mm_xor_si128(state[i][j], ktemp);\
-				AES_ROUND_VPERM_CORE(state[i][j], t1, t2, t3, t4, s1, s2, s3);\
-				k1 = _mm_add_epi32(k1, M128(const1));\
-				s1 = state[i][j];\
-				s2 = s1;\
-				TRANSFORM(s2, mul2ipt, t1, t2);\
-				s3 = _mm_xor_si128(s1, s2);\
-				state2[r1][c] = _mm_xor_si128(state2[r1][c], s2);\
-				state2[r2][c] = _mm_xor_si128(state2[r2][c], s1);\
-				state2[r3][c] = _mm_xor_si128(state2[r3][c], s1);\
-				state2[r4][c] = _mm_xor_si128(state2[r4][c], s3)
-
-
-
-#endif
-
+	state[i][j] = _mm_aesenc_si128(state[i][j], k1);\
+	state[i][j] = _mm_aesenc_si128(state[i][j], M128(zero));\
+	k1 = _mm_add_epi32(k1, M128(const1))

 #define ECHO_MIXBYTES(state1, state2, j, t1, t2, s2) \
-				s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
-				t1 = _mm_srli_epi16(state1[0][j], 7);\
-				t1 = _mm_and_si128(t1, M128(lsbmask));\
-				t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-				s2 = _mm_xor_si128(s2, t2);\
-				state2[0][j] = s2;\
-				state2[1][j] = state1[0][j];\
-				state2[2][j] = state1[0][j];\
-				state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
-				s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
-				t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
-				t1 = _mm_and_si128(t1, M128(lsbmask));\
-				t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-				s2 = _mm_xor_si128(s2, t2);\
-				state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
-				state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
-				state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
-				state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
-				s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
-				t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
-				t1 = _mm_and_si128(t1, M128(lsbmask));\
-				t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-				s2 = _mm_xor_si128(s2, t2);\
-				state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
-				state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
-				state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
-				state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
-				s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
-				t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
-				t1 = _mm_and_si128(t1, M128(lsbmask));\
-				t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
-				s2 = _mm_xor_si128(s2, t2);\
-				state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
-				state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
-				state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
-				state2[3][j] = _mm_xor_si128(state2[3][j], s2)
+	s2 = _mm_add_epi8(state1[0][j], state1[0][j]);\
+	t1 = _mm_srli_epi16(state1[0][j], 7);\
+	t1 = _mm_and_si128(t1, M128(lsbmask));\
+	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
+	s2 = _mm_xor_si128(s2, t2);\
+	state2[0][j] = s2;\
+	state2[1][j] = state1[0][j];\
+	state2[2][j] = state1[0][j];\
+	state2[3][j] = _mm_xor_si128(s2, state1[0][j]);\
+	s2 = _mm_add_epi8(state1[1][(j + 1) & 3], state1[1][(j + 1) & 3]);\
+	t1 = _mm_srli_epi16(state1[1][(j + 1) & 3], 7);\
+	t1 = _mm_and_si128(t1, M128(lsbmask));\
+	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
+	s2 = _mm_xor_si128(s2, t2);\
+	state2[0][j] = _mm_xor_si128(state2[0][j], _mm_xor_si128(s2, state1[1][(j + 1) & 3]));\
+	state2[1][j] = _mm_xor_si128(state2[1][j], s2);\
+	state2[2][j] = _mm_xor_si128(state2[2][j], state1[1][(j + 1) & 3]);\
+	state2[3][j] = _mm_xor_si128(state2[3][j], state1[1][(j + 1) & 3]);\
+	s2 = _mm_add_epi8(state1[2][(j + 2) & 3], state1[2][(j + 2) & 3]);\
+	t1 = _mm_srli_epi16(state1[2][(j + 2) & 3], 7);\
+	t1 = _mm_and_si128(t1, M128(lsbmask));\
+	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
+	s2 = _mm_xor_si128(s2, t2);\
+	state2[0][j] = _mm_xor_si128(state2[0][j], state1[2][(j + 2) & 3]);\
+	state2[1][j] = _mm_xor_si128(state2[1][j], _mm_xor_si128(s2, state1[2][(j + 2) & 3]));\
+	state2[2][j] = _mm_xor_si128(state2[2][j], s2);\
+	state2[3][j] = _mm_xor_si128(state2[3][j], state1[2][(j + 2) & 3]);\
+	s2 = _mm_add_epi8(state1[3][(j + 3) & 3], state1[3][(j + 3) & 3]);\
+	t1 = _mm_srli_epi16(state1[3][(j + 3) & 3], 7);\
+	t1 = _mm_and_si128(t1, M128(lsbmask));\
+	t2 = _mm_shuffle_epi8(M128(mul2mask), t1);\
+	s2 = _mm_xor_si128(s2, t2);\
+	state2[0][j] = _mm_xor_si128(state2[0][j], state1[3][(j + 3) & 3]);\
+	state2[1][j] = _mm_xor_si128(state2[1][j], state1[3][(j + 3) & 3]);\
+	state2[2][j] = _mm_xor_si128(state2[2][j], _mm_xor_si128(s2, state1[3][(j + 3) & 3]));\
+	state2[3][j] = _mm_xor_si128(state2[3][j], s2)


 #define ECHO_ROUND_UNROLL2 \
-			ECHO_SUBBYTES(_state, 0, 0);\
-			ECHO_SUBBYTES(_state, 1, 0);\
-			ECHO_SUBBYTES(_state, 2, 0);\
-			ECHO_SUBBYTES(_state, 3, 0);\
-			ECHO_SUBBYTES(_state, 0, 1);\
-			ECHO_SUBBYTES(_state, 1, 1);\
-			ECHO_SUBBYTES(_state, 2, 1);\
-			ECHO_SUBBYTES(_state, 3, 1);\
-			ECHO_SUBBYTES(_state, 0, 2);\
-			ECHO_SUBBYTES(_state, 1, 2);\
-			ECHO_SUBBYTES(_state, 2, 2);\
-			ECHO_SUBBYTES(_state, 3, 2);\
-			ECHO_SUBBYTES(_state, 0, 3);\
-			ECHO_SUBBYTES(_state, 1, 3);\
-			ECHO_SUBBYTES(_state, 2, 3);\
-			ECHO_SUBBYTES(_state, 3, 3);\
-			ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
-			ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
-			ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
-			ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
-			ECHO_SUBBYTES(_state2, 0, 0);\
-			ECHO_SUBBYTES(_state2, 1, 0);\
-			ECHO_SUBBYTES(_state2, 2, 0);\
-			ECHO_SUBBYTES(_state2, 3, 0);\
-			ECHO_SUBBYTES(_state2, 0, 1);\
-			ECHO_SUBBYTES(_state2, 1, 1);\
-			ECHO_SUBBYTES(_state2, 2, 1);\
-			ECHO_SUBBYTES(_state2, 3, 1);\
-			ECHO_SUBBYTES(_state2, 0, 2);\
-			ECHO_SUBBYTES(_state2, 1, 2);\
-			ECHO_SUBBYTES(_state2, 2, 2);\
-			ECHO_SUBBYTES(_state2, 3, 2);\
-			ECHO_SUBBYTES(_state2, 0, 3);\
-			ECHO_SUBBYTES(_state2, 1, 3);\
-			ECHO_SUBBYTES(_state2, 2, 3);\
-			ECHO_SUBBYTES(_state2, 3, 3);\
-			ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
-			ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
-			ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
-			ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)
+	ECHO_SUBBYTES(_state, 0, 0);\
+	ECHO_SUBBYTES(_state, 1, 0);\
+	ECHO_SUBBYTES(_state, 2, 0);\
+	ECHO_SUBBYTES(_state, 3, 0);\
+	ECHO_SUBBYTES(_state, 0, 1);\
+	ECHO_SUBBYTES(_state, 1, 1);\
+	ECHO_SUBBYTES(_state, 2, 1);\
+	ECHO_SUBBYTES(_state, 3, 1);\
+	ECHO_SUBBYTES(_state, 0, 2);\
+	ECHO_SUBBYTES(_state, 1, 2);\
+	ECHO_SUBBYTES(_state, 2, 2);\
+	ECHO_SUBBYTES(_state, 3, 2);\
+	ECHO_SUBBYTES(_state, 0, 3);\
+	ECHO_SUBBYTES(_state, 1, 3);\
+	ECHO_SUBBYTES(_state, 2, 3);\
+	ECHO_SUBBYTES(_state, 3, 3);\
+	ECHO_MIXBYTES(_state, _state2, 0, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 1, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 2, t1, t2, s2);\
+	ECHO_MIXBYTES(_state, _state2, 3, t1, t2, s2);\
+	ECHO_SUBBYTES(_state2, 0, 0);\
+	ECHO_SUBBYTES(_state2, 1, 0);\
+	ECHO_SUBBYTES(_state2, 2, 0);\
+	ECHO_SUBBYTES(_state2, 3, 0);\
+	ECHO_SUBBYTES(_state2, 0, 1);\
+	ECHO_SUBBYTES(_state2, 1, 1);\
+	ECHO_SUBBYTES(_state2, 2, 1);\
+	ECHO_SUBBYTES(_state2, 3, 1);\
+	ECHO_SUBBYTES(_state2, 0, 2);\
+	ECHO_SUBBYTES(_state2, 1, 2);\
+	ECHO_SUBBYTES(_state2, 2, 2);\
+	ECHO_SUBBYTES(_state2, 3, 2);\
+	ECHO_SUBBYTES(_state2, 0, 3);\
+	ECHO_SUBBYTES(_state2, 1, 3);\
+	ECHO_SUBBYTES(_state2, 2, 3);\
+	ECHO_SUBBYTES(_state2, 3, 3);\
+	ECHO_MIXBYTES(_state2, _state, 0, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 1, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 2, t1, t2, s2);\
+	ECHO_MIXBYTES(_state2, _state, 3, t1, t2, s2)



 #define SAVESTATE(dst, src)\
-		dst[0][0] = src[0][0];\
-		dst[0][1] = src[0][1];\
-		dst[0][2] = src[0][2];\
-		dst[0][3] = src[0][3];\
-		dst[1][0] = src[1][0];\
-		dst[1][1] = src[1][1];\
-		dst[1][2] = src[1][2];\
-		dst[1][3] = src[1][3];\
-		dst[2][0] = src[2][0];\
-		dst[2][1] = src[2][1];\
-		dst[2][2] = src[2][2];\
-		dst[2][3] = src[2][3];\
-		dst[3][0] = src[3][0];\
-		dst[3][1] = src[3][1];\
-		dst[3][2] = src[3][2];\
-		dst[3][3] = src[3][3]
+	dst[0][0] = src[0][0];\
+	dst[0][1] = src[0][1];\
+	dst[0][2] = src[0][2];\
+	dst[0][3] = src[0][3];\
+	dst[1][0] = src[1][0];\
+	dst[1][1] = src[1][1];\
+	dst[1][2] = src[1][2];\
+	dst[1][3] = src[1][3];\
+	dst[2][0] = src[2][0];\
+	dst[2][1] = src[2][1];\
+	dst[2][2] = src[2][2];\
+	dst[2][3] = src[2][3];\
+	dst[3][0] = src[3][0];\
+	dst[3][1] = src[3][1];\
+	dst[3][2] = src[3][2];\
+	dst[3][3] = src[3][3]


 void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
 {
-	unsigned int r, b, i, j;
-//      __m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
-	__m128i t1, t2, s2, k1;
-	__m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 
+   unsigned int r, b, i, j;
+   __m128i t1, t2, s2, k1;
+   __m128i _state[4][4], _state2[4][4], _statebackup[4][4]; 

+   for(i = 0; i < 4; i++)
+	for(j = 0; j < ctx->uHashSize / 256; j++)
+		_state[i][j] = ctx->state[i][j];

-	for(i = 0; i < 4; i++)
-		for(j = 0; j < ctx->uHashSize / 256; j++)
-			_state[i][j] = ctx->state[i][j];
+   for(b = 0; b < uBlockCount; b++)
+   {
+	ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);

-
-#ifdef NO_AES_NI
-	// transform cv
-	for(i = 0; i < 4; i++)
-		for(j = 0; j < ctx->uHashSize / 256; j++)
-		{
-			TRANSFORM(_state[i][j], _k_ipt, t1, t2);
-		}
-#endif
-
-	for(b = 0; b < uBlockCount; b++)
+	// load message
+	for(j = ctx->uHashSize / 256; j < 4; j++)
 	{
-		ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);
-
-		// load message
-		for(j = ctx->uHashSize / 256; j < 4; j++)
-		{
-			for(i = 0; i < 4; i++)
-			{
-				_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
-
-#ifdef NO_AES_NI
-				// transform message
-				TRANSFORM(_state[i][j], _k_ipt, t1, t2);
-#endif
-			}
-		}
-
-		// save state
-		SAVESTATE(_statebackup, _state);
-
-
-		k1 = ctx->k;
-
-#ifndef NO_AES_NI
-		for(r = 0; r < ctx->uRounds / 2; r++)
-		{
-			ECHO_ROUND_UNROLL2;
-		}
-
-#else
-		for(r = 0; r < ctx->uRounds / 2; r++)
-		{
-			_state2[0][0] = M128(zero); _state2[1][0] = M128(zero); _state2[2][0] = M128(zero); _state2[3][0] = M128(zero);
-			_state2[0][1] = M128(zero); _state2[1][1] = M128(zero); _state2[2][1] = M128(zero); _state2[3][1] = M128(zero);
-			_state2[0][2] = M128(zero); _state2[1][2] = M128(zero); _state2[2][2] = M128(zero); _state2[3][2] = M128(zero);
-			_state2[0][3] = M128(zero); _state2[1][3] = M128(zero); _state2[2][3] = M128(zero); _state2[3][3] = M128(zero);																			
-
-			ECHO_SUB_AND_MIX(_state, 0, 0, _state2, 0, 0, 1, 2, 3);
-			ECHO_SUB_AND_MIX(_state, 1, 0, _state2, 3, 1, 2, 3, 0);
-			ECHO_SUB_AND_MIX(_state, 2, 0, _state2, 2, 2, 3, 0, 1);
-			ECHO_SUB_AND_MIX(_state, 3, 0, _state2, 1, 3, 0, 1, 2);
-			ECHO_SUB_AND_MIX(_state, 0, 1, _state2, 1, 0, 1, 2, 3);
-			ECHO_SUB_AND_MIX(_state, 1, 1, _state2, 0, 1, 2, 3, 0);
-			ECHO_SUB_AND_MIX(_state, 2, 1, _state2, 3, 2, 3, 0, 1);
-			ECHO_SUB_AND_MIX(_state, 3, 1, _state2, 2, 3, 0, 1, 2);
-			ECHO_SUB_AND_MIX(_state, 0, 2, _state2, 2, 0, 1, 2, 3);
-			ECHO_SUB_AND_MIX(_state, 1, 2, _state2, 1, 1, 2, 3, 0);
-			ECHO_SUB_AND_MIX(_state, 2, 2, _state2, 0, 2, 3, 0, 1);
-			ECHO_SUB_AND_MIX(_state, 3, 2, _state2, 3, 3, 0, 1, 2);
-			ECHO_SUB_AND_MIX(_state, 0, 3, _state2, 3, 0, 1, 2, 3);
-			ECHO_SUB_AND_MIX(_state, 1, 3, _state2, 2, 1, 2, 3, 0);
-			ECHO_SUB_AND_MIX(_state, 2, 3, _state2, 1, 2, 3, 0, 1);
-			ECHO_SUB_AND_MIX(_state, 3, 3, _state2, 0, 3, 0, 1, 2);
-
-			_state[0][0] = M128(zero); _state[1][0] = M128(zero); _state[2][0] = M128(zero); _state[3][0] = M128(zero);
-			_state[0][1] = M128(zero); _state[1][1] = M128(zero); _state[2][1] = M128(zero); _state[3][1] = M128(zero);
-			_state[0][2] = M128(zero); _state[1][2] = M128(zero); _state[2][2] = M128(zero); _state[3][2] = M128(zero);
-			_state[0][3] = M128(zero); _state[1][3] = M128(zero); _state[2][3] = M128(zero); _state[3][3] = M128(zero);																			
-
-			ECHO_SUB_AND_MIX(_state2, 0, 0, _state, 0, 0, 1, 2, 3);
-			ECHO_SUB_AND_MIX(_state2, 1, 0, _state, 3, 1, 2, 3, 0);
-			ECHO_SUB_AND_MIX(_state2, 2, 0, _state, 2, 2, 3, 0, 1);
-			ECHO_SUB_AND_MIX(_state2, 3, 0, _state, 1, 3, 0, 1, 2);
-			ECHO_SUB_AND_MIX(_state2, 0, 1, _state, 1, 0, 1, 2, 3);
-			ECHO_SUB_AND_MIX(_state2, 1, 1, _state, 0, 1, 2, 3, 0);
-			ECHO_SUB_AND_MIX(_state2, 2, 1, _state, 3, 2, 3, 0, 1);
-			ECHO_SUB_AND_MIX(_state2, 3, 1, _state, 2, 3, 0, 1, 2);
-			ECHO_SUB_AND_MIX(_state2, 0, 2, _state, 2, 0, 1, 2, 3);
-			ECHO_SUB_AND_MIX(_state2, 1, 2, _state, 1, 1, 2, 3, 0);
-			ECHO_SUB_AND_MIX(_state2, 2, 2, _state, 0, 2, 3, 0, 1);
-			ECHO_SUB_AND_MIX(_state2, 3, 2, _state, 3, 3, 0, 1, 2);
-			ECHO_SUB_AND_MIX(_state2, 0, 3, _state, 3, 0, 1, 2, 3);
-			ECHO_SUB_AND_MIX(_state2, 1, 3, _state, 2, 1, 2, 3, 0);
-			ECHO_SUB_AND_MIX(_state2, 2, 3, _state, 1, 2, 3, 0, 1);
-			ECHO_SUB_AND_MIX(_state2, 3, 3, _state, 0, 3, 0, 1, 2);
-
-		}
-#endif
-
-		
-		if(ctx->uHashSize == 256)
-		{
-			for(i = 0; i < 4; i++)
-			{
-				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
-				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
-				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
-
-				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
-				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
-				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
-				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
-			}
-		}
-		else
-		{
-			for(i = 0; i < 4; i++)
-			{
-				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
-				_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
-
-				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
-				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
-
-				_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
-				_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
-			}
-		}
-
-		pmsg += ctx->uBlockLength;
+	   for(i = 0; i < 4; i++)
+	   {
+		_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
+	   }
 	}

-#ifdef NO_AES_NI
-	// transform state
-	for(i = 0; i < 4; i++)
-		for(j = 0; j < 4; j++)
-		{
-			TRANSFORM(_state[i][j], _k_opt, t1, t2);
-		}
-#endif
+	// save state
+	SAVESTATE(_statebackup, _state);

-		SAVESTATE(ctx->state, _state);
+	k1 = ctx->k;
+
+	for(r = 0; r < ctx->uRounds / 2; r++)
+	{
+		ECHO_ROUND_UNROLL2;
+	}
+		
+	if(ctx->uHashSize == 256)
+	{
+	   for(i = 0; i < 4; i++)
+	   {
+		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
+		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
+		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
+		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
+		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
+		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
+		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
+	   }
+	}
+	else
+	{
+	   for(i = 0; i < 4; i++)
+	   {
+		_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
+		_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
+		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
+		_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
+		_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
+		_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
+           }
+	}
+	pmsg += ctx->uBlockLength;
+   }
+	SAVESTATE(ctx->state, _state);

 }

--- a/algo/echo/aes_ni/hash_api.h
+++ b/algo/echo/aes_ni/hash_api.h
@@ -30,6 +30,7 @@
 typedef struct
 {
 	__m128i			state[4][4];
+        BitSequence             buffer[192];
 	__m128i			k;
 	__m128i			hashsize;
 	__m128i			const1536;
@@ -39,9 +40,8 @@ typedef struct
 	unsigned int	uBlockLength;
 	unsigned int	uBufferBytes;
 	DataLength		processed_bits;
-	BitSequence		buffer[192];

-} hashState_echo;
+} hashState_echo __attribute__ ((aligned (64)));

 HashReturn init_echo(hashState_echo *state, int hashbitlen);

--- a/algo/echo/sse2/echo.c
+++ b/algo/echo/sse2/echo.c
--- a/algo/echo/sse2/sph_echo.h
+++ b/algo/echo/sse2/sph_echo.h
@@ -1,320 +0,0 @@
-/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */
-/**
- * ECHO interface. ECHO is a family of functions which differ by
- * their output size; this implementation defines ECHO for output
- * sizes 224, 256, 384 and 512 bits.
- *
- * ==========================(LICENSE BEGIN)============================
- *
- * Copyright (c) 2007-2010  Projet RNRT SAPHIR
- * 
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * ===========================(LICENSE END)=============================
- *
- * @file     sph_echo.h
- * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
- */
-
-#ifndef SPH_ECHO_H__
-#define SPH_ECHO_H__
-
-#ifdef __cplusplus
-extern "C"{
-#endif
-
-#include <stddef.h>
-#include "algo/sha/sph_types.h"
-
-/**
- * Output size (in bits) for ECHO-224.
- */
-#define SPH_SIZE_echo224   224
-
-/**
- * Output size (in bits) for ECHO-256.
- */
-#define SPH_SIZE_echo256   256
-
-/**
- * Output size (in bits) for ECHO-384.
- */
-#define SPH_SIZE_echo384   384
-
-/**
- * Output size (in bits) for ECHO-512.
- */
-#define SPH_SIZE_echo512   512
-
-/**
- * This structure is a context for ECHO computations: it contains the
- * intermediate values and some data from the last entered block. Once
- * an ECHO computation has been performed, the context can be reused for
- * another computation. This specific structure is used for ECHO-224
- * and ECHO-256.
- *
- * The contents of this structure are private. A running ECHO computation
- * can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char buf[192];    /* first field, for alignment */
-	size_t ptr;
-	union {
-		sph_u32 Vs[4][4];
-#if SPH_64
-		sph_u64 Vb[4][2];
-#endif
-	} u;
-	sph_u32 C0, C1, C2, C3;
-#endif
-} sph_echo_small_context;
-
-/**
- * This structure is a context for ECHO computations: it contains the
- * intermediate values and some data from the last entered block. Once
- * an ECHO computation has been performed, the context can be reused for
- * another computation. This specific structure is used for ECHO-384
- * and ECHO-512.
- *
- * The contents of this structure are private. A running ECHO computation
- * can be cloned by copying the context (e.g. with a simple
- * <code>memcpy()</code>).
- */
-typedef struct {
-#ifndef DOXYGEN_IGNORE
-	unsigned char buf[128];    /* first field, for alignment */
-	size_t ptr;
-	union {
-		sph_u32 Vs[8][4];
-#if SPH_64
-		sph_u64 Vb[8][2];
-#endif
-	} u;
-	sph_u32 C0, C1, C2, C3;
-#endif
-} sph_echo_big_context;
-
-/**
- * Type for a ECHO-224 context (identical to the common "small" context).
- */
-typedef sph_echo_small_context sph_echo224_context;
-
-/**
- * Type for a ECHO-256 context (identical to the common "small" context).
- */
-typedef sph_echo_small_context sph_echo256_context;
-
-/**
- * Type for a ECHO-384 context (identical to the common "big" context).
- */
-typedef sph_echo_big_context sph_echo384_context;
-
-/**
- * Type for a ECHO-512 context (identical to the common "big" context).
- */
-typedef sph_echo_big_context sph_echo512_context;
-
-/**
- * Initialize an ECHO-224 context. This process performs no memory allocation.
- *
- * @param cc   the ECHO-224 context (pointer to a
- *             <code>sph_echo224_context</code>)
- */
-void sph_echo224_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the ECHO-224 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_echo224(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current ECHO-224 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (28 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the ECHO-224 context
- * @param dst   the destination buffer
- */
-void sph_echo224_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (28 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the ECHO-224 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_echo224_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-
-/**
- * Initialize an ECHO-256 context. This process performs no memory allocation.
- *
- * @param cc   the ECHO-256 context (pointer to a
- *             <code>sph_echo256_context</code>)
- */
-void sph_echo256_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the ECHO-256 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_echo256(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current ECHO-256 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (32 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the ECHO-256 context
- * @param dst   the destination buffer
- */
-void sph_echo256_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (32 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the ECHO-256 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_echo256_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-
-/**
- * Initialize an ECHO-384 context. This process performs no memory allocation.
- *
- * @param cc   the ECHO-384 context (pointer to a
- *             <code>sph_echo384_context</code>)
- */
-void sph_echo384_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the ECHO-384 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_echo384(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current ECHO-384 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (48 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the ECHO-384 context
- * @param dst   the destination buffer
- */
-void sph_echo384_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (48 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the ECHO-384 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_echo384_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-
-/**
- * Initialize an ECHO-512 context. This process performs no memory allocation.
- *
- * @param cc   the ECHO-512 context (pointer to a
- *             <code>sph_echo512_context</code>)
- */
-void sph_echo512_init(void *cc);
-
-/**
- * Process some data bytes. It is acceptable that <code>len</code> is zero
- * (in which case this function does nothing).
- *
- * @param cc     the ECHO-512 context
- * @param data   the input data
- * @param len    the input data length (in bytes)
- */
-void sph_echo512(void *cc, const void *data, size_t len);
-
-/**
- * Terminate the current ECHO-512 computation and output the result into
- * the provided buffer. The destination buffer must be wide enough to
- * accomodate the result (64 bytes). The context is automatically
- * reinitialized.
- *
- * @param cc    the ECHO-512 context
- * @param dst   the destination buffer
- */
-void sph_echo512_close(void *cc, void *dst);
-
-/**
- * Add a few additional bits (0 to 7) to the current computation, then
- * terminate it and output the result in the provided buffer, which must
- * be wide enough to accomodate the result (64 bytes). If bit number i
- * in <code>ub</code> has value 2^i, then the extra bits are those
- * numbered 7 downto 8-n (this is the big-endian convention at the byte
- * level). The context is automatically reinitialized.
- *
- * @param cc    the ECHO-512 context
- * @param ub    the extra bits
- * @param n     the number of extra bits (0 to 7)
- * @param dst   the destination buffer
- */
-void sph_echo512_addbits_and_close(
-	void *cc, unsigned ub, unsigned n, void *dst);
-	
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/algo/groestl/myrgr-4way.c
+++ b/algo/groestl/myrgr-4way.c
@@ -33,7 +33,7 @@ void myriad_4way_hash( void *output, const void *input )
     myrgr_4way_ctx_holder ctx;
     memcpy( &ctx, &myrgr_4way_ctx, sizeof(myrgr_4way_ctx) );

-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );
+     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, input, 640 );

     update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 640 );
     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
@@ -43,12 +43,12 @@ void myriad_4way_hash( void *output, const void *input )
     memcpy( &ctx.groestl, &myrgr_4way_ctx.groestl, sizeof(hashState_groestl) );
     update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 640 );

-     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );

     sha256_4way( &ctx.sha, vhash, 64 );
     sha256_4way_close( &ctx.sha, vhash );

-     mm_deinterleave_4x32( output, output+32, output+64, output+96,
+     mm128_deinterleave_4x32( output, output+32, output+64, output+96,
                           vhash, 256 );
 }

@@ -79,7 +79,7 @@ int scanhash_myriad_4way( int thr_id, struct work *work, uint32_t max_nonce,
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   swab32_array( edata, pdata, 20 );
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   do {
      be32enc( noncep,   n   );
--- a/algo/haval/haval-hash-4way.c
+++ b/algo/haval/haval-hash-4way.c
@@ -83,7 +83,7 @@ extern "C"{
           _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \
                                         _mm_or_si128( x4, x6 ) ), x5 ) ), \
        _mm_and_si128( x4, \
-           _mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm_not(x2), x5 ), \
+           _mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm128_not(x2), x5 ), \
                          _mm_xor_si128( x1, x6 ) ), x0 ) ) ), \
     _mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) )

@@ -91,7 +91,7 @@ extern "C"{
 #define F5(x6, x5, x4, x3, x2, x1, x0) \
   _mm_xor_si128( \
       _mm_and_si128( x0, \
-            mm_not( _mm_xor_si128( \
+            mm128_not( _mm_xor_si128( \
                    _mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \
      _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \
                                    _mm_and_si128( x2, x5 ) ), \
@@ -136,8 +136,8 @@ extern "C"{
 #define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \
 do { \
   __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \
-   x7 = _mm_add_epi32( _mm_add_epi32( mm_ror_32( t, 7 ), \
-                                      mm_ror_32( x7, 11 ) ), \
+   x7 = _mm_add_epi32( _mm_add_epi32( mm128_ror_32( t, 7 ), \
+                                      mm128_ror_32( x7, 11 ) ), \
                       _mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \
 } while (0)

--- a/algo/keccak/keccak-4way.c
+++ b/algo/keccak/keccak-4way.c
@@ -10,14 +10,10 @@

 void keccakhash_4way(void *state, const void *input)
 {
-    uint64_t vhash[4*4] __attribute__ ((aligned (64)));
    keccak256_4way_context ctx;
-
    keccak256_4way_init( &ctx );
    keccak256_4way( &ctx, input, 80 );
-    keccak256_4way_close( &ctx, vhash );
-
-    mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 );
+    keccak256_4way_close( &ctx, state );
 }

 int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -25,6 +21,8 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[25]);   // 3*8+1
+   uint32_t lane_hash[8];
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
   uint32_t n = pdata[19];
@@ -49,13 +47,16 @@ int scanhash_keccak_4way( int thr_id, struct work *work, uint32_t max_nonce,
 	
      keccakhash_4way( hash, vdata );

-      for ( int i = 0; i < 4; i++ )
-      if ( ( ( (hash+(i<<3))[7] & 0xFFFFFF00 ) == 0 )
-           && fulltest( hash+(i<<3), ptarget ) )
+      for ( int lane = 0; lane < 4; lane++ )
+      if ( ( ( hash7[ lane<<1 ] & 0xFFFFFF00 ) == 0 ) )
      {
-         pdata[19] = n+i;
-         nonces[ num_found++ ] = n+i;
-         work_set_target_ratio( work, hash+(i<<3) );
+          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )
+          {
+              pdata[19] = n + lane;
+              nonces[ num_found++ ] = n + lane;
+              work_set_target_ratio( work, lane_hash );
+          }
      }
      n += 4;

--- a/algo/luffa/luffa_for_sse2.c
+++ b/algo/luffa/luffa_for_sse2.c
@@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
-                      mm_bswap_32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
+                      mm128_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data,
    if ( state->rembytes  )
    {
      // remaining data bytes
-      casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) );
+      casti_m128i( state->buffer, 0 ) = mm128_bswap_32( cast_m128i( data ) );
      // padding of partial block
      casti_m128i( state->buffer, 1 ) =
            _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 );
@@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    // full blocks
    for ( i = 0; i < blocks; i++ )
    {
-       rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ),
-                      mm_bswap_32( casti_m128i( data, 0 ) ) );
+       rnd512( state, mm128_bswap_32( casti_m128i( data, 1 ) ),
+                      mm128_bswap_32( casti_m128i( data, 0 ) ) );
       data += MSG_BLOCK_BYTE_LEN;
    }

@@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output,
    {
      // padding of partial block
      rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ),
-                      mm_bswap_32( cast_m128i( data ) ) );
+                      mm128_bswap_32( cast_m128i( data ) ) );
    }
    else
    {
@@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 0 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 1 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );

    rnd512( state, zero, zero );

@@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b )
    _mm_store_si128((__m128i*)&hash[0], t[0]);
    _mm_store_si128((__m128i*)&hash[4], t[1]);

-    casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) );
-    casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) );
+    casti_m128i( b, 2 ) = mm128_bswap_32( casti_m128i( hash, 0 ) );
+    casti_m128i( b, 3 ) = mm128_bswap_32( casti_m128i( hash, 1 ) );
 }
 #endif

--- a/algo/lyra2/allium-4way.c
+++ b/algo/lyra2/allium-4way.c
@@ -1,4 +1,4 @@
-#include "allium-gate.h"
+#include "lyra2-gate.h"
 #include <memory.h>
 #include <mm_malloc.h>

@@ -7,7 +7,7 @@
 #include "algo/blake/blake-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/groestl/aes_ni/hash-groestl256.h"

 typedef struct {
@@ -108,7 +108,7 @@ int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
      ( (uint32_t*)ptarget )[7] = 0x0000ff;

   swab32_array( edata, pdata, 20 );
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   blake256_4way_init( &allium_4way_ctx.blake );
   blake256_4way( &allium_4way_ctx.blake, vdata, 64 );

--- a/algo/lyra2/allium-gate.c
+++ b/algo/lyra2/allium-gate.c
@@ -1,22 +0,0 @@
-#include "allium-gate.h"
-
-int64_t get_max64_0xFFFFLL() { return 0xFFFFLL; }
-
-bool register_allium_algo( algo_gate_t* gate )
-{
-#if defined (ALLIUM_4WAY)
-  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
-  gate->scanhash  = (void*)&scanhash_allium_4way;
-  gate->hash      = (void*)&allium_4way_hash;
-#else
-  gate->miner_thread_init = (void*)&init_allium_ctx;
-  gate->scanhash  = (void*)&scanhash_allium;
-  gate->hash      = (void*)&allium_hash;
-#endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
-  gate->set_target        = (void*)&alt_set_target;
-  gate->get_max64         = (void*)&get_max64_0xFFFFLL;
-  return true;
-};
-
-
--- a/algo/lyra2/allium-gate.h
+++ b/algo/lyra2/allium-gate.h
@@ -1,29 +0,0 @@
-#ifndef ALLIUM_GATE_H__
-#define ALLIUM_GATE_H__ 1
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-#include "lyra2.h"
-
-#if defined(__AVX2__) && defined(__AES__)
-  #define ALLIUM_4WAY
-#endif
-
-bool register_allium_algo( algo_gate_t* gate );
-
-#if defined(ALLIUM_4WAY)
-
-void allium_4way_hash( void *state, const void *input );
-int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                          uint64_t *hashes_done );
-bool init_allium_4way_ctx();
-
-#endif
-
-void allium_hash( void *state, const void *input );
-int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
-                     uint64_t *hashes_done );
-bool init_allium_ctx();
-
-#endif
-
--- a/algo/lyra2/allium.c
+++ b/algo/lyra2/allium.c
@@ -1,9 +1,9 @@
-#include "allium-gate.h"
+#include "lyra2-gate.h"
 #include <memory.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/cubehash/cubehash_sse2.h" 
 #if defined(__AES__)
 #include "algo/groestl/aes_ni/hash-groestl256.h"
 #else
--- a/algo/lyra2/lyra2-gate.c
+++ b/algo/lyra2/lyra2-gate.c
@@ -0,0 +1,178 @@
+#include "lyra2-gate.h"
+
+
+__thread uint64_t* l2v3_wholeMatrix;
+
+bool lyra2rev3_thread_init()
+{
+   // Lyra2REv3 scratch matrix: 4 rows of 4 columns, each column being
+   // BLOCK_LEN_INT64 64-bit words; the buffer is 64-byte aligned.
+   const int64_t row_words   = BLOCK_LEN_INT64 * 4;          // nCols
+   const int    matrix_bytes = (int)( row_words * 8 * 4 );   // nRows
+   l2v3_wholeMatrix = _mm_malloc( matrix_bytes, 64 );
+#if defined (LYRA2REV3_4WAY)
+   init_lyra2rev3_4way_ctx();
+#else
+   init_lyra2rev3_ctx();
+#endif
+   return l2v3_wholeMatrix != NULL;   // false when the allocation failed
+}
+
+bool register_lyra2rev3_algo( algo_gate_t* gate )
+{  // Install the Lyra2REv3 entry points into the algo gate.
+  gate->optimizations     = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->set_target        = (void*)&alt_set_target;
+  gate->miner_thread_init = (void*)&lyra2rev3_thread_init;
+#if defined (LYRA2REV3_4WAY)   // 4-way entry points
+  gate->hash              = (void*)&lyra2rev3_4way_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2rev3_4way;
+#else                          // single-lane entry points
+  gate->hash              = (void*)&lyra2rev3_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2rev3;
+#endif
+  return true;   // always succeeds; no stray ';' after the closing brace
+}
+
+//////////////////////////////////
+
+__thread uint64_t* l2v2_wholeMatrix;
+
+bool lyra2rev2_thread_init()
+{
+   // Lyra2REv2 scratch matrix: 4 rows of 4 columns, each column being
+   // BLOCK_LEN_INT64 64-bit words; the buffer is 64-byte aligned.
+   const int64_t row_words   = BLOCK_LEN_INT64 * 4;          // nCols
+   const int    matrix_bytes = (int)( row_words * 8 * 4 );   // nRows
+   l2v2_wholeMatrix = _mm_malloc( matrix_bytes, 64 );
+#if defined (LYRA2REV2_4WAY)
+   init_lyra2rev2_4way_ctx();
+#else
+   init_lyra2rev2_ctx();
+#endif
+   return l2v2_wholeMatrix != NULL;   // false when the allocation failed
+}
+
+bool register_lyra2rev2_algo( algo_gate_t* gate )
+{  // Install the Lyra2REv2 entry points into the algo gate.
+  gate->optimizations     = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->set_target        = (void*)&alt_set_target;
+  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
+#if defined (LYRA2REV2_4WAY)   // 4-way entry points
+  gate->hash              = (void*)&lyra2rev2_4way_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2rev2_4way;
+#else                          // single-lane entry points
+  gate->hash              = (void*)&lyra2rev2_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2rev2;
+#endif
+  return true;   // always succeeds; no stray ';' after the closing brace
+}
+
+/////////////////////////////
+
+bool register_lyra2z_algo( algo_gate_t* gate )
+{  // Install the widest Lyra2Z implementation this build enables.
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->get_max64     = (void*)&get_max64_0xffffLL;
+  gate->set_target    = (void*)&alt_set_target;
+#if defined(LYRA2Z_8WAY)
+  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
+  gate->hash              = (void*)&lyra2z_8way_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2z_8way;
+#elif defined(LYRA2Z_4WAY)
+  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
+  gate->hash              = (void*)&lyra2z_4way_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2z_4way;
+#else
+  gate->miner_thread_init = (void*)&lyra2z_thread_init;
+  gate->hash              = (void*)&lyra2z_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2z;
+#endif
+  return true;   // always succeeds; no stray ';' after the closing brace
+}
+
+
+////////////////////////
+
+bool register_lyra2h_algo( algo_gate_t* gate )
+{  // Install the Lyra2H entry points into the algo gate.
+  gate->optimizations = SSE42_OPT | AVX2_OPT;
+  gate->get_max64     = (void*)&get_max64_0xffffLL;
+  gate->set_target    = (void*)&alt_set_target;
+#ifdef LYRA2H_4WAY
+  gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
+  gate->hash              = (void*)&lyra2h_4way_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2h_4way;
+#else
+  gate->miner_thread_init = (void*)&lyra2h_thread_init;
+  gate->hash              = (void*)&lyra2h_hash;
+  gate->scanhash          = (void*)&scanhash_lyra2h;
+#endif
+  return true;   // always succeeds; no stray ';' after the closing brace
+}
+
+/////////////////////////////////
+
+int64_t allium_get_max64_0xFFFFLL() { return 0xFFFFLL; }  // constant 0xFFFF; installed as gate->get_max64 by register_allium_algo
+
+bool register_allium_algo( algo_gate_t* gate )
+{  // Install the Allium entry points into the algo gate.
+  gate->optimizations     = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+  gate->get_max64         = (void*)&allium_get_max64_0xFFFFLL;
+  gate->set_target        = (void*)&alt_set_target;
+#if defined (ALLIUM_4WAY)
+  gate->miner_thread_init = (void*)&init_allium_4way_ctx;
+  gate->hash              = (void*)&allium_4way_hash;
+  gate->scanhash          = (void*)&scanhash_allium_4way;
+#else
+  gate->miner_thread_init = (void*)&init_allium_ctx;
+  gate->hash              = (void*)&allium_hash;
+  gate->scanhash          = (void*)&scanhash_allium;
+#endif
+  return true;   // always succeeds; no stray ';' after the closing brace
+}
+
+/////////////////////////////////////////
+
+bool phi2_has_roots;          // neither read nor written anywhere in this file
+bool phi2_use_roots = false;  // set by phi2_decode_extra_data, read by phi2_get_work_data_size
+
+int phi2_get_work_data_size() { return phi2_use_roots ? 144 : 128; }  // 144 when phi2_use_roots is set, else 128 (presumably bytes: 36 vs 32 32-bit words - confirm)
+
+void phi2_decode_extra_data( struct work *work )
+{
+   // Recomputed for every work item so that a job without roots clears a
+   // flag left by an earlier job: bit 30 of data[0], or any non-zero
+   // word in data[20..35], selects the extended (roots) layout.
+   phi2_use_roots = ( work->data[0] & ( 1<<30 ) ) != 0;
+   for ( int i = 20; !phi2_use_roots && i < 36; i++ ) if ( work->data[i] ) phi2_use_roots = true;
+}
+
+void phi2_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
+{
+   uchar merkle_tree[64] = { 0 };
+   size_t pos = 0;
+
+   algo_gate.gen_merkle_root( merkle_tree, sctx );
+   // Bump extranonce2: move to the next element only while the one just incremented became zero
+   while ( pos < sctx->xnonce2_size && !( ++sctx->job.xnonce2[pos] ) ) pos++;
+   // Regular header fields first, then the 16 extra job words into data[20..35]
+   algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
+                  (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_tree,
+                  le32dec( sctx->job.ntime ), le32dec(sctx->job.nbits) );
+   for ( size_t w = 0; w < 16; w++ )
+      g_work->data[ 20+w ] = ((uint32_t*)sctx->job.extra)[w];
+}
+
+
+bool register_phi2_algo( algo_gate_t* gate )
+{
+   init_phi2_ctx();   // context is set up at registration time, before any hook is wired
+   gate->scanhash           = (void*)&scanhash_phi2;
+   gate->get_max64          = (void*)&get_max64_0xffffLL;
+   gate->set_target         = (void*)&alt_set_target;
+   gate->build_extraheader  = (void*)&phi2_build_extraheader;   // copies the extra job words
+   gate->decode_extra_data  = (void*)&phi2_decode_extra_data;   // updates phi2_use_roots
+   gate->get_work_data_size = (void*)&phi2_get_work_data_size;  // 144 or 128
+   gate->optimizations      = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
+   return true;
+}
--- a/algo/lyra2/lyra2-gate.h
+++ b/algo/lyra2/lyra2-gate.h
@@ -0,0 +1,154 @@
+#ifndef LYRA2_GATE_H__
+#define LYRA2_GATE_H__ 1
+
+#include "algo-gate-api.h"
+#include <stdint.h>
+#include "lyra2.h"
+
+#if defined(__AVX2__)
+  #define LYRA2REV3_4WAY
+#endif
+
+extern __thread uint64_t* l2v3_wholeMatrix;
+
+bool register_lyra2rev3_algo( algo_gate_t* gate );
+
+#if defined(LYRA2REV3_4WAY)
+
+void lyra2rev3_4way_hash( void *state, const void *input );
+int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done );
+bool init_lyra2rev3_4way_ctx();
+
+#else
+
+void lyra2rev3_hash( void *state, const void *input );
+int scanhash_lyra2rev3( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done );
+bool init_lyra2rev3_ctx();
+
+#endif
+
+//////////////////////////////////
+
+#if defined(__AVX2__)
+  #define LYRA2REV2_4WAY
+#endif
+
+extern __thread uint64_t* l2v2_wholeMatrix;
+
+bool register_lyra2rev2_algo( algo_gate_t* gate );
+
+#if defined(LYRA2REV2_4WAY)
+
+void lyra2rev2_4way_hash( void *state, const void *input );
+int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done );
+bool init_lyra2rev2_4way_ctx();
+
+#else
+
+void lyra2rev2_hash( void *state, const void *input );
+int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done );
+bool init_lyra2rev2_ctx();
+
+#endif
+
+/////////////////////////
+
+#if defined(__SSE4_2__)
+  #define LYRA2Z_4WAY
+#endif
+#if defined(__AVX2__)
+//  #define LYRA2Z_8WAY
+#endif
+
+
+#define LYRA2Z_MATRIX_SIZE  ( BLOCK_LEN_INT64 * 8 * 8 * 8 )  // parenthesized so the product expands as one operand
+
+#if defined(LYRA2Z_8WAY)
+
+void lyra2z_8way_hash( void *state, const void *input );
+int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+bool lyra2z_8way_thread_init();
+
+#elif defined(LYRA2Z_4WAY)
+
+void lyra2z_4way_hash( void *state, const void *input );
+int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+bool lyra2z_4way_thread_init();
+
+#else
+
+void lyra2z_hash( void *state, const void *input );
+int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+bool lyra2z_thread_init();
+
+#endif
+
+////////////////////
+
+#if defined(__AVX2__)
+  #define LYRA2H_4WAY
+#endif
+
+#define LYRA2H_MATRIX_SIZE  ( BLOCK_LEN_INT64 * 16 * 16 * 8 )  // parenthesized so the product expands as one operand
+
+#if defined(LYRA2H_4WAY)
+
+void lyra2h_4way_hash( void *state, const void *input );
+int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+bool lyra2h_4way_thread_init();
+
+#else
+
+void lyra2h_hash( void *state, const void *input );
+int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+bool lyra2h_thread_init();
+
+#endif
+
+//////////////////////////////////
+
+#if defined(__AVX2__) && defined(__AES__)
+  #define ALLIUM_4WAY
+#endif
+
+bool register_allium_algo( algo_gate_t* gate );
+
+#if defined(ALLIUM_4WAY)
+
+void allium_4way_hash( void *state, const void *input );
+int scanhash_allium_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                          uint64_t *hashes_done );
+bool init_allium_4way_ctx();
+
+#else
+
+void allium_hash( void *state, const void *input );
+int scanhash_allium( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+bool init_allium_ctx();
+
+#endif 
+
+/////////////////////////////////////////
+
+extern bool phi2_has_roots;   // declaration only; the single definition is in lyra2-gate.c
+
+bool register_phi2_algo( algo_gate_t* gate );
+
+void phi2_hash( void *state, const void *input );
+int scanhash_phi2( int thr_id, struct work *work, uint32_t max_nonce,
+                     uint64_t *hashes_done );
+void init_phi2_ctx();
+
+#endif  // LYRA2_GATE_H__
+
+
--- a/algo/lyra2/lyra2.c
+++ b/algo/lyra2/lyra2.c
@@ -211,6 +211,186 @@ int LYRA2REV2( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
   return 0;
 }

+/////////////////////////////////////////////////
+// LYRA2REV3: Lyra2 over the caller-supplied wholeMatrix; during Wandering row* is picked via state[instance & 0xF]. Always returns 0.
+int LYRA2REV3( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
+               const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
+               const uint64_t timeCost, const uint64_t nRows,
+               const uint64_t nCols )
+{
+   //====================== Basic variables ============================//
+   uint64_t _ALIGN(256) state[16];
+   int64_t row = 2; //index of row to be processed
+   int64_t prev = 1; //index of prev (last row ever computed/modified)
+   int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+   int64_t tau; //Time Loop iterator
+   int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+   int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+   int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+   int64_t i; //auxiliary iteration counter
+   int64_t v64; // 64bit var for memcpy
+   uint64_t instance = 0; //state word carried between Wandering iterations; its low 4 bits index state[] to pick row*
+   //====================================================================/
+
+   //=== Initializing the Memory Matrix and pointers to it =============//
+   //No allocation happens here: wholeMatrix is supplied by the caller; only the row and block strides are derived
+
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; //only referenced by the disabled memset below
+   const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
+/*
+   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+//   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+   // for Lyra2REv2, nCols = 4, v1 was using 8
+   const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64
+                                          : BLOCK_LEN_BLAKE2_SAFE_BYTES;
+*/
+
+   uint64_t *ptrWord = wholeMatrix;
+
+//   memset( wholeMatrix, 0, ROW_LEN_BYTES * nRows );
+
+   //=== Getting the password + salt + basil padded with 10*1 ==========//
+   //Note: the memory matrix temporarily holds the password: not to save memory,
+   //but to ensure that the locally copied password is overwritten as soon as possible
+
+   //First, work out how many blocks the password, salt, basil and padding occupy
+   int64_t nBlocksInput = ( ( saltlen + pwdlen + 6 * sizeof(uint64_t) )
+                              / BLOCK_LEN_BLAKE2_SAFE_BYTES ) + 1;
+
+   byte *ptrByte = (byte*) wholeMatrix;
+
+   //Prepends the password
+   memcpy(ptrByte, pwd, pwdlen);
+   ptrByte += pwdlen;
+
+   //Concatenates the salt
+   memcpy(ptrByte, salt, saltlen);
+   ptrByte += saltlen;
+
+   memset( ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES
+                       - (saltlen + pwdlen) );
+
+   //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+   memcpy(ptrByte, &kLen, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = pwdlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = saltlen;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = timeCost;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nRows;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+   v64 = nCols;
+   memcpy(ptrByte, &v64, sizeof(int64_t));
+   ptrByte += sizeof(uint64_t);
+
+   //Now comes the padding
+   *ptrByte = 0x80; //first byte of padding: right after the basil
+   ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+   ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+   *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+
+// from here on it's all simd access to state and matrix
+// define vector pointers and adjust sizes and pointer offsets
+
+   //================= Initializing the Sponge State ====================//
+   //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+
+   initState( state );
+
+   //========================= Setup Phase =============================//
+   //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+
+   ptrWord = wholeMatrix;
+   for (i = 0; i < nBlocksInput; i++)
+   {
+       absorbBlockBlake2Safe( state, ptrWord ); //absorbs each block of pad(pwd || salt || basil)
+       ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+   }
+   //Initializes M[0] and M[1]
+   reducedSqueezeRow0( state, &wholeMatrix[0], nCols ); //The locally copied password is most likely overwritten here
+
+   reducedDuplexRow1( state, &wholeMatrix[0], &wholeMatrix[ROW_LEN_INT64],
+                      nCols);
+
+   do
+   {
+      //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+      reducedDuplexRowSetup( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+
+      //updates the value of row* (deterministically picked during Setup)
+      rowa = (rowa + step) & (window - 1);
+      //update prev: it now points to the last row ever computed
+
+      prev = row;
+      //updates row: goes to the next row to be computed
+      row++;
+
+      //Checks if all rows in the window were visited.
+      if (rowa == 0)
+      {
+         step = window + gap; //changes the step: approximately doubles its value
+         window *= 2; //doubles the size of the re-visitation window
+         gap = -gap; //inverts the modifier to the step
+      }
+
+   } while (row < nRows);
+
+   //===================== Wandering Phase =============================//
+   row = 0; //Resets the visitation to the first row of the memory matrix
+   for (tau = 1; tau <= timeCost; tau++)
+   {
+       //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+       step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
+//       step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+       do
+       {
+           //Selects a pseudorandom index row*: first refresh instance from the state, then mask the word it selects
+           //-----------------------------------------------
+             instance = state[instance & 0xF];
+             rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
+//           rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+
+           //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //-------------------------------------------
+
+           //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+           reducedDuplexRow( state, &wholeMatrix[prev*ROW_LEN_INT64],
+                             &wholeMatrix[rowa*ROW_LEN_INT64],
+                             &wholeMatrix[row*ROW_LEN_INT64], nCols );
+           //update prev: it now points to the last row ever computed
+           prev = row;
+
+           //updates row: goes to the next row to be computed
+           //----------------------------------------------------
+           row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+           //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+           //----------------------------------------------------
+
+       } while (row != 0);
+   }
+
+   //===================== Wrap-up Phase ===============================//
+   //Absorbs one block from the last row* selected (M[rowa])
+   absorbBlock(state, &wholeMatrix[rowa*ROW_LEN_INT64]);
+   //Squeezes the key: kLen bytes are written to K
+   squeeze(state, K, (unsigned int) kLen);
+
+   return 0;
+}
+
+
+
+//////////////////////////////////////////////////
 int LYRA2Z( uint64_t* wholeMatrix, void *K, uint64_t kLen, const void *pwd,
            const uint64_t pwdlen, const void *salt, const uint64_t saltlen,
            const uint64_t timeCost, const uint64_t nRows,
--- a/algo/lyra2/lyra2.h
+++ b/algo/lyra2/lyra2.h
@@ -50,6 +50,10 @@ int LYRA2REV2( uint64_t*, void *K, uint64_t kLen, const void *pwd,
               uint64_t pwdlen, const void *salt, uint64_t saltlen,
               uint64_t timeCost, uint64_t nRows, uint64_t nCols );

+int LYRA2REV3( uint64_t*, void *K, uint64_t kLen, const void *pwd,
+               uint64_t pwdlen, const void *salt, uint64_t saltlen,
+               uint64_t timeCost, uint64_t nRows, uint64_t nCols );
+
 int LYRA2Z( uint64_t*, void *K, uint64_t kLen, const void *pwd,
            uint64_t pwdlen, const void *salt, uint64_t saltlen,
            uint64_t timeCost, uint64_t nRows, uint64_t nCols );
--- a/algo/lyra2/lyra2h-4way.c
+++ b/algo/lyra2/lyra2h-4way.c
@@ -1,4 +1,4 @@
-#include "lyra2h-gate.h"
+#include "lyra2-gate.h"

 #ifdef LYRA2H_4WAY

@@ -36,7 +36,7 @@ void lyra2h_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

     LYRA2Z( lyra2h_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 16, 16, 16 );
     LYRA2Z( lyra2h_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 16, 16, 16 );
@@ -67,10 +67,10 @@ int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

-   for ( int i=0; i < 19; i++ )
+   for ( int i=0; i < 20; i++ )
      be32enc( &edata[i], pdata[i] );

-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   lyra2h_4way_midstate( vdata );

--- a/algo/lyra2/lyra2h-gate.c
+++ b/algo/lyra2/lyra2h-gate.c
@@ -1,25 +0,0 @@
-#include "lyra2h-gate.h"
-#include "lyra2.h"
-
-void lyra2h_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool register_lyra2h_algo( algo_gate_t* gate )
-{
-#ifdef LYRA2H_4WAY
-  gate->miner_thread_init = (void*)&lyra2h_4way_thread_init;
-  gate->scanhash   = (void*)&scanhash_lyra2h_4way;
-  gate->hash       = (void*)&lyra2h_4way_hash;
-#else
-  gate->miner_thread_init = (void*)&lyra2h_thread_init;
-  gate->scanhash   = (void*)&scanhash_lyra2h;
-  gate->hash       = (void*)&lyra2h_hash;
-#endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
-  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&lyra2h_set_target;
-  return true;
-};
-
--- a/algo/lyra2/lyra2h-gate.h
+++ b/algo/lyra2/lyra2h-gate.h
@@ -1,32 +0,0 @@
-#ifndef LYRA2H_GATE_H__
-#define LYRA2H_GATE_H__
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-#if defined(__AVX2__)
-  #define LYRA2H_4WAY
-#endif
-
-#define LYRA2H_MATRIX_SIZE  BLOCK_LEN_INT64 * 16 * 16 * 8
-
-#if defined(LYRA2H_4WAY)
-
-void lyra2h_4way_hash( void *state, const void *input );
-
-int scanhash_lyra2h_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
-bool lyra2h_4way_thread_init();
-
-#endif
-
-void lyra2h_hash( void *state, const void *input );
-
-int scanhash_lyra2h( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-
-bool lyra2h_thread_init();
-
-#endif
-
--- a/algo/lyra2/lyra2h.c
+++ b/algo/lyra2/lyra2h.c
@@ -1,4 +1,4 @@
-#include "lyra2h-gate.h"
+#include "lyra2-gate.h"
 #include <memory.h>
 #include <mm_malloc.h>
 #include "lyra2.h"
--- a/algo/lyra2/lyra2rev2-4way.c
+++ b/algo/lyra2/lyra2rev2-4way.c
@@ -1,13 +1,13 @@
-#include "lyra2rev2-gate.h"
+#include "lyra2-gate.h"
 #include <memory.h>

-#if defined (__AVX2__)	
+#if defined (LYRA2REV2_4WAY)	

 #include "algo/blake/blake-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/bmw/bmw-hash-4way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/cubehash/cubehash_sse2.h" 

 typedef struct {
   blake256_4way_context     blake;
@@ -74,11 +74,11 @@ void lyra2rev2_4way_hash( void *state, const void *input )
   cubehashReinit( &ctx.cube );
   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );

-   mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
+   mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 );
   bmw256_4way( &ctx.bmw, vhash, 32 );
   bmw256_4way_close( &ctx.bmw, vhash );

-   mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
+   mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 );
 }

 int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -101,7 +101,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,

   swab32_array( edata, pdata, 20 );

-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   blake256_4way_init( &l2v2_4way_ctx.blake );
   blake256_4way( &l2v2_4way_ctx.blake, vdata, 64 );
--- a/algo/lyra2/lyra2rev2-gate.c
+++ b/algo/lyra2/lyra2rev2-gate.c
@@ -1,40 +0,0 @@
-#include "lyra2rev2-gate.h"
-
-__thread uint64_t* l2v2_wholeMatrix;
-
-void lyra2rev2_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool lyra2rev2_thread_init()
-{
-   const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * 4; // nCols
-   const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
-
-   int i = (int64_t)ROW_LEN_BYTES * 4; // nRows;
-   l2v2_wholeMatrix = _mm_malloc( i, 64 );
-#if defined (LYRA2REV2_4WAY)
-   init_lyra2rev2_4way_ctx();;
-#else
-   init_lyra2rev2_ctx();
-#endif
-   return l2v2_wholeMatrix;
-}
-
-bool register_lyra2rev2_algo( algo_gate_t* gate )
-{
-#if defined (LYRA2REV2_4WAY)
-  gate->scanhash  = (void*)&scanhash_lyra2rev2_4way;
-  gate->hash      = (void*)&lyra2rev2_4way_hash;
-#else
-  gate->scanhash  = (void*)&scanhash_lyra2rev2;
-  gate->hash      = (void*)&lyra2rev2_hash;
-#endif
-  gate->optimizations = SSE2_OPT | AES_OPT | SSE42_OPT | AVX2_OPT;
-  gate->miner_thread_init = (void*)&lyra2rev2_thread_init;
-  gate->set_target        = (void*)&lyra2rev2_set_target;
-  return true;
-};
-
-
--- a/algo/lyra2/lyra2rev2-gate.h
+++ b/algo/lyra2/lyra2rev2-gate.h
@@ -1,35 +0,0 @@
-#ifndef LYRA2REV2_GATE_H__
-#define LYRA2REV2_GATE_H__ 1
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-#include "lyra2.h"
-
-#if defined(__AVX2__)
-  #define LYRA2REV2_4WAY
-#endif
-
-extern __thread uint64_t* l2v2_wholeMatrix;
-
-bool register_lyra2rev2_algo( algo_gate_t* gate );
-
-#if defined(LYRA2REV2_4WAY)
-
-void lyra2rev2_4way_hash( void *state, const void *input );
-
-int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
-bool init_lyra2rev2_4way_ctx();
-
-#endif
-
-void lyra2rev2_hash( void *state, const void *input );
-
-int scanhash_lyra2rev2( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-
-bool init_lyra2rev2_ctx();
-
-#endif
-
--- a/algo/lyra2/lyra2rev2.c
+++ b/algo/lyra2/lyra2rev2.c
@@ -1,11 +1,11 @@
-#include "lyra2rev2-gate.h"
+#include "lyra2-gate.h"
 #include <memory.h>
 #include "algo/blake/sph_blake.h"
 #include "algo/cubehash/sph_cubehash.h"
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/bmw/sph_bmw.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/cubehash/cubehash_sse2.h" 
 //#include "lyra2.h"

 typedef struct {
--- a/algo/lyra2/lyra2rev3-4way.c
+++ b/algo/lyra2/lyra2rev3-4way.c
@@ -0,0 +1,110 @@
+#include "lyra2-gate.h"
+#include <memory.h>
+
+#if defined (LYRA2REV3_4WAY)	
+
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/cubehash/cubehash_sse2.h" 
+
+typedef struct {
+   blake256_4way_context     blake;
+   cubehashParam             cube;
+   bmw256_4way_context       bmw;
+} lyra2v3_4way_ctx_holder;
+
+static lyra2v3_4way_ctx_holder l2v3_4way_ctx;
+
+bool init_lyra2rev3_4way_ctx() // Initialise the shared template context that lyra2rev3_4way_hash() copies on every call.
+{
+   blake256_4way_init( &l2v3_4way_ctx.blake );
+   cubehashInit( &l2v3_4way_ctx.cube, 256, 16, 32 );
+   bmw256_4way_init( &l2v3_4way_ctx.bmw );
+   return true; // unconditional: nothing above is checked for failure
+}
+
+void lyra2rev3_4way_hash( void *state, const void *input ) // 4 lanes at once: blake256 -> LYRA2REV3 -> cubehash -> LYRA2REV3 -> bmw256
+{
+   uint32_t vhash[8*4] __attribute__ ((aligned (64))); // interleaved digests of all four lanes
+   uint32_t hash0[8] __attribute__ ((aligned (64)));   // hash0..hash3: one digest buffer per lane
+   uint32_t hash1[8] __attribute__ ((aligned (32)));
+   uint32_t hash2[8] __attribute__ ((aligned (32)));
+   uint32_t hash3[8] __attribute__ ((aligned (32)));
+   lyra2v3_4way_ctx_holder ctx __attribute__ ((aligned (64))); 
+   memcpy( &ctx, &l2v3_4way_ctx, sizeof(l2v3_4way_ctx) ); // private copy of the pre-initialised template
+
+   blake256_4way( &ctx.blake, input, 80 );
+   blake256_4way_close( &ctx.blake, vhash );
+   mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 ); // split into per-lane buffers for the serial stages
+
+   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); // lane by lane, in place, all sharing l2v3_wholeMatrix
+   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+   
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash0, (const byte*) hash0, 32 ); // one cube context is reused for all lanes
+   cubehashReinit( &ctx.cube ); // reset it before the next lane
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash1, (const byte*) hash1, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash2, (const byte*) hash2, 32 );
+   cubehashReinit( &ctx.cube );
+   cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 );
+
+   LYRA2REV3( l2v3_wholeMatrix, hash0, 32, hash0, 32, hash0, 32, 1, 4, 4 ); // second Lyra2 pass, same parameters
+   LYRA2REV3( l2v3_wholeMatrix, hash1, 32, hash1, 32, hash1, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash2, 32, hash2, 32, hash2, 32, 1, 4, 4 );
+   LYRA2REV3( l2v3_wholeMatrix, hash3, 32, hash3, 32, hash3, 32, 1, 4, 4 );
+
+   mm128_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 256 ); // re-pack the lanes for the 4-way BMW stage
+   bmw256_4way( &ctx.bmw, vhash, 32 );
+   bmw256_4way_close( &ctx.bmw, vhash );
+
+   mm128_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 ); // lane k's result lands at state + 32*k
+}
+
+int scanhash_lyra2rev3_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                             uint64_t *hashes_done ) // Scans nonces four at a time; returns how many were stored in work->nonces.
+{
+   uint32_t hash[8*4] __attribute__ ((aligned (64)));   // four consecutive 8-word lane hashes
+   uint32_t vdata[20*4] __attribute__ ((aligned (64))); // 4-way interleaved copy of edata
+   uint32_t edata[20] __attribute__ ((aligned (64)));   // byte-swapped copy of the 20 input words
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   const uint32_t Htarg = ptarget[7]; // NOTE(review): read before the benchmark override below, so this call keeps the old value
+   uint32_t *nonces = work->nonces;
+   int num_found = 0;
+   uint32_t *noncep = vdata + 76; // 19*4: the four interleaved copies of word 19 (the nonce)
+
+   if ( opt_benchmark )
+      ( (uint32_t*)ptarget )[7] = 0x0000ff;
+
+   swab32_array( edata, pdata, 20 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 ); // every lane starts from the same data
+
+   do {
+      be32enc( noncep,   n   ); // lane k hashes nonce n+k
+      be32enc( noncep+1, n+1 );
+      be32enc( noncep+2, n+2 );
+      be32enc( noncep+3, n+3 );
+
+      lyra2rev3_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      for ( int i = 0; i < 4; i++ ) // lane i's hash is hash[8*i .. 8*i+7]
+      if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) ) // top word is checked first, then fulltest()
+      {
+          pdata[19] = n+i;         
+          nonces[ num_found++ ] = n+i;
+          work_set_target_ratio( work, hash+(i<<3) );
+      }
+      n += 4;
+   } while ( (num_found == 0) && (n < max_nonce-4)
+                   && !work_restart[thr_id].restart); // stop on a hit, near max_nonce, or when a restart is flagged
+
+   *hashes_done = n - first_nonce + 1;
+   return num_found;
+}
+
+#endif
--- a/algo/lyra2/lyra2rev3.c
+++ b/algo/lyra2/lyra2rev3.c
@@ -0,0 +1,102 @@
+#include "lyra2-gate.h"
+#include <memory.h>
+#include "algo/blake/sph_blake.h"
+#include "algo/cubehash/sph_cubehash.h"
+#include "algo/bmw/sph_bmw.h"
+#include "algo/cubehash/cubehash_sse2.h" 
+//#include "lyra2.h"
+
+typedef struct {
+        cubehashParam           cube;
+//        cubehashParam           cube2;
+        sph_blake256_context     blake;
+        sph_bmw256_context       bmw;
+
+} lyra2v3_ctx_holder;
+
+static lyra2v3_ctx_holder lyra2v3_ctx;
+static __thread sph_blake256_context l2v3_blake_mid;
+
+bool init_lyra2rev3_ctx() // Initialise the shared template context that lyra2rev3_hash() copies on every call.
+{
+        cubehashInit( &lyra2v3_ctx.cube, 256, 16, 32 );
+//        cubehashInit( &lyra2v3_ctx.cube2, 256, 16, 32 );
+        sph_blake256_init( &lyra2v3_ctx.blake );
+        sph_bmw256_init( &lyra2v3_ctx.bmw );
+        return true; // unconditional: nothing above is checked for failure
+}
+
+void l2v3_blake256_midstate( const void* input ) // Caches the BLAKE-256 state after the first 64 bytes of input.
+{
+    memcpy( &l2v3_blake_mid, &lyra2v3_ctx.blake, sizeof l2v3_blake_mid ); // start from the initialised template context
+    sph_blake256( &l2v3_blake_mid, input, 64 ); // result is kept in the thread-local l2v3_blake_mid for lyra2rev3_hash()
+}
+
+void lyra2rev3_hash( void *state, const void *input ) // Lyra2REv3 of one 80-byte input; copies the 32-byte result to state.
+{
+        lyra2v3_ctx_holder ctx __attribute__ ((aligned (64)));
+        memcpy( &ctx, &lyra2v3_ctx, sizeof(lyra2v3_ctx) ); // private copy of the pre-initialised template
+        uint8_t hash[128] __attribute__ ((aligned (64)));
+        // Every stage below reads and writes the start of hash[] in place,
+        // so no function-local #define aliases are needed (none leak out).
+        const int midlen = 64;            // bytes
+        const int tail   = 80 - midlen;   // 16
+        // l2v3_blake256_midstate() must already have absorbed the first midlen bytes.
+        memcpy( &ctx.blake, &l2v3_blake_mid, sizeof l2v3_blake_mid );
+        sph_blake256( &ctx.blake, (const uint8_t*)input + midlen, tail );
+        sph_blake256_close( &ctx.blake, hash );
+
+        LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
+
+        cubehashUpdateDigest( &ctx.cube, (byte*) hash,
+                              (const byte*) hash, 32 );
+
+        LYRA2REV3( l2v3_wholeMatrix, hash, 32, hash, 32, hash, 32, 1, 4, 4 );
+
+        sph_bmw256( &ctx.bmw, hash, 32 );
+        sph_bmw256_close( &ctx.bmw, hash );
+
+        memcpy( state, hash, 32 );
+}
+
+int scanhash_lyra2rev3(int thr_id, struct work *work,
+	uint32_t max_nonce, uint64_t *hashes_done) // Scans nonces one at a time; returns 1 on a hit, 0 otherwise.
+{
+        uint32_t *pdata = work->data;
+        uint32_t *ptarget = work->target;
+	uint32_t endiandata[20] __attribute__ ((aligned (64))); // byte-swapped copy of the 20 input words
+        uint32_t hash[8] __attribute__((aligned(64)));
+	const uint32_t first_nonce = pdata[19];
+	uint32_t nonce = first_nonce;
+        const uint32_t Htarg = ptarget[7]; // NOTE(review): read before the benchmark override below
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0000ff;
+
+        swab32_array( endiandata, pdata, 20 );
+
+        l2v3_blake256_midstate( endiandata ); // first 64 bytes never change in the loop, so hash them once
+
+	do {
+		be32enc(&endiandata[19], nonce); // word 19 lies in the 16-byte tail hashed by lyra2rev3_hash()
+		lyra2rev3_hash(hash, endiandata);
+
+		if (hash[7] <= Htarg ) // top word is checked first, then fulltest()
+                {
+                   if( fulltest(hash, ptarget) )
+                   {
+			pdata[19] = nonce;
+                        work_set_target_ratio( work, hash );
+			*hashes_done = pdata[19] - first_nonce; // NOTE(review): no +1 here, unlike the exit path below
+		   	return 1;
+		   }
+                }
+		nonce++;
+
+	} while (nonce < max_nonce && !work_restart[thr_id].restart); // stop at max_nonce or when a restart is flagged
+
+	pdata[19] = nonce;
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}
+
--- a/algo/lyra2/lyra2z-4way.c
+++ b/algo/lyra2/lyra2z-4way.c
@@ -1,4 +1,4 @@
-#include "lyra2z-gate.h"
+#include "lyra2-gate.h"

 #ifdef LYRA2Z_4WAY

@@ -36,7 +36,7 @@ void lyra2z_4way_hash( void *state, const void *input )
     blake256_4way( &ctx_blake, input + (64*4), 16 );
     blake256_4way_close( &ctx_blake, vhash );

-     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );
+     mm128_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 256 );

     LYRA2Z( lyra2z_4way_matrix, hash0, 32, hash0, 32, hash0, 32, 8, 8, 8 );
     LYRA2Z( lyra2z_4way_matrix, hash1, 32, hash1, 32, hash1, 32, 8, 8, 8 );
@@ -67,10 +67,10 @@ int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
   if ( opt_benchmark )
      ptarget[7] = 0x0000ff;

-   for ( int i=0; i < 19; i++ )
+   for ( int i=0; i < 20; i++ )
      be32enc( &edata[i], pdata[i] );

-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );

   lyra2z_4way_midstate( vdata );

--- a/algo/lyra2/lyra2z-gate.c
+++ b/algo/lyra2/lyra2z-gate.c
@@ -1,29 +0,0 @@
-#include "lyra2z-gate.h"
-#include "lyra2.h"
-
-void lyra2z_set_target( struct work* work, double job_diff )
-{
- work_set_target( work, job_diff / (256.0 * opt_diff_factor) );
-}
-
-bool register_lyra2z_algo( algo_gate_t* gate )
-{
-#if defined(LYRA2Z_8WAY)
-  gate->miner_thread_init = (void*)&lyra2z_8way_thread_init;
-  gate->scanhash   = (void*)&scanhash_lyra2z_8way;
-  gate->hash       = (void*)&lyra2z_8way_hash;
-#elif defined(LYRA2Z_4WAY)
-  gate->miner_thread_init = (void*)&lyra2z_4way_thread_init;
-  gate->scanhash   = (void*)&scanhash_lyra2z_4way;
-  gate->hash       = (void*)&lyra2z_4way_hash;
-#else
-  gate->miner_thread_init = (void*)&lyra2z_thread_init;
-  gate->scanhash   = (void*)&scanhash_lyra2z;
-  gate->hash       = (void*)&lyra2z_hash;
-#endif
-  gate->optimizations = SSE42_OPT | AVX2_OPT;
-  gate->get_max64  = (void*)&get_max64_0xffffLL;
-  gate->set_target = (void*)&lyra2z_set_target;
-  return true;
-};
-
--- a/algo/lyra2/lyra2z-gate.h
+++ b/algo/lyra2/lyra2z-gate.h
@@ -1,46 +0,0 @@
-#ifndef LYRA2Z_GATE_H__
-#define LYRA2Z_GATE_H__ 1
-
-#include "algo-gate-api.h"
-#include <stdint.h>
-
-#if defined(__SSE4_2__)
-  #define LYRA2Z_4WAY
-#endif
-#if defined(__AVX2__)
-//  #define LYRA2Z_8WAY
-#endif
-
-
-#define LYRA2Z_MATRIX_SIZE  BLOCK_LEN_INT64 * 8 * 8 * 8
-
-#if defined(LYRA2Z_8WAY)
-
-void lyra2z_8way_hash( void *state, const void *input );
-
-int scanhash_lyra2z_8way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
-bool lyra2z_8way_thread_init();
-
-#elif defined(LYRA2Z_4WAY)
-
-void lyra2z_4way_hash( void *state, const void *input );
-
-int scanhash_lyra2z_4way( int thr_id, struct work *work, uint32_t max_nonce,
-                         uint64_t *hashes_done );
-
-bool lyra2z_4way_thread_init();
-
-#else
-
-void lyra2z_hash( void *state, const void *input );
-
-int scanhash_lyra2z( int thr_id, struct work *work, uint32_t max_nonce,
-                    uint64_t *hashes_done );
-
-bool lyra2z_thread_init();
-
-#endif
-
-#endif
--- a/algo/lyra2/lyra2z.c
+++ b/algo/lyra2/lyra2z.c
@@ -1,6 +1,6 @@
 #include <memory.h>
 #include <mm_malloc.h>
-#include "lyra2z-gate.h"
+#include "lyra2-gate.h"
 #include "lyra2.h"
 #include "algo/blake/sph_blake.h"
 #include "avxdefs.h"
--- a/algo/lyra2/phi2.c
+++ b/algo/lyra2/phi2.c
@@ -0,0 +1,133 @@
+/**
+ * Phi-2 algo Implementation
+ */
+
+#include "lyra2-gate.h"
+#include "algo/skein/sph_skein.h"
+#include "algo/jh/sph_jh.h"
+#include "algo/gost/sph_gost.h"
+#include "algo/cubehash/cubehash_sse2.h"
+#ifdef __AES__
+  #include "algo/echo/aes_ni/hash_api.h"
+#else
+  #include "algo/echo/sph_echo.h"
+#endif
+
+typedef struct { // One context per hash stage used by phi2_hash().
+     cubehashParam           cube;
+     sph_jh512_context       jh;
+#if  defined(__AES__)
+     hashState_echo          echo1; // ECHO contexts from algo/echo/aes_ni when __AES__ is defined
+     hashState_echo          echo2;
+#else
+     sph_echo512_context     echo1; // sph ECHO contexts otherwise
+     sph_echo512_context     echo2;
+#endif
+     sph_gost512_context     gost;
+     sph_skein512_context    skein;
+} phi2_ctx_holder;
+
+phi2_ctx_holder phi2_ctx;
+
+void init_phi2_ctx() // Initialises every member of the global phi2_ctx template that phi2_hash() copies per call.
+{
+   cubehashInit( &phi2_ctx.cube, 512, 16, 32 );
+   sph_jh512_init(&phi2_ctx.jh);
+#if defined(__AES__)
+   init_echo( &phi2_ctx.echo1, 512 );
+   init_echo( &phi2_ctx.echo2, 512 );
+#else
+   sph_echo512_init(&phi2_ctx.echo1);
+   sph_echo512_init(&phi2_ctx.echo2);
+#endif
+   sph_gost512_init(&phi2_ctx.gost);
+   sph_skein512_init(&phi2_ctx.skein);
+};
+
+void phi2_hash(void *state, const void *input) // cubehash -> 2x LYRA2RE -> jh512 -> (gost512 | echo512 twice) -> skein512 -> fold to 32 bytes
+{
+	unsigned char _ALIGN(128) hash[64];
+	unsigned char _ALIGN(128) hashA[64];
+	unsigned char _ALIGN(128) hashB[64];
+
+        phi2_ctx_holder ctx __attribute__ ((aligned (64)));
+        memcpy( &ctx, &phi2_ctx, sizeof(phi2_ctx) ); // private copy of the pre-initialised template
+
+        cubehashUpdateDigest( &ctx.cube, (byte*)hashB, (const byte*)input,
+		              phi2_has_roots ? 144 : 80 ); // input length depends on the flag set by scanhash_phi2()
+
+	LYRA2RE( &hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8 ); // each 32-byte half of hashB is processed separately
+	LYRA2RE( &hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8 );
+
+	sph_jh512( &ctx.jh, (const void*)hashA, 64 );
+	sph_jh512_close( &ctx.jh, (void*)hash );
+
+	if ( hash[0] & 1 ) // lowest bit of the first JH output byte picks the next stage
+       	{
+           sph_gost512( &ctx.gost, (const void*)hash, 64 );
+	   sph_gost512_close( &ctx.gost, (void*)hash );
+	}
+       	else
+       	{
+#if defined(__AES__)
+           update_final_echo ( &ctx.echo1, (BitSequence *)hash,
+                               (const BitSequence *)hash, 512 );
+           update_final_echo ( &ctx.echo2, (BitSequence *)hash,
+                               (const BitSequence *)hash, 512 ); // ECHO is applied twice, in place
+#else
+	   sph_echo512( &ctx.echo1, (const void*)hash, 64 );
+	   sph_echo512_close( &ctx.echo1, (void*)hash );
+
+	   sph_echo512( &ctx.echo2, (const void*)hash, 64 );
+	   sph_echo512_close( &ctx.echo2, (void*)hash );
+#endif
+	}
+
+	sph_skein512( &ctx.skein, (const void*)hash, 64 );
+	sph_skein512_close( &ctx.skein, (void*)hash );
+
+	for (int i=0; i<4; i++) // XOR the upper 32 bytes into the lower 32 bytes
+		((uint64_t*)hash)[i] ^= ((uint64_t*)hash)[i+4];
+
+	memcpy(state, hash, 32);
+}
+
+int scanhash_phi2(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) // Scans nonces one at a time; returns 1 on a hit, 0 otherwise.
+{
+	uint32_t _ALIGN(128) hash[8];
+	uint32_t _ALIGN(128) endiandata[36]; // 36 words = 144 bytes, the larger input length phi2_hash() may use
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+
+	const uint32_t Htarg = ptarget[7]; // NOTE(review): read before the benchmark override below
+	const uint32_t first_nonce = pdata[19];
+	uint32_t n = first_nonce;
+
+	if(opt_benchmark){
+		ptarget[7] = 0x00ff;
+	}
+
+	phi2_has_roots = false; // flag defined outside this file; phi2_hash() reads it to choose 80 or 144 input bytes
+	for (int i=0; i < 36; i++) {
+		be32enc(&endiandata[i], pdata[i]);
+		if (i >= 20 && pdata[i]) phi2_has_roots = true; // any non-zero word in 20..35 selects the 144-byte input
+	}
+
+	do {
+		be32enc(&endiandata[19], n);
+		phi2_hash(hash, endiandata);
+
+		if (hash[7] < Htarg && fulltest(hash, ptarget)) { // NOTE(review): strict '<', so hash[7] == Htarg never reaches fulltest()
+			work_set_target_ratio(work, hash);
+			*hashes_done = n - first_nonce + 1;
+			pdata[19] = n;
+			return 1;
+		}
+		n++;
+
+	} while (n < max_nonce && !work_restart[thr_id].restart); // stop at max_nonce or when a restart is flagged
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
--- a/algo/lyra2/sponge.h
+++ b/algo/lyra2/sponge.h
@@ -48,6 +48,10 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    return ( w >> c ) | ( w << ( 64 - c ) );
 }

+// serial data is only 32 bytes so AVX2 is the limit for that dimension.
+// However, 2 way parallel looks trivial to code for AVX512 except for
+// a data dependency with rowa.
+
 #if defined __AVX2__
 // only available with avx2

@@ -65,13 +69,13 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){

 #define LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
   G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_ror256_1x64( s1); \
+   s1 = mm256_ror_1x64( s1); \
   s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_rol256_1x64( s3 ); \
+   s3 = mm256_rol_1x64( s3 ); \
   G_4X64( s0, s1, s2, s3 ); \
-   s1 = mm256_rol256_1x64( s1 ); \
+   s1 = mm256_rol_1x64( s1 ); \
   s2 = mm256_swap_128( s2 ); \
-   s3 = mm256_ror256_1x64( s3 );
+   s3 = mm256_ror_1x64( s3 );

 #define LYRA_12_ROUNDS_AVX2( s0, s1, s2, s3 ) \
   LYRA_ROUND_AVX2( s0, s1, s2, s3 ) \
@@ -93,25 +97,25 @@ static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
 // returns void, all args updated
 #define G_2X64(a,b,c,d) \
   a = _mm_add_epi64( a, b ); \
-   d = mm_ror_64( _mm_xor_si128( d, a), 32 ); \
+   d = mm128_ror_64( _mm_xor_si128( d, a), 32 ); \
   c = _mm_add_epi64( c, d ); \
-   b = mm_ror_64( _mm_xor_si128( b, c ), 24 ); \
+   b = mm128_ror_64( _mm_xor_si128( b, c ), 24 ); \
   a = _mm_add_epi64( a, b ); \
-   d = mm_ror_64( _mm_xor_si128( d, a ), 16 ); \
+   d = mm128_ror_64( _mm_xor_si128( d, a ), 16 ); \
   c = _mm_add_epi64( c, d ); \
-   b = mm_ror_64( _mm_xor_si128( b, c ), 63 );
+   b = mm128_ror_64( _mm_xor_si128( b, c ), 63 );

 #define LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm_ror256_1x64( s2, s3 ); \
-   mm_swap_128( s4, s5 ); \
-   mm_rol256_1x64( s6, s7 ); \
+   mm128_ror256_1x64( s2, s3 ); \
+   mm128_swap256_128( s4, s5 ); \
+   mm128_rol256_1x64( s6, s7 ); \
   G_2X64( s0, s2, s4, s6 ); \
   G_2X64( s1, s3, s5, s7 ); \
-   mm_rol256_1x64( s2, s3 ); \
-   mm_swap_128( s4, s5 ); \
-   mm_ror256_1x64( s6, s7 );
+   mm128_rol256_1x64( s2, s3 ); \
+   mm128_swap256_128( s4, s5 ); \
+   mm128_ror256_1x64( s6, s7 );

 #define LYRA_12_ROUNDS_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
   LYRA_ROUND_AVX(s0,s1,s2,s3,s4,s5,s6,s7) \
--- a/algo/neoscrypt/neoscrypt.c
+++ b/algo/neoscrypt/neoscrypt.c
@@ -1080,6 +1080,8 @@ void neoscrypt_wait_for_diff( struct stratum_ctx *stratum )
   }
 }

+int neoscrypt_get_work_data_size () { return 80; } // installed as gate->get_work_data_size; same value the old work_data_size field held
+
 bool register_neoscrypt_algo( algo_gate_t* gate )
 {
  gate->optimizations         = SSE2_OPT;
@@ -1092,7 +1094,7 @@ bool register_neoscrypt_algo( algo_gate_t* gate )
  gate->work_decode           = (void*)&std_be_work_decode;
  gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
  gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
-  gate->work_data_size        = 80;
+  gate->get_work_data_size    = (void*)&neoscrypt_get_work_data_size;
  return true;
 };

--- a/algo/nist5/nist5-4way.c
+++ b/algo/nist5/nist5-4way.c
@@ -62,15 +62,15 @@ void nist5hash_4way( void *out, const void *input )

     skein512_4way_init( &ctx_skein );
     skein512_4way( &ctx_skein, vhash, 64 );
-     skein512_4way_close( &ctx_skein, vhash );
-
-     mm256_deinterleave_4x64( out, out+32, out+64, out+96, vhash, 256 );
+     skein512_4way_close( &ctx_skein, out );
 }

 int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,
                         uint64_t *hashes_done)
 {
-     uint32_t hash[4*8] __attribute__ ((aligned (64)));
+     uint32_t hash[4*16] __attribute__ ((aligned (64)));
+     uint32_t *hash7 = &(hash[25]);
+     uint32_t lane_hash[8];
     uint32_t vdata[24*4] __attribute__ ((aligned (64)));
     uint32_t endiandata[20] __attribute__((aligned(64)));
     uint32_t *pdata = work->data;
@@ -120,15 +120,16 @@ int scanhash_nist5_4way( int thr_id, struct work *work, uint32_t max_nonce,

              nist5hash_4way( hash, vdata );

-              pdata[19] = n;
-
-              for ( int i = 0; i < 4; i++ )
-              if ( ( !( (hash+(i<<3))[7] & mask ) == 0 )
-                 && fulltest( hash+(i<<3), ptarget ) )
+              for ( int lane = 0; lane < 4; lane++ ) // pre-filter each lane on its hash word 7 before extracting the lane
+              if ( ( hash7[ lane<<1 ] & mask ) == 0 ) // hash is 4x64 interleaved: lanes sit 2 uint32 apart, so word 7 of lane L is hash[25 + 2*L]
              {
-                 pdata[19] = n+i;         
-                 nonces[ num_found++ ] = n+i;
-                 work_set_target_ratio( work, hash+(i<<3) );
+                 mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+                 if ( fulltest( lane_hash, ptarget ) )
+                 {
+                    pdata[19] = n + lane;
+                    nonces[ num_found++ ] = n + lane;
+                    work_set_target_ratio( work, lane_hash );
+                 }
              }
              n += 4;
           } while ( ( num_found == 0 ) && ( n < max_nonce )
--- a/algo/nist5/zr5.c
+++ b/algo/nist5/zr5.c
@@ -219,6 +219,8 @@ void zr5_display_pok( struct work* work )
        applog(LOG_BLUE, "POK received: %08xx", work->data[0] );
 }

+int zr5_get_work_data_size() { return 80; } // installed as gate->get_work_data_size; same value the old work_data_size field held
+
 bool register_zr5_algo( algo_gate_t* gate )
 {
    gate->optimizations = SSE2_OPT | AES_OPT;
@@ -227,12 +229,12 @@ bool register_zr5_algo( algo_gate_t* gate )
    gate->scanhash              = (void*)&scanhash_zr5;
    gate->hash                  = (void*)&zr5hash;
    gate->get_max64             = (void*)&zr5_get_max64;
-    gate->display_extra_data    = (void*)&zr5_display_pok;
+    gate->decode_extra_data     = (void*)&zr5_display_pok;
    gate->build_stratum_request = (void*)&std_be_build_stratum_request;
    gate->work_decode           = (void*)&std_be_work_decode;
    gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
    gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
-    gate->work_data_size        = 80;
+    gate->get_work_data_size    = (void*)&zr5_get_work_data_size;
    gate->work_cmp_size         = 72;
    return true;
 };
--- a/algo/qubit/deep-2way.c
+++ b/algo/qubit/deep-2way.c
@@ -7,7 +7,7 @@
 #include <string.h>
 #include <stdio.h>
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/cubehash/cubehash_sse2.h" 
 #include "algo/shavite/sph_shavite.h"
 #include "algo/echo/aes_ni/hash_api.h"

--- a/algo/qubit/deep.c
+++ b/algo/qubit/deep.c
@@ -4,7 +4,7 @@
 #include <string.h>
 #include <stdio.h>
 #include "algo/luffa/luffa_for_sse2.h" 
-#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/cubehash/cubehash_sse2.h" 
 #ifndef NO_AES_NI
 #include "algo/echo/aes_ni/hash_api.h"
 #else
--- a/algo/qubit/qubit-2way.c
+++ b/algo/qubit/qubit-2way.c
@@ -7,7 +7,7 @@
 #include <string.h>
 #include <stdio.h>
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/cubehash/cubehash_sse2.h" 
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/echo/aes_ni/hash_api.h"
--- a/algo/qubit/qubit.c
+++ b/algo/qubit/qubit.c
@@ -4,7 +4,7 @@
 #include <string.h>
 #include <stdio.h>
 #include "algo/luffa/luffa_for_sse2.h" 
-#include "algo/cubehash/sse2/cubehash_sse2.h" 
+#include "algo/cubehash/cubehash_sse2.h" 
 #include "algo/simd/nist.h"
 #include "algo/shavite/sph_shavite.h"
 #ifndef NO_AES_NI
--- a/algo/ripemd/lbry-4way.c
+++ b/algo/ripemd/lbry-4way.c
@@ -184,7 +184,8 @@ void lbry_4way_hash( void* output, const void* input )
   sha256_4way( &ctx_sha256, vhashA, 32 );
   sha256_4way_close( &ctx_sha256, vhashA );

-   mm_deinterleave_4x32( output, output+32, output+64, output+96, vhashA, 256 );
+   mm128_deinterleave_4x32( output, output+32, output+64, output+96,
+		            vhashA, 256 );
 }

 int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -209,7 +210,7 @@ int scanhash_lbry_4way( int thr_id, struct work *work, uint32_t max_nonce,

   // we need bigendian data...
   swab32_array( edata, pdata, 32 );
-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 1024 );
   sha256_4way_init( &sha256_mid );
   sha256_4way( &sha256_mid, vdata, LBRY_MIDSTATE );

--- a/algo/ripemd/lbry-gate.c
+++ b/algo/ripemd/lbry-gate.c
@@ -41,8 +41,6 @@ void lbry_le_build_stratum_request( char *req, struct work *work,
   free(xnonce2str);
 }

-// don't use lbry_build_block_header, it can't handle clasim, do it inline
-// in lbry_build_extraheader. The side effect is no gbt support for lbry.
 void lbry_build_block_header( struct work* g_work, uint32_t version,
                             uint32_t *prevhash, uint32_t *merkle_root,
                             uint32_t ntime, uint32_t nbits )
@@ -61,9 +59,6 @@ void lbry_build_block_header( struct work* g_work, uint32_t version,
   for ( i = 0; i < 8; i++ )
      g_work->data[9 + i] = be32dec( merkle_root + i );

-//   for ( int i = 0; i < 8; i++ )
-//        g_work->data[17 + i] = claim[i];
-
   g_work->data[ LBRY_NTIME_INDEX ] = ntime;
   g_work->data[ LBRY_NBITS_INDEX ] = nbits;
   g_work->data[28] = 0x80000000;
@@ -80,10 +75,6 @@ void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
   for ( t = 0; t < sctx->xnonce2_size && !( ++sctx->job.xnonce2[t] ); t++ );
   // Assemble block header 

-//   algo_gate.build_block_header( g_work, le32dec( sctx->job.version ),
-//          (uint32_t*) sctx->job.prevhash, (uint32_t*) merkle_root,
-//          le32dec( sctx->job.ntime ), le32dec( sctx->job.nbits ) );
-
   memset( g_work->data, 0, sizeof(g_work->data) );
   g_work->data[0] = le32dec( sctx->job.version );

@@ -94,7 +85,7 @@ void lbry_build_extraheader( struct work* g_work, struct stratum_ctx* sctx )
      g_work->data[9 + i] = be32dec( (uint32_t *) merkle_root + i );

   for ( int i = 0; i < 8; i++ )
-        g_work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
+        g_work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i];

   g_work->data[ LBRY_NTIME_INDEX ] = le32dec(sctx->job.ntime);
   g_work->data[ LBRY_NBITS_INDEX ] = le32dec(sctx->job.nbits);
@@ -108,6 +99,8 @@ void lbry_set_target( struct work* work, double job_diff )

 int64_t lbry_get_max64() { return 0x1ffffLL; }

+int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; } // installed as gate->get_work_data_size; same constant the old field held
+
 bool register_lbry_algo( algo_gate_t* gate )
 {
  gate->optimizations = AVX2_OPT | SHA_OPT;
@@ -130,7 +123,7 @@ bool register_lbry_algo( algo_gate_t* gate )
  gate->ntime_index           = LBRY_NTIME_INDEX;
  gate->nbits_index           = LBRY_NBITS_INDEX;
  gate->nonce_index           = LBRY_NONCE_INDEX;
-  gate->work_data_size        = LBRY_WORK_DATA_SIZE;
+  gate->get_work_data_size    = (void*)&lbry_get_work_data_size;
  return true;
 }

--- a/algo/ripemd/ripemd-hash-4way.c
+++ b/algo/ripemd/ripemd-hash-4way.c
@@ -32,20 +32,20 @@ static const uint32_t IV[5] =
   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( y, z ), x ), z )

 #define F3(x, y, z) \
-   _mm_xor_si128( _mm_or_si128( x, mm_not( y ) ), z )
+   _mm_xor_si128( _mm_or_si128( x, mm128_not( y ) ), z )

 #define F4(x, y, z) \
   _mm_xor_si128( _mm_and_si128( _mm_xor_si128( x, y ), z ), y )

 #define F5(x, y, z) \
-   _mm_xor_si128( x, _mm_or_si128( y, mm_not( z ) ) )
+   _mm_xor_si128( x, _mm_or_si128( y, mm128_not( z ) ) )

 #define RR(a, b, c, d, e, f, s, r, k) \
 do{ \
-   a = _mm_add_epi32( mm_rol_32( _mm_add_epi32( _mm_add_epi32( \
+   a = _mm_add_epi32( mm128_rol_32( _mm_add_epi32( _mm_add_epi32( \
                _mm_add_epi32( a, f( b ,c, d ) ), r ), \
                                 _mm_set1_epi32( k ) ), s ), e ); \
-   c = mm_rol_32( c, 10 );\
+   c = mm128_rol_32( c, 10 );\
 } while (0)

 #define ROUND1(a, b, c, d, e, f, s, r, k)  \
--- a/algo/sha/sha2-hash-4way.c
+++ b/algo/sha/sha2-hash-4way.c
@@ -98,19 +98,19 @@ static const sph_u32 K256[64] = {

 #define BSG2_0(x) \
   _mm_xor_si128( _mm_xor_si128( \
-        mm_ror_32(x,  2), mm_ror_32(x, 13) ), mm_ror_32( x, 22) )
+        mm128_ror_32(x,  2), mm128_ror_32(x, 13) ), mm128_ror_32( x, 22) )

 #define BSG2_1(x) \
   _mm_xor_si128( _mm_xor_si128( \
-        mm_ror_32(x,  6), mm_ror_32(x, 11) ), mm_ror_32( x, 25) )
+        mm128_ror_32(x,  6), mm128_ror_32(x, 11) ), mm128_ror_32( x, 25) )

 #define SSG2_0(x) \
   _mm_xor_si128( _mm_xor_si128( \
-        mm_ror_32(x,  7), mm_ror_32(x, 18) ), _mm_srli_epi32(x, 3) ) 
+        mm128_ror_32(x,  7), mm128_ror_32(x, 18) ), _mm_srli_epi32(x, 3) ) 

 #define SSG2_1(x) \
   _mm_xor_si128( _mm_xor_si128( \
-        mm_ror_32(x, 17), mm_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )
+        mm128_ror_32(x, 17), mm128_ror_32(x, 19) ), _mm_srli_epi32(x, 10) )

 #define SHA2s_4WAY_STEP(A, B, C, D, E, F, G, H, i, j) \
 do { \
@@ -129,22 +129,22 @@ sha256_4way_round( __m128i *in, __m128i r[8] )
   register  __m128i A, B, C, D, E, F, G, H;
   __m128i W[16];

-   W[ 0] = mm_bswap_32( in[ 0] );
-   W[ 1] = mm_bswap_32( in[ 1] );
-   W[ 2] = mm_bswap_32( in[ 2] );
-   W[ 3] = mm_bswap_32( in[ 3] );
-   W[ 4] = mm_bswap_32( in[ 4] );
-   W[ 5] = mm_bswap_32( in[ 5] );
-   W[ 6] = mm_bswap_32( in[ 6] );
-   W[ 7] = mm_bswap_32( in[ 7] );
-   W[ 8] = mm_bswap_32( in[ 8] );
-   W[ 9] = mm_bswap_32( in[ 9] );
-   W[10] = mm_bswap_32( in[10] );
-   W[11] = mm_bswap_32( in[11] );
-   W[12] = mm_bswap_32( in[12] );
-   W[13] = mm_bswap_32( in[13] );
-   W[14] = mm_bswap_32( in[14] );
-   W[15] = mm_bswap_32( in[15] );
+   W[ 0] = mm128_bswap_32( in[ 0] );
+   W[ 1] = mm128_bswap_32( in[ 1] );
+   W[ 2] = mm128_bswap_32( in[ 2] );
+   W[ 3] = mm128_bswap_32( in[ 3] );
+   W[ 4] = mm128_bswap_32( in[ 4] );
+   W[ 5] = mm128_bswap_32( in[ 5] );
+   W[ 6] = mm128_bswap_32( in[ 6] );
+   W[ 7] = mm128_bswap_32( in[ 7] );
+   W[ 8] = mm128_bswap_32( in[ 8] );
+   W[ 9] = mm128_bswap_32( in[ 9] );
+   W[10] = mm128_bswap_32( in[10] );
+   W[11] = mm128_bswap_32( in[11] );
+   W[12] = mm128_bswap_32( in[12] );
+   W[13] = mm128_bswap_32( in[13] );
+   W[14] = mm128_bswap_32( in[14] );
+   W[15] = mm128_bswap_32( in[15] );

   A = r[0];
   B = r[1];
@@ -289,13 +289,13 @@ void sha256_4way_close( sha256_4way_context *sc, void *dst )
    low = low << 3;

    sc->buf[ pad >> 2 ] =
-                 mm_bswap_32( _mm_set1_epi32( high ) );
+                 mm128_bswap_32( _mm_set1_epi32( high ) );
    sc->buf[ ( pad+4 ) >> 2 ] =
-                 mm_bswap_32( _mm_set1_epi32( low ) );
+                 mm128_bswap_32( _mm_set1_epi32( low ) );
    sha256_4way_round( sc->buf, sc->val );

    for ( u = 0; u < 8; u ++ )
-       ((__m128i*)dst)[u] = mm_bswap_32( sc->val[u] );
+       ((__m128i*)dst)[u] = mm128_bswap_32( sc->val[u] );
 }

 #if defined(__AVX2__)
--- a/algo/sha/sha256t-4way.c
+++ b/algo/sha/sha256t-4way.c
@@ -4,7 +4,6 @@
 #include <string.h>
 #include <stdio.h>
 #include "sha2-hash-4way.h"
-//#include <openssl/sha.h>

 #if defined(SHA256T_8WAY)

@@ -25,11 +24,8 @@ void sha256t_8way_hash( void* output, const void* input )

   sha256_8way_init( &ctx );
   sha256_8way( &ctx, vhash, 32 );
-   sha256_8way_close( &ctx, vhash );
+   sha256_8way_close( &ctx, output );

-   mm256_deinterleave_8x32( output,     output+ 32, output+ 64, output+ 96,
-                            output+128, output+160, output+192, output+224,
-                            vhash, 256 );
 }

 int scanhash_sha256t_8way( int thr_id, struct work *work,
@@ -60,7 +56,7 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,
                               0xFFFF0000,
                                        0 };

-   for ( int k = 0; k < 19; k++ )
+   for ( int k = 0; k < 20; k++ )
      be32enc( &edata[k], pdata[k] );

   mm256_interleave_8x32( vdata, edata, edata, edata, edata,
@@ -84,14 +80,22 @@ int scanhash_sha256t_8way( int thr_id, struct work *work,

         sha256t_8way_hash( hash, vdata );

-         for ( int i = 0; i < 8; i++ )
-         if ( ( !( ( hash+(i<<3) )[7] & mask ) )
-              && fulltest( hash+(i<<3), ptarget ) )
-         {
-            pdata[19] = n+i;
-            nonces[ num_found++ ] = n+i;
-            work_set_target_ratio( work, hash+(i<<3) );
-         }
+         uint32_t *hash7 = &(hash[7<<3]); 
+	 
+         for ( int lane = 0; lane < 8; lane++ )
+         if ( !( hash7[ lane ] & mask ) )
+         { 
+            // deinterleave hash for lane
+	    uint32_t lane_hash[8];
+	    mm256_extract_lane_8x32( lane_hash, hash, lane, 256 );
+
+	    if ( fulltest( lane_hash, ptarget ) )
+            {
+	       pdata[19] = n + lane;
+               nonces[ num_found++ ] = n + lane;
+               work_set_target_ratio( work, lane_hash );
+	    }
+	 }
         n += 8;

      } while ( (num_found == 0) && (n < max_nonce)
@@ -122,10 +126,8 @@ void sha256t_4way_hash( void* output, const void* input )

   sha256_4way_init( &ctx );
   sha256_4way( &ctx, vhash, 32 );
-   sha256_4way_close( &ctx, vhash );
+   sha256_4way_close( &ctx, output );

-   mm_deinterleave_4x32( output,     output+ 32, output+ 64, output+ 96,
-                         vhash, 256 );
 }

 int scanhash_sha256t_4way( int thr_id, struct work *work,
@@ -133,6 +135,8 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
 {
   uint32_t vdata[20*4] __attribute__ ((aligned (64)));
   uint32_t hash[8*4] __attribute__ ((aligned (32)));
+   uint32_t *hash7 = &(hash[7<<2]);
+   uint32_t lane_hash[8];
   uint32_t edata[20] __attribute__ ((aligned (32)));;
   uint32_t *pdata = work->data;
   uint32_t *ptarget = work->target;
@@ -159,7 +163,7 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,
   for ( int k = 0; k < 19; k++ )
      be32enc( &edata[k], pdata[k] );

-   mm_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
+   mm128_interleave_4x32( vdata, edata, edata, edata, edata, 640 );
   sha256_4way_init( &sha256_ctx4 );
   sha256_4way( &sha256_ctx4, vdata, 64 );

@@ -175,15 +179,20 @@ int scanhash_sha256t_4way( int thr_id, struct work *work,

         sha256t_4way_hash( hash, vdata );

-         for ( int i = 0; i < 4; i++ )
-         if ( ( !( ( hash+(i<<3) )[7] & mask ) )
-              && fulltest( hash+(i<<3), ptarget ) )
+         for ( int lane = 0; lane < 4; lane++ )
+         if ( !( hash7[ lane ] & mask ) )
         {
-            pdata[19] = n+i;
-            nonces[ num_found++ ] = n+i;
-            work_set_target_ratio( work, hash+(i<<3) );
+            mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+
+            if ( fulltest( lane_hash, ptarget ) )
+            {
+               pdata[19] = n + lane;
+               nonces[ num_found++ ] = n + lane;
+               work_set_target_ratio( work, lane_hash );
+            }
         }
-         n += 4;
+
+	 n += 4;

      } while ( (num_found == 0) && (n < max_nonce)
                && !work_restart[thr_id].restart );
--- a/algo/sha/sha256t-gate.c
+++ b/algo/sha/sha256t-gate.c
@@ -3,16 +3,18 @@
 bool register_sha256t_algo( algo_gate_t* gate )
 {
 #if defined(SHA256T_8WAY)
+    gate->optimizations = SSE42_OPT | AVX2_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_8way;
    gate->hash       = (void*)&sha256t_8way_hash;
 #elif defined(SHA256T_4WAY)
+    gate->optimizations = SSE42_OPT | AVX2_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t_4way;
    gate->hash       = (void*)&sha256t_4way_hash;
 #else
+    gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
    gate->scanhash   = (void*)&scanhash_sha256t;
    gate->hash       = (void*)&sha256t_hash;
 #endif
-    gate->optimizations = SSE42_OPT | AVX2_OPT | SHA_OPT;
    gate->get_max64  = (void*)&get_max64_0x3ffff;
    return true;
 }
--- a/algo/shabal/shabal-hash-4way.c
+++ b/algo/shabal/shabal-hash-4way.c
@@ -248,22 +248,22 @@ do { \
 */
 #define SWAP_BC \
 do { \
-    mm_swap_128( B0, C0 ); \
-    mm_swap_128( B1, C1 ); \
-    mm_swap_128( B2, C2 ); \
-    mm_swap_128( B3, C3 ); \
-    mm_swap_128( B4, C4 ); \
-    mm_swap_128( B5, C5 ); \
-    mm_swap_128( B6, C6 ); \
-    mm_swap_128( B7, C7 ); \
-    mm_swap_128( B8, C8 ); \
-    mm_swap_128( B9, C9 ); \
-    mm_swap_128( BA, CA ); \
-    mm_swap_128( BB, CB ); \
-    mm_swap_128( BC, CC ); \
-    mm_swap_128( BD, CD ); \
-    mm_swap_128( BE, CE ); \
-    mm_swap_128( BF, CF ); \
+    mm128_swap256_128( B0, C0 ); \
+    mm128_swap256_128( B1, C1 ); \
+    mm128_swap256_128( B2, C2 ); \
+    mm128_swap256_128( B3, C3 ); \
+    mm128_swap256_128( B4, C4 ); \
+    mm128_swap256_128( B5, C5 ); \
+    mm128_swap256_128( B6, C6 ); \
+    mm128_swap256_128( B7, C7 ); \
+    mm128_swap256_128( B8, C8 ); \
+    mm128_swap256_128( B9, C9 ); \
+    mm128_swap256_128( BA, CA ); \
+    mm128_swap256_128( BB, CB ); \
+    mm128_swap256_128( BC, CC ); \
+    mm128_swap256_128( BD, CD ); \
+    mm128_swap256_128( BE, CE ); \
+    mm128_swap256_128( BF, CF ); \
 } while (0)

 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) \
@@ -271,9 +271,9 @@ do { \
   xa0 = _mm_xor_si128( xm, _mm_xor_si128( xb1, _mm_xor_si128(  \
            _mm_andnot_si128( xb3, xb2 ), \
            _mm_mullo_epi32( _mm_xor_si128( xa0, _mm_xor_si128( xc, \
-               _mm_mullo_epi32(  mm_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
+               _mm_mullo_epi32(  mm128_rol_32( xa1, 15 ), _mm_set1_epi32(5UL) ) \
                   ) ), _mm_set1_epi32(3UL) ) ) ) ); \
-   xb0 = mm_not( _mm_xor_si128( xa0, mm_rol_32( xb0, 1 ) ) ); \
+   xb0 = mm128_not( _mm_xor_si128( xa0, mm128_rol_32( xb0, 1 ) ) ); \
 } while (0)

 #define PERM_STEP_0   do { \
@@ -335,22 +335,22 @@ do { \

 #define APPLY_P \
 do { \
-    B0 = mm_ror_32( B0, 15 ); \
-    B1 = mm_ror_32( B1, 15 ); \
-    B2 = mm_ror_32( B2, 15 ); \
-    B3 = mm_ror_32( B3, 15 ); \
-    B4 = mm_ror_32( B4, 15 ); \
-    B5 = mm_ror_32( B5, 15 ); \
-    B6 = mm_ror_32( B6, 15 ); \
-    B7 = mm_ror_32( B7, 15 ); \
-    B8 = mm_ror_32( B8, 15 ); \
-    B9 = mm_ror_32( B9, 15 ); \
-    BA = mm_ror_32( BA, 15 ); \
-    BB = mm_ror_32( BB, 15 ); \
-    BC = mm_ror_32( BC, 15 ); \
-    BD = mm_ror_32( BD, 15 ); \
-    BE = mm_ror_32( BE, 15 ); \
-    BF = mm_ror_32( BF, 15 ); \
+    B0 = mm128_ror_32( B0, 15 ); \
+    B1 = mm128_ror_32( B1, 15 ); \
+    B2 = mm128_ror_32( B2, 15 ); \
+    B3 = mm128_ror_32( B3, 15 ); \
+    B4 = mm128_ror_32( B4, 15 ); \
+    B5 = mm128_ror_32( B5, 15 ); \
+    B6 = mm128_ror_32( B6, 15 ); \
+    B7 = mm128_ror_32( B7, 15 ); \
+    B8 = mm128_ror_32( B8, 15 ); \
+    B9 = mm128_ror_32( B9, 15 ); \
+    BA = mm128_ror_32( BA, 15 ); \
+    BB = mm128_ror_32( BB, 15 ); \
+    BC = mm128_ror_32( BC, 15 ); \
+    BD = mm128_ror_32( BD, 15 ); \
+    BE = mm128_ror_32( BE, 15 ); \
+    BF = mm128_ror_32( BF, 15 ); \
    PERM_STEP_0; \
    PERM_STEP_1; \
    PERM_STEP_2; \
--- a/algo/shavite/sph-shavite-aesni.c
+++ b/algo/shavite/sph-shavite-aesni.c
@@ -64,11 +64,11 @@ static const sph_u32 IV512[] = {
 // a[3:0] = { b[0], a[3], a[2], a[1] }
 #if defined(__SSSE3__)

-#define mm_ror256hi_1x32( a, b )  _mm_alignr_epi8( b, a, 4 )
+#define mm128_ror256hi_1x32( a, b )  _mm_alignr_epi8( b, a, 4 )

 #else  // SSE2

-#define mm_ror256hi_1x32( a, b ) \
+#define mm128_ror256hi_1x32( a, b ) \
   _mm_or_si128( _mm_srli_si128( a,  4 ), \
                 _mm_slli_si128( b, 12 ) )

@@ -136,7 +136,7 @@ c512( sph_shavite_big_context *sc, const void *msg )
   for ( r = 0; r < 3; r ++ )
   {
      // round 1, 5, 9
-      k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
+      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 ); 

      if ( r == 0 )
@@ -145,7 +145,7 @@ c512( sph_shavite_big_context *sc, const void *msg )

      x = _mm_xor_si128( p0, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
+      k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      if ( r == 1 )
@@ -154,33 +154,33 @@ c512( sph_shavite_big_context *sc, const void *msg )

      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
+      k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );

      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
+      k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );

      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      p3 = _mm_xor_si128( p3, x );
-      k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
+      k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p2, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
+      k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );

      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
+      k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );

      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
+      k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      if ( r == 2 )
@@ -193,80 +193,80 @@ c512( sph_shavite_big_context *sc, const void *msg )

      // round 2, 6, 10

-      k00 = _mm_xor_si128( k00, mm_ror256hi_1x32( k12, k13 ) );
+      k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );
      x = _mm_xor_si128( p3, k00 );
      x = _mm_aesenc_si128( x, m128_zero );

-      k01 = _mm_xor_si128( k01, mm_ror256hi_1x32( k13, k00 ) );
+      k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );
      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );

-      k02 = _mm_xor_si128( k02, mm_ror256hi_1x32( k00, k01 ) );
+      k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );
      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );

-      k03 = _mm_xor_si128( k03, mm_ror256hi_1x32( k01, k02 ) );
+      k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );
      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );

      p2 = _mm_xor_si128( p2, x );
-      k10 = _mm_xor_si128( k10, mm_ror256hi_1x32( k02, k03 ) );
+      k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );
      x = _mm_xor_si128( p1, k10 );
      x = _mm_aesenc_si128( x, m128_zero );

-      k11 = _mm_xor_si128( k11, mm_ror256hi_1x32( k03, k10 ) );
+      k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );
      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );

-      k12 = _mm_xor_si128( k12, mm_ror256hi_1x32( k10, k11 ) );
+      k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );
      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );

-      k13 = _mm_xor_si128( k13, mm_ror256hi_1x32( k11, k12 ) );
+      k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );
      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, m128_zero );
      p0 = _mm_xor_si128( p0, x );

      // round 3, 7, 11

-      k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
+      k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
      k00 = _mm_xor_si128( k00, k13 );

      x = _mm_xor_si128( p2, k00 );
      x = _mm_aesenc_si128( x, m128_zero );

-      k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
+      k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) );
      k01 = _mm_xor_si128( k01, k00 );

      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
+      k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
      k02 = _mm_xor_si128( k02, k01 );

      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
+      k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
      k03 = _mm_xor_si128( k03, k02 );

      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      p1 = _mm_xor_si128( p1, x );
-      k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
+      k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
      k10 = _mm_xor_si128( k10, k03 );

      x = _mm_xor_si128( p0, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
+      k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
      k11 = _mm_xor_si128( k11, k10 );

      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
+      k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
      k12 = _mm_xor_si128( k12, k11 );

      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
+      k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
      k13 = _mm_xor_si128( k13, k12 );

      x = _mm_xor_si128( x, k13 );
@@ -275,36 +275,36 @@ c512( sph_shavite_big_context *sc, const void *msg )

      // round 4, 8, 12

-      k00 = _mm_xor_si128( k00, mm_ror256hi_1x32( k12, k13 ) );
+      k00 = _mm_xor_si128( k00, mm128_ror256hi_1x32( k12, k13 ) );

      x = _mm_xor_si128( p1, k00 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k01 = _mm_xor_si128( k01, mm_ror256hi_1x32( k13, k00 ) );
+      k01 = _mm_xor_si128( k01, mm128_ror256hi_1x32( k13, k00 ) );

      x = _mm_xor_si128( x, k01 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k02 = _mm_xor_si128( k02, mm_ror256hi_1x32( k00, k01 ) );
+      k02 = _mm_xor_si128( k02, mm128_ror256hi_1x32( k00, k01 ) );

      x = _mm_xor_si128( x, k02 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k03 = _mm_xor_si128( k03, mm_ror256hi_1x32( k01, k02 ) );
+      k03 = _mm_xor_si128( k03, mm128_ror256hi_1x32( k01, k02 ) );

      x = _mm_xor_si128( x, k03 );
      x = _mm_aesenc_si128( x, m128_zero );
      p0 = _mm_xor_si128( p0, x );
-      k10 = _mm_xor_si128( k10, mm_ror256hi_1x32( k02, k03 ) );
+      k10 = _mm_xor_si128( k10, mm128_ror256hi_1x32( k02, k03 ) );

      x = _mm_xor_si128( p3, k10 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k11 = _mm_xor_si128( k11, mm_ror256hi_1x32( k03, k10 ) );
+      k11 = _mm_xor_si128( k11, mm128_ror256hi_1x32( k03, k10 ) );

      x = _mm_xor_si128( x, k11 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k12 = _mm_xor_si128( k12, mm_ror256hi_1x32( k10, k11 ) );
+      k12 = _mm_xor_si128( k12, mm128_ror256hi_1x32( k10, k11 ) );

      x = _mm_xor_si128( x, k12 );
      x = _mm_aesenc_si128( x, m128_zero );
-      k13 = _mm_xor_si128( k13, mm_ror256hi_1x32( k11, k12 ) );
+      k13 = _mm_xor_si128( k13, mm128_ror256hi_1x32( k11, k12 ) );

      x = _mm_xor_si128( x, k13 );
      x = _mm_aesenc_si128( x, m128_zero );
@@ -313,44 +313,44 @@ c512( sph_shavite_big_context *sc, const void *msg )

   // round 13

-   k00 = mm_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
+   k00 = mm128_ror_1x32( _mm_aesenc_si128( k00, m128_zero ) );
   k00 = _mm_xor_si128( k00, k13 );

   x = _mm_xor_si128( p0, k00 );
   x = _mm_aesenc_si128( x, m128_zero );
-   k01 = mm_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) ); 
+   k01 = mm128_ror_1x32( _mm_aesenc_si128( k01, m128_zero ) ); 
   k01 = _mm_xor_si128( k01, k00 );

   x = _mm_xor_si128( x, k01 );
   x = _mm_aesenc_si128( x, m128_zero );
-   k02 = mm_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
+   k02 = mm128_ror_1x32( _mm_aesenc_si128( k02, m128_zero ) );
   k02 = _mm_xor_si128( k02, k01 );

   x = _mm_xor_si128( x, k02 );
   x = _mm_aesenc_si128( x, m128_zero );
-   k03 = mm_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
+   k03 = mm128_ror_1x32( _mm_aesenc_si128( k03, m128_zero ) );
   k03 = _mm_xor_si128( k03, k02 );

   x = _mm_xor_si128( x, k03 );
   x = _mm_aesenc_si128( x, m128_zero );
   p3 = _mm_xor_si128( p3, x );
-   k10 = mm_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
+   k10 = mm128_ror_1x32( _mm_aesenc_si128( k10, m128_zero ) );
   k10 = _mm_xor_si128( k10, k03 );

   x = _mm_xor_si128( p2, k10 );
   x = _mm_aesenc_si128( x, m128_zero );
-   k11 = mm_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
+   k11 = mm128_ror_1x32( _mm_aesenc_si128( k11, m128_zero ) );
   k11 = _mm_xor_si128( k11, k10 );

   x = _mm_xor_si128( x, k11 );
   x = _mm_aesenc_si128( x, m128_zero );
-   k12 = mm_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
+   k12 = mm128_ror_1x32( _mm_aesenc_si128( k12, m128_zero ) );
   k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32(
               ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) );

   x = _mm_xor_si128( x, k12 );
   x = _mm_aesenc_si128( x, m128_zero );
-   k13 = mm_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
+   k13 = mm128_ror_1x32( _mm_aesenc_si128( k13, m128_zero ) );
   k13 = _mm_xor_si128( k13, k12 );

   x = _mm_xor_si128( x, k13 );
--- a/algo/simd/simd-hash-2way.c
+++ b/algo/simd/simd-hash-2way.c
@@ -198,13 +198,13 @@ do { \
 #undef BUTTERFLY_N

  // Multiply by twiddle factors
-  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
-  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
-  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
-  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
-  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
-  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
-  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );
+  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].v256 );
+  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].v256 );
+  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].v256 );
+  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].v256 );
+  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].v256 );
+  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].v256 );
+  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].v256 );

  // Transpose the FFT state with a revbin order permutation
  // on the rows and the column.
@@ -319,7 +319,7 @@ void fft128_2way( void *a )
    B[ i ]   = REDUCE_FULL_S( B[ i ] );
    A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] );
    A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
-    A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i );
+    A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].v256 );
    A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
  }

@@ -347,10 +347,10 @@ void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final )
 do { \
    __m256i t = X[i]; \
    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
-    A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \
+    A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].v256 ); \
    A[2*i+8] = REDUCE(A[2*i+8]); \
    A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \
-    A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \
+    A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].v256 ); \
    A[2*i+9] = REDUCE(A[2*i+9]); \
 } while(0)

@@ -360,12 +360,12 @@ do { \
    __m256i t = X[i]; \
    __m256i tmp; \
    A[2*i]   = _mm256_unpacklo_epi8( t, m256_zero ); \
-    A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \
+    A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].v256 ); \
    A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \
    tmp      = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[2*i+1] = _mm256_add_epi16( tmp, tw ); \
    A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
-                                   FFT128_Twiddle[ 2*i+1 ].m256i );\
+                                   FFT128_Twiddle[ 2*i+1 ].v256 );\
    A[2*i+9] = REDUCE( A[ 2*i+9 ] );                       \
 } while(0)

@@ -373,9 +373,9 @@ do { \
  UNPACK( 1 );
  UNPACK( 2 );
  if ( final )
-    UNPACK_TWEAK( 3, FinalTweak.m256i );
+    UNPACK_TWEAK( 3, FinalTweak.v256 );
  else
-    UNPACK_TWEAK( 3, Tweak.m256i );
+    UNPACK_TWEAK( 3, Tweak.v256 );

 #undef UNPACK
 #undef UNPACK_TWEAK
@@ -398,11 +398,11 @@ do { \
    __m256i t = X[i]; \
    A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
    A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
-                                        FFT256_Twiddle[ 2*i ].m256i ); \
+                                        FFT256_Twiddle[ 2*i ].v256 ); \
    A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
    A[ 2*i +  1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \
-                                        FFT256_Twiddle[ 2*i + 1 ].m256i ); \
+                                        FFT256_Twiddle[ 2*i + 1 ].v256 ); \
    A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \
 } while(0)

@@ -413,12 +413,12 @@ do { \
    __m256i tmp; \
    A[ 2*i      ] = _mm256_unpacklo_epi8( t, m256_zero ); \
    A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \
-                                        FFT256_Twiddle[ 2*i ].m256i ); \
+                                        FFT256_Twiddle[ 2*i ].v256 ); \
    A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \
    tmp           = _mm256_unpackhi_epi8( t, m256_zero ); \
    A[ 2*i +  1 ] = _mm256_add_epi16( tmp, tw ); \
    A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \
-                                        FFT256_Twiddle[ 2*i + 1 ].m256i ); \
+                                        FFT256_Twiddle[ 2*i + 1 ].v256 ); \
  } while(0)

  UNPACK( 0 );
@@ -429,9 +429,9 @@ do { \
  UNPACK( 5 );
  UNPACK( 6 );
  if ( final )
-    UNPACK_TWEAK( 7, FinalTweak.m256i );
+    UNPACK_TWEAK( 7, FinalTweak.v256 );
  else
-    UNPACK_TWEAK( 7, Tweak.m256i );
+    UNPACK_TWEAK( 7, Tweak.v256 );

 #undef UNPACK
 #undef UNPACK_TWEAK
@@ -447,7 +447,7 @@ void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft )
  __m256i *S = (__m256i*) state;
  __m256i *M = (__m256i*) msg;
  __m256i *W = (__m256i*) fft;
-  static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) };
+  static const m256_v16 code[] = { mm256_const1_16(185), mm256_const1_16(233) };

  S0l = _mm256_xor_si256( S[0], M[0] );
  S0h = _mm256_xor_si256( S[1], M[1] );
@@ -612,9 +612,9 @@ do { \
    int a = MSG_##u(hh); \
    int b = MSG_##u(ll); \
    w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \
-    w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \
+    w##l = _mm256_mullo_epi16( w##l, code[z].v256 ); \
    w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \
-    w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \
+    w##h = _mm256_mullo_epi16( w##h, code[z].v256 ); \
 } while(0)

 #define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \
--- a/algo/skein/skein-4way.c
+++ b/algo/skein/skein-4way.c
@@ -21,9 +21,9 @@ void skeinhash_4way( void *state, const void *input )

     sha256_4way_init( &ctx_sha256 );
     sha256_4way( &ctx_sha256, vhash32, 64 );
-     sha256_4way_close( &ctx_sha256, vhash32 );
+     // Digest stays 4x32 interleaved; the scan loop extracts lanes from it.
+     sha256_4way_close( &ctx_sha256, state );

-     mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash32, 256 );
 }

 int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
@@ -31,6 +31,8 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,
 {
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t hash[8*4] __attribute__ ((aligned (64)));
+    uint32_t lane_hash[8];
+    uint32_t *hash7 = &(hash[7<<2]);
    uint32_t edata[20] __attribute__ ((aligned (64)));
    uint32_t *pdata = work->data;
    uint32_t *ptarget = work->target;
@@ -58,12 +60,16 @@ int scanhash_skein_4way( int thr_id, struct work *work, uint32_t max_nonce,

       skeinhash_4way( hash, vdata );

-       for ( int i = 0; i < 4; i++ )
-       if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+       for ( int lane = 0; lane < 4; lane++ )
+       if (  hash7[ lane ] <= Htarg )
       {
-           pdata[19] = n+i;
-           nonces[ num_found++ ] = n+i;
-           work_set_target_ratio( work, hash+(i<<3) );
+          mm128_extract_lane_4x32( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )
+          {
+             pdata[19] = n + lane;
+             nonces[ num_found++ ] = n + lane;
+             work_set_target_ratio( work, lane_hash );
+          }
       }
       n += 4;
    } while ( (num_found == 0) && (n < max_nonce)
--- a/algo/skein/skein2-4way.c
+++ b/algo/skein/skein2-4way.c
@@ -9,7 +9,6 @@ void skein2hash_4way( void *output, const void *input )
 {
   skein512_4way_context ctx;
   uint64_t hash[8*4] __attribute__ ((aligned (64)));
-   uint64_t *out64 = (uint64_t*)output;

   skein512_4way_init( &ctx );
   skein512_4way( &ctx, input, 80 );
@@ -17,15 +16,14 @@ void skein2hash_4way( void *output, const void *input )

   skein512_4way_init( &ctx );
   skein512_4way( &ctx, hash, 64 );
-   skein512_4way_close( &ctx, hash );
-
-   mm256_deinterleave_4x64( out64, out64+4, out64+8, out64+12, hash, 256 );
+   skein512_4way_close( &ctx, output );
 }

 int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
                          uint64_t *hashes_done )
 {
-    uint32_t hash[8*4] __attribute__ ((aligned (64)));
+    uint32_t hash[16*4] __attribute__ ((aligned (64)));  // 4 lanes x 512 bits
+    uint32_t *hash7 = &(hash[25]);
    uint32_t vdata[20*4] __attribute__ ((aligned (64)));
    uint32_t endiandata[20] __attribute__ ((aligned (64)));
    uint64_t *edata = (uint64_t*)endiandata;
@@ -34,7 +32,6 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,
    const uint32_t Htarg = ptarget[7];
    const uint32_t first_nonce = pdata[19];
    uint32_t n = first_nonce;
-    // hash is returned deinterleaved
    uint32_t *nonces = work->nonces;
    int num_found = 0;

@@ -53,12 +50,18 @@ int scanhash_skein2_4way( int thr_id, struct work *work, uint32_t max_nonce,

       skein2hash( hash, vdata );

-       for ( int i = 0; i < 4; i++ )
-       if ( (hash+(i<<3))[7] <= Htarg && fulltest( hash+(i<<3), ptarget ) )
+       for ( int lane = 0; lane < 4; lane++ )
+       if ( hash7[ lane<<1 ] <= Htarg )  // 4x64 layout: lanes are 2 words apart
       {
-          pdata[19] = n+i;
-          nonces[ num_found++ ] = n+i;
-          work_set_target_ratio( work, hash+(i<<3) );
+          // deinterleave hash for lane
+          uint32_t lane_hash[8];
+          mm256_extract_lane_4x64( lane_hash, hash, lane, 256 );
+          if ( fulltest( lane_hash, ptarget ) )
+          {
+             pdata[19] = n + lane;
+             nonces[ num_found++ ] = n + lane;
+             work_set_target_ratio( work, lane_hash );
+          }
       }
       n += 4;
    } while ( (num_found == 0) && (n < max_nonce)
--- a/algo/sm3/sm3-hash-4way.c
+++ b/algo/sm3/sm3-hash-4way.c
@@ -125,20 +125,20 @@ void sm3_4way_close( void *cc, void *dst )
      memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 );
   }

-   count[0] = mm_bswap_32(
+   count[0] = mm128_bswap_32(
                  _mm_set1_epi32( ctx->nblocks >> 23 ) );
-   count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
+   count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) +
                                              ( ctx->num     << 3 ) ) );
   sm3_4way_compress( ctx->digest, block );

   for ( i = 0; i < 8 ; i++ )
-     hash[i] = mm_bswap_32( ctx->digest[i] );
+     hash[i] = mm128_bswap_32( ctx->digest[i] );
 }

-#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rol_32( x,  9 ), \
-                                               mm_rol_32( x, 17 ) ) ) 
-#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm_rol_32( x, 15 ), \
-                                               mm_rol_32( x, 23 ) ) ) 
+#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x,  9 ), \
+                                               mm128_rol_32( x, 17 ) ) ) 
+#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \
+                                               mm128_rol_32( x, 23 ) ) ) 

 #define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) )
 #define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \
@@ -165,13 +165,13 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
   int j;

   for ( j = 0; j < 16; j++ )
-      W[j] = mm_bswap_32( block[j] );
+      W[j] = mm128_bswap_32( block[j] );

   for ( j = 16; j < 68; j++ )
      W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ],
                                                              W[ j-9 ] ),
-                                               mm_rol_32( W[ j-3 ], 15 ) ) ),
-                            _mm_xor_si128( mm_rol_32( W[ j-13 ], 7 ),
+                                               mm128_rol_32( W[ j-3 ], 15 ) ) ),
+                            _mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ),
                                           W[ j-6 ] ) );

   for( j = 0; j < 64; j++ )
@@ -180,19 +180,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
   T = _mm_set1_epi32( 0x79CC4519UL );
   for( j =0; j < 16; j++ )
   {
-      SS1 = mm_rol_32( _mm_add_epi32( _mm_add_epi32( mm_rol_32( A, 12 ), E ),
-                                      mm_rol_32( T, j ) ), 7 );
-      SS2 = _mm_xor_si128( SS1, mm_rol_32( A, 12 ) );
+      SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
+                                      mm128_rol_32( T, j ) ), 7 );
+      SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
      TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ),
                                          SS2 ), W1[j] );
      TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ),
                                          SS1 ), W[j] );
      D = C;
-      C = mm_rol_32( B, 9 );
+      C = mm128_rol_32( B, 9 );
      B = A;
      A = TT1;
      H = G;
-      G = mm_rol_32( F, 19 );
+      G = mm128_rol_32( F, 19 );
      F = E;
      E = P0( TT2 );
   }
@@ -200,19 +200,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block )
   T = _mm_set1_epi32( 0x7A879D8AUL );
   for( j =16; j < 64; j++ )
   {
-      SS1 = mm_rol_32( _mm_add_epi32( _mm_add_epi32( mm_rol_32( A, 12 ), E ),
-                                      mm_rol_32( T, j&31 ) ), 7 );
-      SS2 = _mm_xor_si128( SS1, mm_rol_32( A, 12 ) );
+      SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ),
+                                      mm128_rol_32( T, j&31 ) ), 7 );
+      SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) );
      TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ), 
                                          SS2 ), W1[j] );
      TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ),
                                          SS1 ), W[j] );
      D = C;
-      C = mm_rol_32( B, 9 );
+      C = mm128_rol_32( B, 9 );
      B = A;
      A = TT1;
      H = G;
-      G = mm_rol_32( F, 19 );
+      G = mm128_rol_32( F, 19 );
      F = E;
      E = P0( TT2 );
   }
--- a/algo/x11/c11-4way.c
+++ b/algo/x11/c11-4way.c
@@ -13,7 +13,7 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
--- a/algo/x11/c11.c
+++ b/algo/x11/c11.c
@@ -23,7 +23,7 @@
 #endif

 #include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"
 #include "algo/keccak/sse2/keccak.c"
--- a/algo/x11/timetravel-4way.c
+++ b/algo/x11/timetravel-4way.c
@@ -13,7 +13,7 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"

 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread int permutation[TT8_FUNC_COUNT] = { 0 };
--- a/algo/x11/timetravel.c
+++ b/algo/x11/timetravel.c
@@ -10,7 +10,7 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #ifdef NO_AES_NI
  #include "algo/groestl/sph_groestl.h"
 #else
--- a/algo/x11/timetravel10-4way.c
+++ b/algo/x11/timetravel10-4way.c
@@ -13,7 +13,7 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"

--- a/algo/x11/timetravel10.c
+++ b/algo/x11/timetravel10.c
@@ -9,7 +9,7 @@
 #include "algo/keccak/sph_keccak.h"
 #include "algo/skein/sph_skein.h"
 #include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/nist.h"

--- a/algo/x11/x11-4way.c
+++ b/algo/x11/x11-4way.c
@@ -12,7 +12,7 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
--- a/algo/x11/x11.c
+++ b/algo/x11/x11.c
@@ -20,7 +20,7 @@
 #endif

 #include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"  
 #include "algo/keccak/sse2/keccak.c"
--- a/algo/x11/x11evo-4way.c
+++ b/algo/x11/x11evo-4way.c
@@ -15,7 +15,7 @@
 #include "algo/groestl/aes_ni/hash-groestl.h"
 #include "algo/echo/aes_ni/hash_api.h"
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/simd-hash-2way.h"

 typedef struct {
--- a/algo/x11/x11evo-gate.c
+++ b/algo/x11/x11evo-gate.c
@@ -31,11 +31,13 @@ int nextPerm( uint8_t n[], uint32_t count )
      return 0;

   for ( i = count - 1; i>0 && n[i - 1] >= n[i]; i-- );
-      tail = i;
+   tail = i;

   if ( tail > 0 )
+   {
      for ( j = count - 1; j>tail && n[j] <= n[tail - 1]; j-- );
-           evo_swap( &n[tail - 1], &n[j] );
+      evo_swap( &n[tail - 1], &n[j] );
+   }

   for ( i = tail, j = count - 1; i<j; i++, j-- )
      evo_swap( &n[i], &n[j] );
--- a/algo/x11/x11evo.c
+++ b/algo/x11/x11evo.c
@@ -23,7 +23,7 @@
 #endif

 #include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"

 typedef struct {
--- a/algo/x11/x11gost-4way.c
+++ b/algo/x11/x11gost-4way.c
@@ -14,7 +14,7 @@
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
--- a/algo/x11/x11gost.c
+++ b/algo/x11/x11gost.c
@@ -11,7 +11,7 @@
 #include "algo/echo/sph_echo.h"

 #include "algo/luffa/luffa_for_sse2.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"
 #include "algo/keccak/sse2/keccak.c"
--- a/algo/x12/x12-4way.c
+++ b/algo/x12/x12-4way.c
@@ -13,7 +13,7 @@
 #include "algo/jh/jh-hash-4way.h"
 #include "algo/keccak/keccak-hash-4way.h"
 #include "algo/luffa/luffa-hash-2way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/simd-hash-2way.h"
 #include "algo/echo/aes_ni/hash_api.h"
--- a/algo/x12/x12.c
+++ b/algo/x12/x12.c
@@ -20,7 +20,7 @@
 //#include "algo/fugue/sph_fugue.h"

 #include "algo/luffa/luffa_for_sse2.h" 
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/simd/nist.h"
 #include "algo/blake/sse2/blake.c"   
 #include "algo/bmw/sse2/bmw.c"
--- a/algo/x13/drop.c
+++ b/algo/x13/drop.c
@@ -238,6 +238,8 @@ void drop_display_pok( struct work* work )
        applog(LOG_BLUE, "POK received: %08xx", work->data[0] );
 }

+int drop_get_work_data_size() { return 80; }
+
 // Need to fix POK offset problems like zr5
 bool register_drop_algo( algo_gate_t* gate )
 {
@@ -250,8 +252,8 @@ bool register_drop_algo( algo_gate_t* gate )
    gate->work_decode           = (void*)&std_be_work_decode;
    gate->submit_getwork_result = (void*)&std_be_submit_getwork_result;
    gate->set_work_data_endian  = (void*)&set_work_data_big_endian;
-    gate->display_extra_data    = (void*)&drop_display_pok;
-    gate->work_data_size        = 80;
+    gate->decode_extra_data     = (void*)&drop_display_pok;
+    gate->get_work_data_size    = (void*)&drop_get_work_data_size;
    gate->work_cmp_size         = 72;
    return true;
 };
--- a/algo/x13/phi1612-4way.c
+++ b/algo/x13/phi1612-4way.c
@@ -8,7 +8,7 @@
 #include <stdio.h>
 #include "algo/skein/skein-hash-4way.h"
 #include "algo/jh/jh-hash-4way.h"
-#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/cubehash/cubehash_sse2.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/gost/sph_gost.h"
 #include "algo/echo/aes_ni/hash_api.h"
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Jay D Dee	e1aead3c76	v3.9.0	2019-05-19 13:39:45 -04:00
Jay D Dee	bfd1c002f9	v3.8.8.1	2018-05-11 11:52:36 -04:00
Jay D Dee	9edc650042	v3.8.7.2	2018-04-11 13:44:26 -04:00
Jay D Dee	218cef337a	v3.8.7.1	2018-04-10 21:49:06 -04:00
Jay D Dee	9ffce7bdb7	v3.8.7	2018-04-09 19:14:38 -04:00