From a28daca3ce761b60ab563bdb3a0f7384226f0713 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Wed, 7 Feb 2018 16:38:45 -0500 Subject: [PATCH] v3.8.1 --- Makefile.am | 12 +- README.md | 11 +- README.txt | 9 + RELEASE_NOTES | 16 +- algo/blake/blake-hash-4way.c | 190 +-- algo/bmw/bmw-hash-4way.c | 2 +- algo/cubehash/sse2/cubehash_sse2.c | 2 +- algo/hamsi/hamsi-hash-4way.c | 977 ++++++++---- algo/hamsi/hamsi-hash-4way.h | 14 +- algo/hamsi/hamsi-helper-4way.c | 482 ------ algo/hamsi/sph_hamsi.c.test | 940 +++++++++++ algo/haval/haval-4way-helper.c | 2 +- algo/heavy/bastion.c | 2 +- algo/hodl/hodl-gate.c | 1 + algo/jh/jha-4way.c | 2 +- algo/keccak/keccak-hash-4way.c | 14 +- algo/luffa/luffa-hash-2way.c | 568 +++++++ algo/luffa/luffa-hash-2way.h | 69 + algo/luffa/{sse2 => }/luffa_for_sse2.c | 24 +- algo/luffa/{sse2 => }/luffa_for_sse2.h | 0 algo/quark/anime-4way.c | 6 +- algo/quark/quark-4way.c | 6 +- algo/qubit/deep-2way.c | 130 ++ algo/qubit/deep-gate.c | 17 + algo/qubit/deep-gate.h | 32 + algo/qubit/deep.c | 13 +- algo/qubit/qubit-2way.c | 138 ++ algo/qubit/qubit-gate.c | 17 + algo/qubit/qubit-gate.h | 32 + algo/qubit/qubit.c | 19 +- algo/scrypt.c | 1 + algo/sha/md-helper-4way.c | 12 +- algo/sha/sha2-hash-4way.c | 8 +- algo/shavite/sph-shavite-aesni.c | 156 +- algo/simd/{sse2 => }/nist.c | 0 algo/simd/{sse2 => }/nist.h | 0 algo/simd/{sse2 => }/simd-compat.h | 0 algo/simd/simd-hash-2way.c | 853 ++++++++++ algo/simd/simd-hash-2way.h | 27 + algo/simd/{sse2 => }/simd_iv.h | 5 + algo/simd/sse2/defs_x5.h | 23 - algo/simd/sse2/sph_types.h | 1976 ------------------------ algo/simd/{sse2 => }/vector.c | 23 +- algo/simd/{sse2 => }/vector.h | 0 algo/sm3/sm3-hash-4way.c | 8 +- algo/whirlpool/md-helper-4way.c | 12 +- algo/x11/c11-4way.c | 49 +- algo/x11/c11.c | 4 +- algo/x11/timetravel-4way.c | 24 +- algo/x11/timetravel.c | 2 +- algo/x11/timetravel10-4way.c | 48 +- algo/x11/timetravel10.c | 4 +- algo/x11/x11-4way.c | 54 +- algo/x11/x11.c | 6 +- algo/x11/x11evo-4way.c | 53 +- algo/x11/x11evo.c | 4 +- algo/x11/x11gost-4way.c | 48 +- algo/x11/x11gost.c | 4 +- algo/x13/x13-4way.c | 52 +- algo/x13/x13.c | 4 +- algo/x13/x13sm3-4way.c | 57 +- algo/x13/x13sm3.c | 4 +- algo/x14/polytimos-4way.c | 35 +- algo/x14/polytimos.c | 2 +- algo/x14/x14-4way.c | 53 +- algo/x14/x14.c | 4 +- algo/x15/x15-4way.c | 70 +- algo/x15/x15.c | 4 +- algo/x17/hmq1725.c | 4 +- algo/x17/x16r-4way.c | 83 +- algo/x17/x16r.c | 6 +- algo/x17/x17-4way.c | 56 +- algo/x17/x17.c | 4 +- algo/x17/xevan-4way.c | 98 +- algo/x17/xevan.c | 4 +- algo/yescrypt/yescrypt.c | 19 +- avxdefs.h | 1313 +++++++++++----- build-allarch.sh | 10 - build.sh | 4 +- configure | 20 +- configure.ac | 2 +- cpu-miner.c | 8 +- miner.h | 10 +- 83 files changed, 5153 insertions(+), 3924 deletions(-) delete mode 100644 algo/hamsi/hamsi-helper-4way.c create mode 100644 algo/hamsi/sph_hamsi.c.test create mode 100644 algo/luffa/luffa-hash-2way.c create mode 100644 algo/luffa/luffa-hash-2way.h rename algo/luffa/{sse2 => }/luffa_for_sse2.c (96%) rename algo/luffa/{sse2 => }/luffa_for_sse2.h (100%) create mode 100644 algo/qubit/deep-2way.c create mode 100644 algo/qubit/deep-gate.c create mode 100644 algo/qubit/deep-gate.h create mode 100644 algo/qubit/qubit-2way.c create mode 100644 algo/qubit/qubit-gate.c create mode 100644 algo/qubit/qubit-gate.h rename algo/simd/{sse2 => }/nist.c (100%) rename algo/simd/{sse2 => }/nist.h (100%) rename algo/simd/{sse2 => }/simd-compat.h (100%) create mode 100644 algo/simd/simd-hash-2way.c create mode 100644 algo/simd/simd-hash-2way.h rename 
algo/simd/{sse2 => }/simd_iv.h (95%) delete mode 100644 algo/simd/sse2/defs_x5.h delete mode 100644 algo/simd/sse2/sph_types.h rename algo/simd/{sse2 => }/vector.c (99%) rename algo/simd/{sse2 => }/vector.h (100%) diff --git a/Makefile.am b/Makefile.am index e91c49e..6f2ff8a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -100,7 +100,8 @@ cpuminer_SOURCES = \ algo/lbry.c \ algo/luffa/sph_luffa.c \ algo/luffa/luffa.c \ - algo/luffa/sse2/luffa_for_sse2.c \ + algo/luffa/luffa_for_sse2.c \ + algo/luffa/luffa-hash-2way.c \ algo/lyra2/lyra2.c \ algo/lyra2/sponge.c \ algo/lyra2/lyra2rev2-gate.c \ @@ -127,7 +128,11 @@ cpuminer_SOURCES = \ algo/quark/anime-gate.c \ algo/quark/anime.c \ algo/quark/anime-4way.c \ + algo/qubit/qubit-gate.c \ algo/qubit/qubit.c \ + algo/qubit/qubit-2way.c \ + algo/qubit/deep-gate.c \ + algo/qubit/deep-2way.c \ algo/qubit/deep.c \ algo/ripemd/sph_ripemd.c \ algo/scrypt.c \ @@ -143,8 +148,9 @@ cpuminer_SOURCES = \ algo/shavite/sph-shavite-aesni.c \ algo/shavite/shavite.c \ algo/simd/sph_simd.c \ - algo/simd/sse2/nist.c \ - algo/simd/sse2/vector.c \ + algo/simd/nist.c \ + algo/simd/vector.c \ + algo/simd/simd-hash-2way.c \ algo/skein/sph_skein.c \ algo/skein/skein-hash-4way.c \ algo/skein/skein.c \ diff --git a/README.md b/README.md index e88afd6..5d92894 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ See file RELEASE_NOTES for change log and compile instructions. Supported Algorithms -------------------- + anime Animecoin argon2 axiom Shabal-256 MemoHash bastion @@ -78,6 +79,7 @@ Supported Algorithms x13sm3 hsr (Hshare) x14 X14 x15 X15 + x16r Ravencoin x17 xevan Bitsend yescrypt Globalboost-Y (BSTY) @@ -136,10 +138,13 @@ output from the miner showing the startup and any errors. Donations --------- -I do not do this for money but I have a donation address if users -are so inclined. +cpuminer-opt has no fees of any kind but donations are accepted. -bitcoin:12tdvfF7KmAsihBXQXynT6E6th2c2pByTT?label=donations +BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT +ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0 +LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8 +BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ +BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ Happy mining! diff --git a/README.txt b/README.txt index ac3a484..196557b 100644 --- a/README.txt +++ b/README.txt @@ -25,3 +25,12 @@ cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge cpuminer-avx2.exe "-march=core-avx2" Haswell... cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen +If you like this software feel free to donate: + +BTC: 12tdvfF7KmAsihBXQXynT6E6th2c2pByTT +ETH: 0x72122edabcae9d3f57eab0729305a425f6fef6d0 +LTC: LdUwoHJnux9r9EKqFWNvAi45kQompHk6e8 +BCH: 1QKYkB6atn4P7RFozyziAXLEnurwnUM1cQ +BTG: GVUyECtRHeC5D58z9F3nGGfVQndwnsPnHQ + + diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 6c588ab..2b9d197 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -98,8 +98,8 @@ Start mining. Windows -The following in how the Windows binary releases are built. It's old and -not very good but it works, for me anyway. +Precompiled Windows binaries are built on a Linux host using Mingw +with a more recent compiler than the following Windows hosted procedure. Building on Windows prerequisites: @@ -131,7 +131,7 @@ or similar Windows program. In msys shell cd to miner directory. cd /c/path/to/cpuminer-opt -Run winbuild.sh to build on Windows or execute the following commands. +Run build.sh to build on Windows or execute the following commands. 
./autogen.sh CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11 -fpermissive" ./configure --with-curl @@ -159,6 +159,16 @@ Support for even older x86_64 without AES_NI or SSE2 is not availble. Change Log ---------- +v3.8.1 + +Fixes x16r on CPUs with only SSE2. +More Optimizations for X algos, qubit & deep. +Corrected algo optimizations for scrypt and yescrypt, no new optimizations. + +v3.8.0.1 + +Fixed x16r AVX2 low hash rate. + v3.8.0 4way no longer a seperate feature, included in AVX2. diff --git a/algo/blake/blake-hash-4way.c b/algo/blake/blake-hash-4way.c index e7b424a..e63d007 100644 --- a/algo/blake/blake-hash-4way.c +++ b/algo/blake/blake-hash-4way.c @@ -553,22 +553,22 @@ do { \ , _mm_set_epi32( CS6, CS6, CS6, CS6 ) ); \ VF = _mm_xor_si128( _mm_set_epi32( T1, T1, T1, T1 ), \ _mm_set_epi32( CS7, CS7, CS7, CS7 ) ); \ - M[0x0] = mm_byteswap_32( *(buf + 0) ); \ - M[0x1] = mm_byteswap_32( *(buf + 1) ); \ - M[0x2] = mm_byteswap_32( *(buf + 2) ); \ - M[0x3] = mm_byteswap_32( *(buf + 3) ); \ - M[0x4] = mm_byteswap_32( *(buf + 4) ); \ - M[0x5] = mm_byteswap_32( *(buf + 5) ); \ - M[0x6] = mm_byteswap_32( *(buf + 6) ); \ - M[0x7] = mm_byteswap_32( *(buf + 7) ); \ - M[0x8] = mm_byteswap_32( *(buf + 8) ); \ - M[0x9] = mm_byteswap_32( *(buf + 9) ); \ - M[0xA] = mm_byteswap_32( *(buf + 10) ); \ - M[0xB] = mm_byteswap_32( *(buf + 11) ); \ - M[0xC] = mm_byteswap_32( *(buf + 12) ); \ - M[0xD] = mm_byteswap_32( *(buf + 13) ); \ - M[0xE] = mm_byteswap_32( *(buf + 14) ); \ - M[0xF] = mm_byteswap_32( *(buf + 15) ); \ + M[0x0] = mm_bswap_32( *(buf + 0) ); \ + M[0x1] = mm_bswap_32( *(buf + 1) ); \ + M[0x2] = mm_bswap_32( *(buf + 2) ); \ + M[0x3] = mm_bswap_32( *(buf + 3) ); \ + M[0x4] = mm_bswap_32( *(buf + 4) ); \ + M[0x5] = mm_bswap_32( *(buf + 5) ); \ + M[0x6] = mm_bswap_32( *(buf + 6) ); \ + M[0x7] = mm_bswap_32( *(buf + 7) ); \ + M[0x8] = mm_bswap_32( *(buf + 8) ); \ + M[0x9] = mm_bswap_32( *(buf + 9) ); \ + M[0xA] = mm_bswap_32( *(buf + 10) ); \ + M[0xB] = mm_bswap_32( *(buf + 11) ); \ + M[0xC] = mm_bswap_32( *(buf + 12) ); \ + M[0xD] = mm_bswap_32( *(buf + 13) ); \ + M[0xE] = mm_bswap_32( *(buf + 14) ); \ + M[0xF] = mm_bswap_32( *(buf + 15) ); \ for (r = 0; r < rounds; r ++) \ ROUND_S_4WAY(r); \ H0 = _mm_xor_si128( _mm_xor_si128( \ @@ -615,22 +615,22 @@ do { \ VD = _mm_xor_si128( _mm_set1_epi32( T0 ), _mm_set1_epi32( CS5 ) ); \ VE = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS6 ) ); \ VF = _mm_xor_si128( _mm_set1_epi32( T1 ), _mm_set1_epi32( CS7 ) ); \ - M0 = mm_byteswap_32( * buf ); \ - M1 = mm_byteswap_32( *(buf+1) ); \ - M2 = mm_byteswap_32( *(buf+2) ); \ - M3 = mm_byteswap_32( *(buf+3) ); \ - M4 = mm_byteswap_32( *(buf+4) ); \ - M5 = mm_byteswap_32( *(buf+5) ); \ - M6 = mm_byteswap_32( *(buf+6) ); \ - M7 = mm_byteswap_32( *(buf+7) ); \ - M8 = mm_byteswap_32( *(buf+8) ); \ - M9 = mm_byteswap_32( *(buf+9) ); \ - MA = mm_byteswap_32( *(buf+10) ); \ - MB = mm_byteswap_32( *(buf+11) ); \ - MC = mm_byteswap_32( *(buf+12) ); \ - MD = mm_byteswap_32( *(buf+13) ); \ - ME = mm_byteswap_32( *(buf+14) ); \ - MF = mm_byteswap_32( *(buf+15) ); \ + M0 = mm_bswap_32( * buf ); \ + M1 = mm_bswap_32( *(buf+1) ); \ + M2 = mm_bswap_32( *(buf+2) ); \ + M3 = mm_bswap_32( *(buf+3) ); \ + M4 = mm_bswap_32( *(buf+4) ); \ + M5 = mm_bswap_32( *(buf+5) ); \ + M6 = mm_bswap_32( *(buf+6) ); \ + M7 = mm_bswap_32( *(buf+7) ); \ + M8 = mm_bswap_32( *(buf+8) ); \ + M9 = mm_bswap_32( *(buf+9) ); \ + MA = mm_bswap_32( *(buf+10) ); \ + MB = mm_bswap_32( *(buf+11) ); \ + MC = mm_bswap_32( *(buf+12) ); \ + MD = 
mm_bswap_32( *(buf+13) ); \ + ME = mm_bswap_32( *(buf+14) ); \ + MF = mm_bswap_32( *(buf+15) ); \ ROUND_S_4WAY(0); \ ROUND_S_4WAY(1); \ ROUND_S_4WAY(2); \ @@ -727,22 +727,22 @@ do { \ VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \ VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \ VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \ - M0 = mm256_byteswap_32( * buf ); \ - M1 = mm256_byteswap_32( *(buf+1) ); \ - M2 = mm256_byteswap_32( *(buf+2) ); \ - M3 = mm256_byteswap_32( *(buf+3) ); \ - M4 = mm256_byteswap_32( *(buf+4) ); \ - M5 = mm256_byteswap_32( *(buf+5) ); \ - M6 = mm256_byteswap_32( *(buf+6) ); \ - M7 = mm256_byteswap_32( *(buf+7) ); \ - M8 = mm256_byteswap_32( *(buf+8) ); \ - M9 = mm256_byteswap_32( *(buf+9) ); \ - MA = mm256_byteswap_32( *(buf+10) ); \ - MB = mm256_byteswap_32( *(buf+11) ); \ - MC = mm256_byteswap_32( *(buf+12) ); \ - MD = mm256_byteswap_32( *(buf+13) ); \ - ME = mm256_byteswap_32( *(buf+14) ); \ - MF = mm256_byteswap_32( *(buf+15) ); \ + M0 = mm256_bswap_32( * buf ); \ + M1 = mm256_bswap_32( *(buf+1) ); \ + M2 = mm256_bswap_32( *(buf+2) ); \ + M3 = mm256_bswap_32( *(buf+3) ); \ + M4 = mm256_bswap_32( *(buf+4) ); \ + M5 = mm256_bswap_32( *(buf+5) ); \ + M6 = mm256_bswap_32( *(buf+6) ); \ + M7 = mm256_bswap_32( *(buf+7) ); \ + M8 = mm256_bswap_32( *(buf+8) ); \ + M9 = mm256_bswap_32( *(buf+9) ); \ + MA = mm256_bswap_32( *(buf+10) ); \ + MB = mm256_bswap_32( *(buf+11) ); \ + MC = mm256_bswap_32( *(buf+12) ); \ + MD = mm256_bswap_32( *(buf+13) ); \ + ME = mm256_bswap_32( *(buf+14) ); \ + MF = mm256_bswap_32( *(buf+15) ); \ ROUND_S_8WAY(0); \ ROUND_S_8WAY(1); \ ROUND_S_8WAY(2); \ @@ -848,22 +848,22 @@ do { \ _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ - M[0x0] = mm256_byteswap_64( *(buf+0) ); \ - M[0x1] = mm256_byteswap_64( *(buf+1) ); \ - M[0x2] = mm256_byteswap_64( *(buf+2) ); \ - M[0x3] = mm256_byteswap_64( *(buf+3) ); \ - M[0x4] = mm256_byteswap_64( *(buf+4) ); \ - M[0x5] = mm256_byteswap_64( *(buf+5) ); \ - M[0x6] = mm256_byteswap_64( *(buf+6) ); \ - M[0x7] = mm256_byteswap_64( *(buf+7) ); \ - M[0x8] = mm256_byteswap_64( *(buf+8) ); \ - M[0x9] = mm256_byteswap_64( *(buf+9) ); \ - M[0xA] = mm256_byteswap_64( *(buf+10) ); \ - M[0xB] = mm256_byteswap_64( *(buf+11) ); \ - M[0xC] = mm256_byteswap_64( *(buf+12) ); \ - M[0xD] = mm256_byteswap_64( *(buf+13) ); \ - M[0xE] = mm256_byteswap_64( *(buf+14) ); \ - M[0xF] = mm256_byteswap_64( *(buf+15) ); \ + M[0x0] = mm256_bswap_64( *(buf+0) ); \ + M[0x1] = mm256_bswap_64( *(buf+1) ); \ + M[0x2] = mm256_bswap_64( *(buf+2) ); \ + M[0x3] = mm256_bswap_64( *(buf+3) ); \ + M[0x4] = mm256_bswap_64( *(buf+4) ); \ + M[0x5] = mm256_bswap_64( *(buf+5) ); \ + M[0x6] = mm256_bswap_64( *(buf+6) ); \ + M[0x7] = mm256_bswap_64( *(buf+7) ); \ + M[0x8] = mm256_bswap_64( *(buf+8) ); \ + M[0x9] = mm256_bswap_64( *(buf+9) ); \ + M[0xA] = mm256_bswap_64( *(buf+10) ); \ + M[0xB] = mm256_bswap_64( *(buf+11) ); \ + M[0xC] = mm256_bswap_64( *(buf+12) ); \ + M[0xD] = mm256_bswap_64( *(buf+13) ); \ + M[0xE] = mm256_bswap_64( *(buf+14) ); \ + M[0xF] = mm256_bswap_64( *(buf+15) ); \ for (r = 0; r < 16; r ++) \ ROUND_B_4WAY(r); \ H0 = _mm256_xor_si256( _mm256_xor_si256( \ @@ -913,22 +913,22 @@ do { \ _mm256_set_epi64x( CB6, CB6, CB6, CB6 ) ); \ VF = _mm256_xor_si256( _mm256_set_epi64x( T1, T1, T1, T1 ), \ _mm256_set_epi64x( CB7, CB7, CB7, CB7 ) ); \ - M0 = 
mm256_byteswap_64( *(buf + 0) ); \ - M1 = mm256_byteswap_64( *(buf + 1) ); \ - M2 = mm256_byteswap_64( *(buf + 2) ); \ - M3 = mm256_byteswap_64( *(buf + 3) ); \ - M4 = mm256_byteswap_64( *(buf + 4) ); \ - M5 = mm256_byteswap_64( *(buf + 5) ); \ - M6 = mm256_byteswap_64( *(buf + 6) ); \ - M7 = mm256_byteswap_64( *(buf + 7) ); \ - M8 = mm256_byteswap_64( *(buf + 8) ); \ - M9 = mm256_byteswap_64( *(buf + 9) ); \ - MA = mm256_byteswap_64( *(buf + 10) ); \ - MB = mm256_byteswap_64( *(buf + 11) ); \ - MC = mm256_byteswap_64( *(buf + 12) ); \ - MD = mm256_byteswap_64( *(buf + 13) ); \ - ME = mm256_byteswap_64( *(buf + 14) ); \ - MF = mm256_byteswap_64( *(buf + 15) ); \ + M0 = mm256_bswap_64( *(buf + 0) ); \ + M1 = mm256_bswap_64( *(buf + 1) ); \ + M2 = mm256_bswap_64( *(buf + 2) ); \ + M3 = mm256_bswap_64( *(buf + 3) ); \ + M4 = mm256_bswap_64( *(buf + 4) ); \ + M5 = mm256_bswap_64( *(buf + 5) ); \ + M6 = mm256_bswap_64( *(buf + 6) ); \ + M7 = mm256_bswap_64( *(buf + 7) ); \ + M8 = mm256_bswap_64( *(buf + 8) ); \ + M9 = mm256_bswap_64( *(buf + 9) ); \ + MA = mm256_bswap_64( *(buf + 10) ); \ + MB = mm256_bswap_64( *(buf + 11) ); \ + MC = mm256_bswap_64( *(buf + 12) ); \ + MD = mm256_bswap_64( *(buf + 13) ); \ + ME = mm256_bswap_64( *(buf + 14) ); \ + MF = mm256_bswap_64( *(buf + 15) ); \ ROUND_B_4WAY(0); \ ROUND_B_4WAY(1); \ ROUND_B_4WAY(2); \ @@ -1064,8 +1064,8 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, if (out_size_w32 == 8) u.buf[52>>2] = _mm_or_si128( u.buf[52>>2], _mm_set1_epi32( 0x01000000UL ) ); - *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); - *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); + *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) ); blake32_4way( sc, u.buf + (ptr>>2), 64 - ptr ); } else @@ -1077,13 +1077,13 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, memset_zero_128( u.buf, 56>>2 ); if (out_size_w32 == 8) u.buf[52>>2] = _mm_set1_epi32( 0x01000000UL ); - *(u.buf+(56>>2)) = mm_byteswap_32( _mm_set1_epi32( th ) ); - *(u.buf+(60>>2)) = mm_byteswap_32( _mm_set1_epi32( tl ) ); + *(u.buf+(56>>2)) = mm_bswap_32( _mm_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm_bswap_32( _mm_set1_epi32( tl ) ); blake32_4way( sc, u.buf, 64 ); } out = (__m128i*)dst; for ( k = 0; k < out_size_w32; k++ ) - out[k] = mm_byteswap_32( sc->H[k] ); + out[k] = mm_bswap_32( sc->H[k] ); } #if defined (__AVX2__) @@ -1187,8 +1187,8 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, if (out_size_w32 == 8) u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2], _mm256_set1_epi32( 0x01000000UL ) ); - *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); - *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); + *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr ); } else @@ -1200,13 +1200,13 @@ blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, memset_zero_256( u.buf, 56>>2 ); if (out_size_w32 == 8) u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL ); - *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); - *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); + *(u.buf+(56>>2)) = mm256_bswap_32( _mm256_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm256_bswap_32( _mm256_set1_epi32( tl ) ); blake32_8way( sc, u.buf, 64 ); } out = (__m256i*)dst; for ( k = 0; k < out_size_w32; k++ 
) - out[k] = mm256_byteswap_32( sc->H[k] ); + out[k] = mm256_bswap_32( sc->H[k] ); } // Blake-512 4 way @@ -1311,9 +1311,9 @@ blake64_4way_close( blake_4way_big_context *sc, if ( out_size_w64 == 8 ) u.buf[(104>>3)] = _mm256_or_si256( u.buf[(104>>3)], _mm256_set1_epi64x( 0x0100000000000000ULL ) ); - *(u.buf+(112>>3)) = mm256_byteswap_64( + *(u.buf+(112>>3)) = mm256_bswap_64( _mm256_set_epi64x( th, th, th, th ) ); - *(u.buf+(120>>3)) = mm256_byteswap_64( + *(u.buf+(120>>3)) = mm256_bswap_64( _mm256_set_epi64x( tl, tl, tl, tl ) ); blake64_4way( sc, u.buf + (ptr>>3), 128 - ptr ); @@ -1328,16 +1328,16 @@ blake64_4way_close( blake_4way_big_context *sc, memset_zero_256( u.buf, 112>>3 ); if ( out_size_w64 == 8 ) u.buf[104>>3] = _mm256_set1_epi64x( 0x0100000000000000ULL ); - *(u.buf+(112>>3)) = mm256_byteswap_64( + *(u.buf+(112>>3)) = mm256_bswap_64( _mm256_set_epi64x( th, th, th, th ) ); - *(u.buf+(120>>3)) = mm256_byteswap_64( + *(u.buf+(120>>3)) = mm256_bswap_64( _mm256_set_epi64x( tl, tl, tl, tl ) ); blake64_4way( sc, u.buf, 128 ); } out = (__m256i*)dst; for ( k = 0; k < out_size_w64; k++ ) - out[k] = mm256_byteswap_64( sc->H[k] ); + out[k] = mm256_bswap_64( sc->H[k] ); } #endif diff --git a/algo/bmw/bmw-hash-4way.c b/algo/bmw/bmw-hash-4way.c index 39da2ce..4276aa9 100644 --- a/algo/bmw/bmw-hash-4way.c +++ b/algo/bmw/bmw-hash-4way.c @@ -984,7 +984,7 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, } memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); - buf[ (buf_size - 4) >> 2 ] = mm_zero; + buf[ (buf_size - 4) >> 2 ] = m128_zero; compress_small( buf, h, h2 ); for ( u = 0; u < 16; u ++ ) diff --git a/algo/cubehash/sse2/cubehash_sse2.c b/algo/cubehash/sse2/cubehash_sse2.c index ab36bff..9a9357c 100644 --- a/algo/cubehash/sse2/cubehash_sse2.c +++ b/algo/cubehash/sse2/cubehash_sse2.c @@ -129,7 +129,7 @@ static void transform( cubehashParam *sp ) #endif } // transform -// Ccubehash context initializing is very expensive. +// Cubehash context initializing is very expensive. // Cache the intial value for faster reinitializing. cubehashParam cube_ctx_cache __attribute__ ((aligned (64))); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c index 412fcf3..d5eaa69 100644 --- a/algo/hamsi/hamsi-hash-4way.c +++ b/algo/hamsi/hamsi-hash-4way.c @@ -33,9 +33,10 @@ #include #include +//#include "miner.h" #include "hamsi-hash-4way.h" -#if defined(__AVX__) +#if defined(__AVX2__) #ifdef __cplusplus extern "C"{ @@ -94,28 +95,11 @@ extern "C"{ * thus avoiding any data-dependent table access pattern. 
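// Editor's note on the mm_byteswap_* -> mm_bswap_* renames in the blake/bmw
// hunks above: the helpers themselves live in avxdefs.h, which this patch
// reworks but whose contents are not shown in these hunks. The sketch below
// is only an illustration of what a per-lane 32-bit byte swap of this kind
// typically looks like (SSSE3/AVX2 pshufb); the real definitions in
// avxdefs.h may differ. Names with a _sketch suffix are hypothetical.
#include <immintrin.h>

static inline __m128i mm_bswap_32_sketch( __m128i x )
{
   // Reverse the four bytes inside each 32-bit lane.
   return _mm_shuffle_epi8( x, _mm_set_epi8( 12,13,14,15,  8, 9,10,11,
                                              4, 5, 6, 7,  0, 1, 2, 3 ) );
}

#if defined(__AVX2__)
static inline __m256i mm256_bswap_32_sketch( __m256i x )
{
   // Same byte reversal, applied independently to each 128-bit half.
   return _mm256_shuffle_epi8( x, _mm256_set_epi8(
                    12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3,
                    12,13,14,15,  8, 9,10,11,  4, 5, 6, 7,  0, 1, 2, 3 ) );
}
#endif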
*/ -// Hard coded -//#define SPH_HAMSI_EXPAND_BIG 1 - -/* -#if !defined SPH_HAMSI_EXPAND_SMALL -#if SPH_SMALL_FOOTPRINT_HAMSI -#define SPH_HAMSI_EXPAND_SMALL 4 -#else -#define SPH_HAMSI_EXPAND_SMALL 8 -#endif -#endif - -#if !defined SPH_HAMSI_EXPAND_BIG -#define SPH_HAMSI_EXPAND_BIG 8 -#endif -*/ - #ifdef _MSC_VER #pragma warning (disable: 4146) #endif -#include "hamsi-helper-4way.c" +//#include "hamsi-helper-4way.c" static const sph_u32 IV512[] = { SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), @@ -154,235 +138,694 @@ static const sph_u32 alpha_f[] = { SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) }; -/* -#define s0 m0 -#define s1 m1 -#define s2 c0 -#define s3 c1 -#define s4 c2 -#define s5 c3 -#define s6 m2 -#define s7 m3 -#define s8 m4 -#define s9 m5 -#define sA c4 -#define sB c5 -#define sC c6 -#define sD c7 -#define sE m6 -#define sF m7 -*/ +// imported from hamsi helper + +/* Note: this table lists bits within each byte from least + siginificant to most significant. */ +static const sph_u32 T512[64][16] = { + { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), + SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), + SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), + SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), + SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), + SPH_C32(0x9e69af68) }, + { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), + SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), + SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), + SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), + SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), + SPH_C32(0x0c26f262) }, + { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), + SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), + SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), + SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), + SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), + SPH_C32(0xdc24e61f) }, + { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), + SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), + SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), + SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), + SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), + SPH_C32(0x3daac2da) }, + { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), + SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), + SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), + SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), + SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), + SPH_C32(0x78cace29) }, + { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), + SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), + SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), + SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), + SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), + SPH_C32(0x2dd1f9ab) }, + { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), + SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), + SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), + SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), + SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), + SPH_C32(0xbf2c0be2) }, + { SPH_C32(0x466d0c00), SPH_C32(0x08620000), 
SPH_C32(0xdd5d0000), + SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), + SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), + SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), + SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), + SPH_C32(0x32219526) }, + { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), + SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), + SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), + SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), + SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), + SPH_C32(0xac8e6c88) }, + { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), + SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), + SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), + SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), + SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), + SPH_C32(0x7b1bd6b9) }, + { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), + SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), + SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), + SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), + SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), + SPH_C32(0xf746c320) }, + { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), + SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), + SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), + SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), + SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), + SPH_C32(0x69505b3a) }, + { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), + SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), + SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), + SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), + SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), + SPH_C32(0x8a341574) }, + { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), + SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), + SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), + SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), + SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), + SPH_C32(0x450360bf) }, + { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), + SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), + SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), + SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), + SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), + SPH_C32(0xf3d45758) }, + { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), + SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), + SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), + SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), + SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), + SPH_C32(0x925c44e9) }, + { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), + SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), + SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), + SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), + SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), + SPH_C32(0xa123ff9f) }, + { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), + 
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), + SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), + SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), + SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), + SPH_C32(0x1568ff0f) }, + { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), + SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), + SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), + SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), + SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), + SPH_C32(0xc5c1eb3e) }, + { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), + SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), + SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), + SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), + SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), + SPH_C32(0x1af21fe1) }, + { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), + SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), + SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), + SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), + SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), + SPH_C32(0x857f3c2b) }, + { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), + SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), + SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), + SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), + SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), + SPH_C32(0x2ba05a55) }, + { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), + SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), + SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), + SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), + SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), + SPH_C32(0xfeabf254) }, + { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), + SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), + SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), + SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), + SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), + SPH_C32(0xfe1cdc7f) }, + { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), + SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), + SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), + SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), + SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), + SPH_C32(0xb0a51834) }, + { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), + SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), + SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), + SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), + SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), + SPH_C32(0xa6b8c28d) }, + { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), + SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), + SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), + SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), + SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), + SPH_C32(0x3a4e99d7) }, + { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), + SPH_C32(0xaa4e0000), 
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), + SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), + SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), + SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), + SPH_C32(0xe1844257) }, + { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), + SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), + SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), + SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), + SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), + SPH_C32(0x2c3b504e) }, + { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), + SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), + SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), + SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), + SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), + SPH_C32(0x524a0d59) }, + { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), + SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), + SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), + SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), + SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), + SPH_C32(0x378dd173) }, + { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), + SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), + SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), + SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), + SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), + SPH_C32(0x8b6c72bd) }, + { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), + SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), + SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), + SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), + SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), + SPH_C32(0x8e67b7fa) }, + { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), + SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), + SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), + SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), + SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), + SPH_C32(0x443d3004) }, + { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), + SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), + SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), + SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), + SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), + SPH_C32(0xf4f6ea7b) }, + { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), + SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), + SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), + SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), + SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), + SPH_C32(0x979961d0) }, + { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), + SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), + SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), + SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), + SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), + SPH_C32(0x98aa496e) }, + { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), + SPH_C32(0x77240000), SPH_C32(0xec47079e), 
SPH_C32(0xf4a0694e), + SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), + SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), + SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), + SPH_C32(0x094e3198) }, + { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), + SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), + SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), + SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), + SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), + SPH_C32(0xe86cba2e) }, + { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), + SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), + SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), + SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), + SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), + SPH_C32(0x4b7eec55) }, + { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), + SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), + SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), + SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), + SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), + SPH_C32(0x1e7536a6) }, + { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), + SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), + SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), + SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), + SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), + SPH_C32(0x24314f17) }, + { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), + SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), + SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), + SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), + SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), + SPH_C32(0x9075b1ce) }, + { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), + SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), + SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), + SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), + SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), + SPH_C32(0x9b6ef888) }, + { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), + SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), + SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), + SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), + SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), + SPH_C32(0xd8b61463) }, + { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), + SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), + SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), + SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), + SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), + SPH_C32(0x3ea660f7) }, + { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), + SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), + SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), + SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), + SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), + SPH_C32(0x7f975691) }, + { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), + SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), + 
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), + SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), + SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), + SPH_C32(0x2c94459e) }, + { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), + SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), + SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), + SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), + SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), + SPH_C32(0x56a7b19f) }, + { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), + SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), + SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), + SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), + SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), + SPH_C32(0x81fdf908) }, + { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), + SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), + SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), + SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), + SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), + SPH_C32(0x5bd61539) }, + { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), + SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), + SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), + SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), + SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), + SPH_C32(0x15b961e7) }, + { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), + SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), + SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), + SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), + SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), + SPH_C32(0x2a2c18f0) }, + { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), + SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), + SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), + SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), + SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), + SPH_C32(0x551e3d6e) }, + { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), + SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), + SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), + SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), + SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), + SPH_C32(0x33c5244f) }, + { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), + SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), + SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), + SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), + SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), + SPH_C32(0x8a58e6a4) }, + { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), + SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), + SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), + SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), + SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), + SPH_C32(0xda878000) }, + { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), + SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), + SPH_C32(0xbb668808), 
SPH_C32(0xda878000), SPH_C32(0xabe70000), + SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), + SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), + SPH_C32(0x3c5dfffe) }, + { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), + SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), + SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), + SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), + SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), + SPH_C32(0x7b1675d7) }, + { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), + SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), + SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), + SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), + SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), + SPH_C32(0x2879ebac) }, + { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), + SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), + SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), + SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), + SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), + SPH_C32(0xbe0a679e) }, + { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), + SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), + SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), + SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), + SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), + SPH_C32(0x30aebcf7) }, + { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), + SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), + SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), + SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), + SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), + SPH_C32(0xc7ff60f0) }, + { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), + SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), + SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), + SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), + SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), + SPH_C32(0xe7e00a94) } +}; + +#define INPUT_BIG \ +do { \ + __m256i db = *buf; \ + const sph_u32 *tp = &T512[0][0]; \ + m0 = m256_zero; \ + m1 = m256_zero; \ + m2 = m256_zero; \ + m3 = m256_zero; \ + m4 = m256_zero; \ + m5 = m256_zero; \ + m6 = m256_zero; \ + m7 = m256_zero; \ + for ( int u = 0; u < 64; u++ ) \ + { \ + __m256i dm = _mm256_and_si256( db, m256_one_64 ) ; \ + dm = mm256_negate_32( _mm256_or_si256( dm, \ + _mm256_slli_epi64( dm, 32 ) ) ); \ + m0 = _mm256_xor_si256( m0, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x1], tp[0x0], tp[0x1], tp[0x0], \ + tp[0x1], tp[0x0], tp[0x1], tp[0x0] ) ) ); \ + m1 = _mm256_xor_si256( m1, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x3], tp[0x2], tp[0x3], tp[0x2], \ + tp[0x3], tp[0x2], tp[0x3], tp[0x2] ) ) ); \ + m2 = _mm256_xor_si256( m2, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x5], tp[0x4], tp[0x5], tp[0x4], \ + tp[0x5], tp[0x4], tp[0x5], tp[0x4] ) ) ); \ + m3 = _mm256_xor_si256( m3, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x7], tp[0x6], tp[0x7], tp[0x6], \ + tp[0x7], tp[0x6], tp[0x7], tp[0x6] ) ) ); \ + m4 = _mm256_xor_si256( m4, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0x9], tp[0x8], tp[0x9], tp[0x8], \ + tp[0x9], tp[0x8], tp[0x9], tp[0x8] ) ) ); \ + m5 = _mm256_xor_si256( 
m5, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0xB], tp[0xA], tp[0xB], tp[0xA], \ + tp[0xB], tp[0xA], tp[0xB], tp[0xA] ) ) ); \ + m6 = _mm256_xor_si256( m6, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0xD], tp[0xC], tp[0xD], tp[0xC], \ + tp[0xD], tp[0xC], tp[0xD], tp[0xC] ) ) ); \ + m7 = _mm256_xor_si256( m7, _mm256_and_si256( dm, \ + _mm256_set_epi32( tp[0xF], tp[0xE], tp[0xF], tp[0xE], \ + tp[0xF], tp[0xE], tp[0xF], tp[0xE] ) ) ); \ + tp += 0x10; \ + db = _mm256_srli_epi64( db, 1 ); \ + } \ +} while (0) #define SBOX( a, b, c, d ) \ do { \ - __m128i t; \ + __m256i t; \ t = a; \ - a = _mm_xor_si128( d, _mm_and_si128( a, c ) ); \ - c = _mm_xor_si128( a, _mm_xor_si128( c, b ) ); \ - d = _mm_xor_si128( b, _mm_or_si128( d, t ) ); \ - t = _mm_xor_si128( t, c ); \ + a = _mm256_and_si256( a, c ); \ + a = _mm256_xor_si256( a, d ); \ + c = _mm256_xor_si256( c, b ); \ + c = _mm256_xor_si256( c, a ); \ + d = _mm256_or_si256( d, t ); \ + d = _mm256_xor_si256( d, b ); \ + t = _mm256_xor_si256( t, c ); \ b = d; \ - d = _mm_xor_si128( a, _mm_or_si128( d, t ) ); \ - a = _mm_and_si128( a, b ); \ - t = _mm_xor_si128( t, a ); \ - b = _mm_xor_si128( t, _mm_xor_si128( b, d ) ); \ + d = _mm256_or_si256( d, t ); \ + d = _mm256_xor_si256( d, a ); \ + a = _mm256_and_si256( a, b ); \ + t = _mm256_xor_si256( t, a ); \ + b = _mm256_xor_si256( b, d ); \ + b = _mm256_xor_si256( b, t ); \ a = c; \ c = b; \ b = d; \ - d = mm_not( t ); \ + d = mm256_not( t ); \ } while (0) #define L( a, b, c, d ) \ do { \ - a = mm_rotl_32( a, 13 ); \ - c = mm_rotl_32( c, 3 ); \ - b = _mm_xor_si128( b, _mm_xor_si128( a, c ) ); \ - d = _mm_xor_si128( d, _mm_xor_si128( c, _mm_slli_epi32( a, 3 ) ) ); \ - b = mm_rotl_32( b, 1 ); \ - d = mm_rotl_32( d, 7 ); \ - a = _mm_xor_si128( a, _mm_xor_si128( b, d ) ); \ - c = _mm_xor_si128( c, _mm_xor_si128( d, _mm_slli_epi32( b, 7 ) ) ); \ - a = mm_rotl_32( a, 5 ); \ - c = mm_rotl_32( c, 22 ); \ + a = mm256_rotl_32( a, 13 ); \ + c = mm256_rotl_32( c, 3 ); \ + b = _mm256_xor_si256( b, _mm256_xor_si256( a, c ) ); \ + d = _mm256_xor_si256( d, _mm256_xor_si256( c, \ + _mm256_slli_epi32( a, 3 ) ) ); \ + b = mm256_rotl_32( b, 1 ); \ + d = mm256_rotl_32( d, 7 ); \ + a = _mm256_xor_si256( a, _mm256_xor_si256( b, d ) ); \ + c = _mm256_xor_si256( c, _mm256_xor_si256( d, \ + _mm256_slli_epi32( b, 7 ) ) ); \ + a = mm256_rotl_32( a, 5 ); \ + c = mm256_rotl_32( c, 22 ); \ } while (0) #define DECL_STATE_BIG \ - __m128i c0, c1, c2, c3, c4, c5, c6, c7; \ - __m128i c8, c9, cA, cB, cC, cD, cE, cF; + __m256i c0, c1, c2, c3, c4, c5, c6, c7; \ -#define READ_STATE_BIG(sc) do { \ - c0 = sc->h[0x0]; \ - c1 = sc->h[0x1]; \ - c2 = sc->h[0x2]; \ - c3 = sc->h[0x3]; \ - c4 = sc->h[0x4]; \ - c5 = sc->h[0x5]; \ - c6 = sc->h[0x6]; \ - c7 = sc->h[0x7]; \ - c8 = sc->h[0x8]; \ - c9 = sc->h[0x9]; \ - cA = sc->h[0xA]; \ - cB = sc->h[0xB]; \ - cC = sc->h[0xC]; \ - cD = sc->h[0xD]; \ - cE = sc->h[0xE]; \ - cF = sc->h[0xF]; \ - } while (0) +#define READ_STATE_BIG(sc) \ +do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ +} while (0) -#define WRITE_STATE_BIG(sc) do { \ - sc->h[0x0] = c0; \ - sc->h[0x1] = c1; \ - sc->h[0x2] = c2; \ - sc->h[0x3] = c3; \ - sc->h[0x4] = c4; \ - sc->h[0x5] = c5; \ - sc->h[0x6] = c6; \ - sc->h[0x7] = c7; \ - sc->h[0x8] = c8; \ - sc->h[0x9] = c9; \ - sc->h[0xA] = cA; \ - sc->h[0xB] = cB; \ - sc->h[0xC] = cC; \ - sc->h[0xD] = cD; \ - sc->h[0xE] = cE; \ - sc->h[0xF] = cF; \ - } while (0) 
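// Editor's note: INPUT_BIG, SBOX and L above are the 4-way vector forms of
// the standard Hamsi-512 steps. For readers following the vector code, a
// scalar per-lane sketch of the same operations (after sph_hamsi) is given
// below. It is illustrative only: the function names are hypothetical, it
// assumes a little-endian 8-byte message block, and it reuses the T512
// table defined earlier in this file.
#include <stdint.h>

#define ROTL32(x,n)  ( ((x) << (n)) | ((x) >> (32 - (n))) )

// INPUT: each of the 64 message bits selects one 16-word row of T512;
// the rows whose bit is set are XORed together into the 16 expanded words.
static void hamsi_input_sketch( uint32_t m[16], const unsigned char msg[8] )
{
   for ( int i = 0; i < 16; i++ ) m[i] = 0;
   for ( int u = 0; u < 64; u++ )
      if ( ( msg[ u >> 3 ] >> ( u & 7 ) ) & 1 )
         for ( int i = 0; i < 16; i++ ) m[i] ^= T512[u][i];
}

// SBOX: the Serpent-derived s-box applied bit-sliced across one column.
static void hamsi_sbox_sketch( uint32_t *a, uint32_t *b, uint32_t *c,
                               uint32_t *d )
{
   uint32_t t = *a;
   *a = ( *a & *c ) ^ *d;
   *c = ( *c ^ *b ) ^ *a;
   *d = ( *d | t  ) ^ *b;
   t ^= *c;
   *b = *d;
   *d = ( *d | t  ) ^ *a;
   *a &= *b;
   t ^= *a;
   *b = ( *b ^ *d ) ^ t;
   *a = *c;
   *c = *b;
   *b = *d;
   *d = ~t;
}

// L: the linear diffusion step, matching the rotate/shift pattern above.
static void hamsi_l_sketch( uint32_t *a, uint32_t *b, uint32_t *c,
                            uint32_t *d )
{
   *a = ROTL32( *a, 13 );
   *c = ROTL32( *c,  3 );
   *b ^= *a ^ *c;
   *d ^= *c ^ ( *a << 3 );
   *b = ROTL32( *b, 1 );
   *d = ROTL32( *d, 7 );
   *a ^= *b ^ *d;
   *c ^= *d ^ ( *b << 7 );
   *a = ROTL32( *a, 5 );
   *c = ROTL32( *c, 22 );
}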
+#define WRITE_STATE_BIG(sc) \ +do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ +} while (0) -#define s00 m0 -#define s01 m1 -#define s02 c0 -#define s03 c1 -#define s04 m2 -#define s05 m3 -#define s06 c2 -#define s07 c3 -#define s08 c4 -#define s09 c5 -#define s0A m4 -#define s0B m5 -#define s0C c6 -#define s0D c7 -#define s0E m6 -#define s0F m7 -#define s10 m8 -#define s11 m9 -#define s12 c8 -#define s13 c9 -#define s14 mA -#define s15 mB -#define s16 cA -#define s17 cB -#define s18 cC -#define s19 cD -#define s1A mC -#define s1B mD -#define s1C cE -#define s1D cF -#define s1E mE -#define s1F mF +#define s0 m0 +#define s1 c0 +#define s2 m1 +#define s3 c1 +#define s4 c2 +#define s5 m2 +#define s6 c3 +#define s7 m3 +#define s8 m4 +#define s9 c4 +#define sA m5 +#define sB c5 +#define sC c6 +#define sD m6 +#define sE c7 +#define sF m7 #define ROUND_BIG(rc, alpha) \ do { \ - s00 = _mm_xor_si128( s00, _mm_set1_epi32( alpha[ 0x00 ] ) ); \ - s01 = _mm_xor_si128( s01, _mm_xor_si128( _mm_set1_epi32( alpha[ 0x01 ] ), \ - _mm_set1_epi32( rc ) ) ); \ - s02 = _mm_xor_si128( s02, _mm_set1_epi32( alpha[ 0x02 ] ) ); \ - s03 = _mm_xor_si128( s03, _mm_set1_epi32( alpha[ 0x03 ] ) ); \ - s04 = _mm_xor_si128( s04, _mm_set1_epi32( alpha[ 0x04 ] ) ); \ - s05 = _mm_xor_si128( s05, _mm_set1_epi32( alpha[ 0x05 ] ) ); \ - s06 = _mm_xor_si128( s06, _mm_set1_epi32( alpha[ 0x06 ] ) ); \ - s07 = _mm_xor_si128( s07, _mm_set1_epi32( alpha[ 0x07 ] ) ); \ - s08 = _mm_xor_si128( s08, _mm_set1_epi32( alpha[ 0x08 ] ) ); \ - s09 = _mm_xor_si128( s09, _mm_set1_epi32( alpha[ 0x09 ] ) ); \ - s0A = _mm_xor_si128( s0A, _mm_set1_epi32( alpha[ 0x0A ] ) ); \ - s0B = _mm_xor_si128( s0B, _mm_set1_epi32( alpha[ 0x0B ] ) ); \ - s0C = _mm_xor_si128( s0C, _mm_set1_epi32( alpha[ 0x0C ] ) ); \ - s0D = _mm_xor_si128( s0D, _mm_set1_epi32( alpha[ 0x0D ] ) ); \ - s0E = _mm_xor_si128( s0E, _mm_set1_epi32( alpha[ 0x0E ] ) ); \ - s0F = _mm_xor_si128( s0F, _mm_set1_epi32( alpha[ 0x0F ] ) ); \ - s10 = _mm_xor_si128( s10, _mm_set1_epi32( alpha[ 0x10 ] ) ); \ - s11 = _mm_xor_si128( s11, _mm_set1_epi32( alpha[ 0x11 ] ) ); \ - s12 = _mm_xor_si128( s12, _mm_set1_epi32( alpha[ 0x12 ] ) ); \ - s13 = _mm_xor_si128( s13, _mm_set1_epi32( alpha[ 0x13 ] ) ); \ - s14 = _mm_xor_si128( s14, _mm_set1_epi32( alpha[ 0x14 ] ) ); \ - s15 = _mm_xor_si128( s15, _mm_set1_epi32( alpha[ 0x15 ] ) ); \ - s16 = _mm_xor_si128( s16, _mm_set1_epi32( alpha[ 0x16 ] ) ); \ - s17 = _mm_xor_si128( s17, _mm_set1_epi32( alpha[ 0x17 ] ) ); \ - s18 = _mm_xor_si128( s18, _mm_set1_epi32( alpha[ 0x18 ] ) ); \ - s19 = _mm_xor_si128( s19, _mm_set1_epi32( alpha[ 0x19 ] ) ); \ - s1A = _mm_xor_si128( s1A, _mm_set1_epi32( alpha[ 0x1A ] ) ); \ - s1B = _mm_xor_si128( s1B, _mm_set1_epi32( alpha[ 0x1B ] ) ); \ - s1C = _mm_xor_si128( s1C, _mm_set1_epi32( alpha[ 0x1C ] ) ); \ - s1D = _mm_xor_si128( s1D, _mm_set1_epi32( alpha[ 0x1D ] ) ); \ - s1E = _mm_xor_si128( s1E, _mm_set1_epi32( alpha[ 0x1E ] ) ); \ - s1F = _mm_xor_si128( s1F, _mm_set1_epi32( alpha[ 0x1F ] ) ); \ - SBOX( s00, s08, s10, s18); \ - SBOX( s01, s09, s11, s19); \ - SBOX( s02, s0A, s12, s1A); \ - SBOX( s03, s0B, s13, s1B); \ - SBOX( s04, s0C, s14, s1C); \ - SBOX( s05, s0D, s15, s1D); \ - SBOX( s06, s0E, s16, s1E); \ - SBOX( s07, s0F, s17, s1F); \ - L( s00, s09, s12, s1B ); \ - L( s01, s0A, s13, s1C ); \ - L( s02, s0B, s14, s1D ); \ - L( s03, s0C, s15, s1E ); \ - L( s04, s0D, s16, s1F ); \ - L( s05, s0E, s17, s18 ); \ 
- L( s06, s0F, s10, s19 ); \ - L( s07, s08, s11, s1A ); \ - L( s00, s02, s05, s07 ); \ - L( s10, s13, s15, s16 ); \ - L( s09, s0B, s0C, s0E ); \ - L( s19, s1A, s1C, s1F ); \ + __m256i t0, t1, t2, t3; \ + s0 = _mm256_xor_si256( s0, _mm256_set_epi32( \ + alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00], \ + alpha[0x01] ^ (rc), alpha[0x00], alpha[0x01] ^ (rc), alpha[0x00] ) ); \ + s1 = _mm256_xor_si256( s1, _mm256_set_epi32( \ + alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02], \ + alpha[0x03], alpha[0x02], alpha[0x03], alpha[0x02] ) ); \ + s2 = _mm256_xor_si256( s2, _mm256_set_epi32( \ + alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04], \ + alpha[0x05], alpha[0x04], alpha[0x05], alpha[0x04] ) ); \ + s3 = _mm256_xor_si256( s3, _mm256_set_epi32( \ + alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06], \ + alpha[0x07], alpha[0x06], alpha[0x07], alpha[0x06] ) ); \ + s4 = _mm256_xor_si256( s4, _mm256_set_epi32( \ + alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08], \ + alpha[0x09], alpha[0x08], alpha[0x09], alpha[0x08] ) ); \ + s5 = _mm256_xor_si256( s5, _mm256_set_epi32( \ + alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A], \ + alpha[0x0B], alpha[0x0A], alpha[0x0B], alpha[0x0A] ) ); \ + s6 = _mm256_xor_si256( s6, _mm256_set_epi32( \ + alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C], \ + alpha[0x0D], alpha[0x0C], alpha[0x0D], alpha[0x0C] ) ); \ + s7 = _mm256_xor_si256( s7, _mm256_set_epi32( \ + alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E], \ + alpha[0x0F], alpha[0x0E], alpha[0x0F], alpha[0x0E] ) ); \ + s8 = _mm256_xor_si256( s8, _mm256_set_epi32( \ + alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10], \ + alpha[0x11], alpha[0x10], alpha[0x11], alpha[0x10] ) ); \ + s9 = _mm256_xor_si256( s9, _mm256_set_epi32( \ + alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12], \ + alpha[0x13], alpha[0x12], alpha[0x13], alpha[0x12] ) ); \ + sA = _mm256_xor_si256( sA, _mm256_set_epi32( \ + alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14], \ + alpha[0x15], alpha[0x14], alpha[0x15], alpha[0x14] ) ); \ + sB = _mm256_xor_si256( sB, _mm256_set_epi32( \ + alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16], \ + alpha[0x17], alpha[0x16], alpha[0x17], alpha[0x16] ) ); \ + sC = _mm256_xor_si256( sC, _mm256_set_epi32( \ + alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18], \ + alpha[0x19], alpha[0x18], alpha[0x19], alpha[0x18] ) ); \ + sD = _mm256_xor_si256( sD, _mm256_set_epi32( \ + alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A], \ + alpha[0x1B], alpha[0x1A], alpha[0x1B], alpha[0x1A] ) ); \ + sE = _mm256_xor_si256( sE, _mm256_set_epi32( \ + alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C], \ + alpha[0x1D], alpha[0x1C], alpha[0x1D], alpha[0x1C] ) ); \ + sF = _mm256_xor_si256( sF, _mm256_set_epi32( \ + alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E], \ + alpha[0x1F], alpha[0x1E], alpha[0x1F], alpha[0x1E] ) ); \ +\ + SBOX( s0, s4, s8, sC ); \ + SBOX( s1, s5, s9, sD ); \ + SBOX( s2, s6, sA, sE ); \ + SBOX( s3, s7, sB, sF ); \ +\ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), \ + _mm256_bslli_epi128( s5, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sD, 4 ), \ + _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ + L( s0, t1, s9, t3 ); \ + s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ + s5 = _mm256_blend_epi32( s5, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ + sD = _mm256_blend_epi32( sD, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ + sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ +\ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), 
\ + _mm256_bslli_epi128( s6, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sE, 4 ), \ + _mm256_bslli_epi128( sF, 4 ), 0xAA ); \ + L( s1, t1, sA, t3 ); \ + s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ + s6 = _mm256_blend_epi32( s6, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ + sE = _mm256_blend_epi32( sE, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ + sF = _mm256_blend_epi32( sF, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ +\ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s6, 4 ), \ + _mm256_bslli_epi128( s7, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sF, 4 ), \ + _mm256_bslli_epi128( sC, 4 ), 0xAA ); \ + L( s2, t1, sB, t3 ); \ + s6 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ + s7 = _mm256_blend_epi32( s7, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ + sF = _mm256_blend_epi32( sF, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ + sC = _mm256_blend_epi32( sC, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ +\ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s7, 4 ), \ + _mm256_bslli_epi128( s4, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( sC, 4 ), \ + _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ + L( s3, t1, s8, t3 ); \ + s7 = _mm256_blend_epi32( s7, _mm256_bslli_epi128( t1, 4 ), 0xAA );\ + s4 = _mm256_blend_epi32( s4, _mm256_bsrli_epi128( t1, 4 ), 0x55 );\ + sC = _mm256_blend_epi32( sC, _mm256_bslli_epi128( t3, 4 ), 0xAA );\ + sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t3, 4 ), 0x55 );\ +\ + t0 = _mm256_blend_epi32( s0, _mm256_bslli_epi128( s8, 4 ), 0xAA ); \ + t1 = _mm256_blend_epi32( s1, s9, 0xAA ); \ + t2 = _mm256_blend_epi32( _mm256_bsrli_epi128( s2, 4 ), sA, 0xAA ); \ + t3 = _mm256_blend_epi32( _mm256_bsrli_epi128( s3, 4 ), \ + _mm256_bslli_epi128( sB, 4 ), 0xAA ); \ + L( t0, t1, t2, t3 ); \ + s0 = _mm256_blend_epi32( s0, t0, 0x55 ); \ + s8 = _mm256_blend_epi32( s8, _mm256_bsrli_epi128( t0, 4 ), 0x55 ); \ + s1 = _mm256_blend_epi32( s1, t1, 0x55 ); \ + s9 = _mm256_blend_epi32( s9, t1, 0xAA ); \ + s2 = _mm256_blend_epi32( s2, _mm256_bslli_epi128( t2, 4 ), 0xAA ); \ + sA = _mm256_blend_epi32( sA, t2, 0xAA ); \ + s3 = _mm256_blend_epi32( s3, _mm256_bslli_epi128( t3, 4 ), 0xAA ); \ + sB = _mm256_blend_epi32( sB, _mm256_bsrli_epi128( t3, 4 ), 0x55 ); \ +\ + t0 = _mm256_blend_epi32( _mm256_bsrli_epi128( s4, 4 ), sC, 0xAA ); \ + t1 = _mm256_blend_epi32( _mm256_bsrli_epi128( s5, 4 ), \ + _mm256_bslli_epi128( sD, 4 ), 0xAA ); \ + t2 = _mm256_blend_epi32( s6, _mm256_bslli_epi128( sE, 4 ), 0xAA ); \ + t3 = _mm256_blend_epi32( s7, sF, 0xAA ); \ + L( t0, t1, t2, t3 ); \ + s4 = _mm256_blend_epi32( s4, _mm256_bslli_epi128( t0, 4 ), 0xAA ); \ + sC = _mm256_blend_epi32( sC, t0, 0xAA ); \ + s5 = _mm256_blend_epi32( s5, _mm256_bslli_epi128( t1, 4 ), 0xAA ); \ + sD = _mm256_blend_epi32( sD, _mm256_bsrli_epi128( t1, 4 ), 0x55 ); \ + s6 = _mm256_blend_epi32( s6, t2, 0x55 ); \ + sE = _mm256_blend_epi32( sE, _mm256_bsrli_epi128( t2, 4 ), 0x55 ); \ + s7 = _mm256_blend_epi32( s7, t3, 0x55 ); \ + sF = _mm256_blend_epi32( sF, t3, 0xAA ); \ } while (0) -#define P_BIG do { \ - ROUND_BIG(0, alpha_n); \ - ROUND_BIG(1, alpha_n); \ - ROUND_BIG(2, alpha_n); \ - ROUND_BIG(3, alpha_n); \ - ROUND_BIG(4, alpha_n); \ - ROUND_BIG(5, alpha_n); \ - } while (0) +#define P_BIG \ +do { \ + ROUND_BIG(0, alpha_n); \ + ROUND_BIG(1, alpha_n); \ + ROUND_BIG(2, alpha_n); \ + ROUND_BIG(3, alpha_n); \ + ROUND_BIG(4, alpha_n); \ + ROUND_BIG(5, alpha_n); \ +} while (0) -#define PF_BIG do { \ - ROUND_BIG(0, alpha_f); \ - ROUND_BIG(1, alpha_f); \ - ROUND_BIG(2, alpha_f); \ - 
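// Editor's note on the blend/shift idiom in the new ROUND_BIG above: each
// __m256i now packs two adjacent 32-bit state words per lane, so the
// diagonal L() steps need word pairs drawn from two different registers.
// _mm256_bsrli_epi128 / _mm256_bslli_epi128 move the odd/even word of each
// pair into place and _mm256_blend_epi32 with masks 0xAA / 0x55 merges
// them back. Illustrative example of the blend semantics only:
//
//   __m256i r = _mm256_blend_epi32( a, b, 0xAA );
//   // r = { a0, b1, a2, b3, a4, b5, a6, b7 }   (odd elements taken from b)
//
//   __m256i s = _mm256_blend_epi32( a, b, 0x55 );
//   // s = { b0, a1, b2, a3, b4, a5, b6, a7 }   (even elements taken from b)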
ROUND_BIG(3, alpha_f); \ - ROUND_BIG(4, alpha_f); \ - ROUND_BIG(5, alpha_f); \ - ROUND_BIG(6, alpha_f); \ - ROUND_BIG(7, alpha_f); \ - ROUND_BIG(8, alpha_f); \ - ROUND_BIG(9, alpha_f); \ - ROUND_BIG(10, alpha_f); \ - ROUND_BIG(11, alpha_f); \ - } while (0) +#define PF_BIG \ +do { \ + ROUND_BIG( 0, alpha_f); \ + ROUND_BIG( 1, alpha_f); \ + ROUND_BIG( 2, alpha_f); \ + ROUND_BIG( 3, alpha_f); \ + ROUND_BIG( 4, alpha_f); \ + ROUND_BIG( 5, alpha_f); \ + ROUND_BIG( 6, alpha_f); \ + ROUND_BIG( 7, alpha_f); \ + ROUND_BIG( 8, alpha_f); \ + ROUND_BIG( 9, alpha_f); \ + ROUND_BIG(10, alpha_f); \ + ROUND_BIG(11, alpha_f); \ +} while (0) #define T_BIG \ do { /* order is important */ \ - cF = _mm_xor_si128( sc->h[ 0xF ], s17 ); \ - cE = _mm_xor_si128( sc->h[ 0xE ], s16 ); \ - cD = _mm_xor_si128( sc->h[ 0xD ], s15 ); \ - cC = _mm_xor_si128( sc->h[ 0xC ], s14 ); \ - cB = _mm_xor_si128( sc->h[ 0xB ], s13 ); \ - cA = _mm_xor_si128( sc->h[ 0xA ], s12 ); \ - c9 = _mm_xor_si128( sc->h[ 0x9 ], s11 ); \ - c8 = _mm_xor_si128( sc->h[ 0x8 ], s10 ); \ - c7 = _mm_xor_si128( sc->h[ 0x7 ], s07 ); \ - c6 = _mm_xor_si128( sc->h[ 0x6 ], s06 ); \ - c5 = _mm_xor_si128( sc->h[ 0x5 ], s05 ); \ - c4 = _mm_xor_si128( sc->h[ 0x4 ], s04 ); \ - c3 = _mm_xor_si128( sc->h[ 0x3 ], s03 ); \ - c2 = _mm_xor_si128( sc->h[ 0x2 ], s02 ); \ - c1 = _mm_xor_si128( sc->h[ 0x1 ], s01 ); \ - c0 = _mm_xor_si128( sc->h[ 0x0 ], s00 ); \ + c7 = sc->h[ 0x7 ] = _mm256_xor_si256( sc->h[ 0x7 ], sB ); \ + c6 = sc->h[ 0x6 ] = _mm256_xor_si256( sc->h[ 0x6 ], sA ); \ + c5 = sc->h[ 0x5 ] = _mm256_xor_si256( sc->h[ 0x5 ], s9 ); \ + c4 = sc->h[ 0x4 ] = _mm256_xor_si256( sc->h[ 0x4 ], s8 ); \ + c3 = sc->h[ 0x3 ] = _mm256_xor_si256( sc->h[ 0x3 ], s3 ); \ + c2 = sc->h[ 0x2 ] = _mm256_xor_si256( sc->h[ 0x2 ], s2 ); \ + c1 = sc->h[ 0x1 ] = _mm256_xor_si256( sc->h[ 0x1 ], s1 ); \ + c0 = sc->h[ 0x0 ] = _mm256_xor_si256( sc->h[ 0x0 ], s0 ); \ } while (0) -void hamsi_big( hamsi_4way_big_context *sc, __m128i *buf, size_t num ) +void hamsi_big( hamsi_4way_big_context *sc, __m256i *buf, size_t num ) { DECL_STATE_BIG sph_u32 tmp; @@ -394,30 +837,22 @@ void hamsi_big( hamsi_4way_big_context *sc, __m128i *buf, size_t num ) sc->count_high++; READ_STATE_BIG( sc ); - while ( num-- > 0 ) { - __m128i m0, m1, m2, m3, m4, m5, m6, m7; - __m128i m8, m9, mA, mB, mC, mD, mE, mF; + __m256i m0, m1, m2, m3, m4, m5, m6, m7; INPUT_BIG; P_BIG; T_BIG; - -// Strange kluge. Without the following WRITE_STATE the hash is bad. -// SPH doesn't do it. 
- WRITE_STATE_BIG( sc ); - buf += 2; + buf++; } WRITE_STATE_BIG( sc ); } -void hamsi_big_final( hamsi_4way_big_context *sc, __m128i *buf ) +void hamsi_big_final( hamsi_4way_big_context *sc, __m256i *buf ) { - __m128i m0, m1, m2, m3, m4, m5, m6, m7; - __m128i m8, m9, mA, mB, mC, mD, mE, mF; + __m256i m0, m1, m2, m3, m4, m5, m6, m7; DECL_STATE_BIG - READ_STATE_BIG( sc ); INPUT_BIG; PF_BIG; @@ -425,18 +860,28 @@ void hamsi_big_final( hamsi_4way_big_context *sc, __m128i *buf ) WRITE_STATE_BIG( sc ); } -void hamsi_big_init( hamsi_4way_big_context *sc, const sph_u32 *iv ) +void hamsi512_4way_init( hamsi_4way_big_context *sc ) { sc->partial_len = 0; + sph_u32 lo, hi; sc->count_high = sc->count_low = 0; - for ( int i = 0; i < 16; i ++ ) - sc->h[i] = _mm_set1_epi32( iv[i] ); + for ( int i = 0; i < 8; i++ ) + { + lo = 2*i; + hi = 2*i + 1; + sc->h[i] = _mm256_set_epi32( IV512[hi], IV512[lo], IV512[hi], IV512[lo], + IV512[hi], IV512[lo], IV512[hi], IV512[lo] ); + } } -void hamsi_big_core( hamsi_4way_big_context *sc, const void *data, size_t len ) +void hamsi512_4way( hamsi_4way_big_context *sc, const void *data, size_t len ) { - __m128i *vdata = (__m128i*)data; + __m256i *vdata = (__m256i*)data; +// It looks like the only way to get in here is if core was previously called +// with a very small len +// That's not likely even with 80 byte input so deprecate partial len +/* if ( sc->partial_len != 0 ) { size_t mlen; @@ -444,67 +889,47 @@ void hamsi_big_core( hamsi_4way_big_context *sc, const void *data, size_t len ) mlen = 8 - sc->partial_len; if ( len < mlen ) { - memcpy_128( sc->partial + (sc->partial_len >> 2), data, len>>2 ); + memcpy_256( sc->partial + (sc->partial_len >> 3), data, len>>3 ); sc->partial_len += len; return; } else { - memcpy_128( sc->partial + (sc->partial_len >> 2), data, mlen>>2 ); + memcpy_256( sc->partial + (sc->partial_len >> 3), data, mlen>>3 ); len -= mlen; - vdata += mlen>>2; + vdata += mlen>>3; hamsi_big( sc, sc->partial, 1 ); sc->partial_len = 0; } } +*/ hamsi_big( sc, vdata, len>>3 ); - vdata += ( (len& ~(size_t)7) >> 2 ); + vdata += ( (len& ~(size_t)7) >> 3 ); len &= (size_t)7; - memcpy_128( sc->partial, vdata, len>>2 ); + memcpy_256( sc->buf, vdata, len>>3 ); + sc->partial_len = len; } -void hamsi_big_close( hamsi_4way_big_context *sc, void *dst, - size_t out_size_w32 ) +void hamsi512_4way_close( hamsi_4way_big_context *sc, void *dst ) { - __m128i pad[2]; - size_t ptr, u; - __m128i *out = (__m128i*)dst; + __m256i *out = (__m256i*)dst; + __m256i pad[1]; + size_t u; + int ch, cl; - ptr = sc->partial_len; - - pad[0] = mm_byteswap_32( _mm_set1_epi32( sc->count_high ) ); - pad[1] = mm_byteswap_32( _mm_set1_epi32( sc->count_low + (ptr << 3) ) ); - - sc->partial[ ptr>>2 ] = _mm_set1_epi32( 0x80UL ); - - if ( ptr < 8 ) - memset_zero_128( sc->partial + (ptr>>2) + 1, (8-ptr) >> 2 ); - - hamsi_big( sc, sc->partial, 1 ); + sph_enc32be( &ch, sc->count_high ); + sph_enc32be( &cl, sc->count_low + ( sc->partial_len << 3 ) ); + pad[0] = _mm256_set_epi32( cl, ch, cl, ch, cl, ch, cl, ch ); + sc->buf[0] = _mm256_set_epi32( 0UL, 0x80UL, 0UL, 0x80UL, + 0UL, 0x80UL, 0UL, 0x80UL ); + hamsi_big( sc, sc->buf, 1 ); hamsi_big_final( sc, pad ); - - for ( u = 0; u < 16; u ++ ) - out[u] = mm_byteswap_32( sc->h[u] ); -} - -void hamsi512_4way_init( void *cc ) -{ - hamsi_big_init( cc, IV512 ); -} - -void hamsi512_4way( void *cc, const void *data, size_t len ) -{ - hamsi_big_core( cc, data, len ); -} - -void hamsi512_4way_close( void *cc, void *dst ) -{ - hamsi_big_close( cc, dst, 16 ); + for ( u = 0; u 
< 8; u ++ ) + out[u] = mm256_bswap_32( sc->h[u] ); } #ifdef __cplusplus } #endif - #endif diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h index 10d0fbe..6122ac8 100644 --- a/algo/hamsi/hamsi-hash-4way.h +++ b/algo/hamsi/hamsi-hash-4way.h @@ -48,20 +48,20 @@ extern "C"{ #define SPH_SIZE_hamsi512 512 +// Partial is only scalar but needs pointer ref for hamsi-helper +// deprecate partial_len typedef struct { - __m128i h[16]; - __m128i partial[2]; + __m256i h[8]; + __m256i buf[1]; size_t partial_len; sph_u32 count_high, count_low; } hamsi_4way_big_context; typedef hamsi_4way_big_context hamsi512_4way_context; -void hamsi512_4way_init(void *cc); - -void hamsi512_4way(void *cc, const void *data, size_t len); - -void hamsi512_4way_close(void *cc, void *dst); +void hamsi512_4way_init( hamsi512_4way_context *sc ); +void hamsi512_4way( hamsi512_4way_context *sc, const void *data, size_t len ); +void hamsi512_4way_close( hamsi512_4way_context *sc, void *dst ); #ifdef __cplusplus } diff --git a/algo/hamsi/hamsi-helper-4way.c b/algo/hamsi/hamsi-helper-4way.c deleted file mode 100644 index 309f3c5..0000000 --- a/algo/hamsi/hamsi-helper-4way.c +++ /dev/null @@ -1,482 +0,0 @@ -/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */ -/* - * Helper code for Hamsi (input block expansion). This code is - * automatically generated and includes precomputed tables for - * expansion code which handles 2 to 8 bits at a time. - * - * This file is included from hamsi.c, and is not meant to be compiled - * independently. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author Thomas Pornin - */ - -#ifdef __cplusplus -extern "C"{ -#endif - -/* Note: this table lists bits within each byte from least - siginificant to most significant. 
*/ -static const sph_u32 T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - 
SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { 
SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), 
SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), 
SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - 
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) }, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), 
SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } -}; - -#define U_BIG( n ) \ -do { \ - __m128i db = buf[n]; \ - for ( int u = 0; u < 32; u++ ) \ - { \ - __m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \ - m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ - db = _mm_srli_epi32( db, 1 ); \ - } \ -} while (0); - -#define INPUT_BIG \ -do { \ - const sph_u32 *tp = &T512[0][0]; \ - m0 = mm_zero; \ - m1 = mm_zero; \ - m2 = mm_zero; \ - m3 = mm_zero; \ - m4 = mm_zero; \ - m5 = mm_zero; \ - m6 = mm_zero; \ - m7 = mm_zero; \ - m8 = mm_zero; \ - m9 = mm_zero; \ - mA = mm_zero; \ - mB = mm_zero; \ - mC = mm_zero; \ - mD = mm_zero; \ - mE = mm_zero; \ - mF = mm_zero; \ - U_BIG( 0 ); \ - U_BIG( 1 ); \ -} while (0) - -#ifdef __cplusplus -} -#endif diff --git a/algo/hamsi/sph_hamsi.c.test b/algo/hamsi/sph_hamsi.c.test new file mode 100644 index 0000000..8c481c5 --- /dev/null +++ b/algo/hamsi/sph_hamsi.c.test @@ -0,0 +1,940 @@ +/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */ +/* + * Hamsi implementation. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_hamsi.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAMSI +#define SPH_SMALL_FOOTPRINT_HAMSI 1 +#endif + +/* + * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one + * table lookup during message expansion (1 to 8, inclusive). If we note + * w the number of bits per message word (w=32 for Hamsi-224/256, w=64 + * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for + * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level, + * then we will get t tables (where t=ceil(w/n)) of individual size + * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and + * n=5, there are 7 tables, but the last one uses only two bits on + * input, not five). + * + * Also, we read t rows of r words from RAM. Words in a given row are + * concatenated in RAM in that order, so most of the cost is about + * reading the first row word; comparatively, cache misses are thus + * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8). + * + * When n=1, tables are "special" in that we omit the first entry of + * each table (which always contains 0), so that total table size is + * halved. + * + * We thus have the following (size1 is the cumulative table size of + * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2 + * are for Hamsi-224/256 and Hamsi-384/512, respectively). + * + * n size1 size2 t1 t2 + * --------------------------------------- + * 1 1024 4096 32 64 + * 2 2048 8192 16 32 + * 3 2688 10880 11 22 + * 4 4096 16384 8 16 + * 5 6272 25600 7 13 + * 6 10368 41984 6 11 + * 7 16896 73856 5 10 + * 8 32768 131072 4 8 + * + * So there is a trade-off: a lower n makes the tables fit better in + * L1 cache, but increases the number of memory accesses. The optimal + * value depends on the amount of available L1 cache and the relative + * impact of a cache miss. 
+ * + * Experimentally, in ideal benchmark conditions (which are not necessarily + * realistic with regards to L1 cache contention), it seems that n=8 is + * the best value on "big" architectures (those with 32 kB or more of L1 + * cache), while n=4 is better on "small" architectures. This was tested + * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3 + * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302 + * (8 kB L1 cache). + * + * Note: with n=1, the 32 tables (actually implemented as one big table) + * are read entirely and sequentially, regardless of the input data, + * thus avoiding any data-dependent table access pattern. + */ + +#if !defined SPH_HAMSI_EXPAND_SMALL +#if SPH_SMALL_FOOTPRINT_HAMSI +#define SPH_HAMSI_EXPAND_SMALL 4 +#else +#define SPH_HAMSI_EXPAND_SMALL 8 +#endif +#endif + +#if !defined SPH_HAMSI_EXPAND_BIG +#define SPH_HAMSI_EXPAND_BIG 8 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#include "sph_hamsi_helper.c" + +static const sph_u32 IV224[] = { + SPH_C32(0xc3967a67), SPH_C32(0xc3bc6c20), SPH_C32(0x4bc3bcc3), + SPH_C32(0xa7c3bc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c), + SPH_C32(0x69656b65), SPH_C32(0x20556e69) +}; + +/* + * This version is the one used in the Hamsi submission package for + * round 2 of the SHA-3 competition; the UTF-8 encoding is wrong and + * shall soon be corrected in the official Hamsi specification. + * +static const sph_u32 IV224[] = { + SPH_C32(0x3c967a67), SPH_C32(0x3cbc6c20), SPH_C32(0xb4c343c3), + SPH_C32(0xa73cbc6b), SPH_C32(0x2c204b61), SPH_C32(0x74686f6c), + SPH_C32(0x69656b65), SPH_C32(0x20556e69) +}; + */ + +static const sph_u32 IV256[] = { + SPH_C32(0x76657273), SPH_C32(0x69746569), SPH_C32(0x74204c65), + SPH_C32(0x7576656e), SPH_C32(0x2c204465), SPH_C32(0x70617274), + SPH_C32(0x656d656e), SPH_C32(0x7420456c) +}; + +static const sph_u32 IV384[] = { + SPH_C32(0x656b7472), SPH_C32(0x6f746563), SPH_C32(0x686e6965), + SPH_C32(0x6b2c2043), SPH_C32(0x6f6d7075), SPH_C32(0x74657220), + SPH_C32(0x53656375), SPH_C32(0x72697479), SPH_C32(0x20616e64), + SPH_C32(0x20496e64), SPH_C32(0x75737472), SPH_C32(0x69616c20), + SPH_C32(0x43727970), SPH_C32(0x746f6772), SPH_C32(0x61706879), + SPH_C32(0x2c204b61) +}; + +static const sph_u32 IV512[] = { + SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), + SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062), + SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33), + SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48), + SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c), + SPH_C32(0x6769756d) +}; + +static const sph_u32 alpha_n[] = { + SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), + SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), + SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), + SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), + SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), + SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), + SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), + SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) +}; + +static const sph_u32 alpha_f[] = { + SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), + 
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), + SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), + SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), + SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), + SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), + SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) +}; + +#define DECL_STATE_SMALL \ + sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; + +#define READ_STATE_SMALL(sc) do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ + } while (0) + +#define s0 m0 +#define s1 m1 +#define s2 c0 +#define s3 c1 +#define s4 c2 +#define s5 c3 +#define s6 m2 +#define s7 m3 +#define s8 m4 +#define s9 m5 +#define sA c4 +#define sB c5 +#define sC c6 +#define sD c7 +#define sE m6 +#define sF m7 + +#define SBOX(a, b, c, d) do { \ + sph_u32 t; \ + t = (a); \ + (a) &= (c); \ + (a) ^= (d); \ + (c) ^= (b); \ + (c) ^= (a); \ + (d) |= t; \ + (d) ^= (b); \ + t ^= (c); \ + (b) = (d); \ + (d) |= t; \ + (d) ^= (a); \ + (a) &= (b); \ + t ^= (a); \ + (b) ^= (d); \ + (b) ^= t; \ + (a) = (c); \ + (c) = (b); \ + (b) = (d); \ + (d) = SPH_T32(~t); \ + } while (0) + +#define L(a, b, c, d) do { \ + (a) = SPH_ROTL32(a, 13); \ + (c) = SPH_ROTL32(c, 3); \ + (b) ^= (a) ^ (c); \ + (d) ^= (c) ^ SPH_T32((a) << 3); \ + (b) = SPH_ROTL32(b, 1); \ + (d) = SPH_ROTL32(d, 7); \ + (a) ^= (b) ^ (d); \ + (c) ^= (d) ^ SPH_T32((b) << 7); \ + (a) = SPH_ROTL32(a, 5); \ + (c) = SPH_ROTL32(c, 22); \ + } while (0) + +#define ROUND_SMALL(rc, alpha) do { \ + s0 ^= alpha[0x00]; \ + s1 ^= alpha[0x01] ^ (sph_u32)(rc); \ + s2 ^= alpha[0x02]; \ + s3 ^= alpha[0x03]; \ + s4 ^= alpha[0x08]; \ + s5 ^= alpha[0x09]; \ + s6 ^= alpha[0x0A]; \ + s7 ^= alpha[0x0B]; \ + s8 ^= alpha[0x10]; \ + s9 ^= alpha[0x11]; \ + sA ^= alpha[0x12]; \ + sB ^= alpha[0x13]; \ + sC ^= alpha[0x18]; \ + sD ^= alpha[0x19]; \ + sE ^= alpha[0x1A]; \ + sF ^= alpha[0x1B]; \ + SBOX(s0, s4, s8, sC); \ + SBOX(s1, s5, s9, sD); \ + SBOX(s2, s6, sA, sE); \ + SBOX(s3, s7, sB, sF); \ + L(s0, s5, sA, sF); \ + L(s1, s6, sB, sC); \ + L(s2, s7, s8, sD); \ + L(s3, s4, s9, sE); \ + } while (0) + +#define P_SMALL do { \ + ROUND_SMALL(0, alpha_n); \ + ROUND_SMALL(1, alpha_n); \ + ROUND_SMALL(2, alpha_n); \ + } while (0) + +#define PF_SMALL do { \ + ROUND_SMALL(0, alpha_f); \ + ROUND_SMALL(1, alpha_f); \ + ROUND_SMALL(2, alpha_f); \ + ROUND_SMALL(3, alpha_f); \ + ROUND_SMALL(4, alpha_f); \ + ROUND_SMALL(5, alpha_f); \ + } while (0) + +#define T_SMALL do { \ + /* order is important */ \ + c7 = (sc->h[7] ^= sB); \ + c6 = (sc->h[6] ^= sA); \ + c5 = (sc->h[5] ^= s9); \ + c4 = (sc->h[4] ^= s8); \ + c3 = (sc->h[3] ^= s3); \ + c2 = (sc->h[2] ^= s2); \ + c1 = (sc->h[1] ^= s1); \ + c0 = (sc->h[0] ^= s0); \ + } while (0) + +static void +hamsi_small(sph_hamsi_small_context *sc, const unsigned char *buf, size_t num) +{ + DECL_STATE_SMALL +#if !SPH_64 + sph_u32 tmp; +#endif + +#if SPH_64 + sc->count += (sph_u64)num << 5; +#else + tmp = SPH_T32((sph_u32)num << 5); + sc->count_low = 
SPH_T32(sc->count_low + tmp); + sc->count_high += (sph_u32)((num >> 13) >> 14); + if (sc->count_low < tmp) + sc->count_high ++; +#endif + READ_STATE_SMALL(sc); + while (num -- > 0) { + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + + INPUT_SMALL; + P_SMALL; + T_SMALL; + buf += 4; + } + WRITE_STATE_SMALL(sc); +} + +static void +hamsi_small_final(sph_hamsi_small_context *sc, const unsigned char *buf) +{ + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + DECL_STATE_SMALL + + READ_STATE_SMALL(sc); + INPUT_SMALL; + PF_SMALL; + T_SMALL; + WRITE_STATE_SMALL(sc); +} + +static void +hamsi_small_init(sph_hamsi_small_context *sc, const sph_u32 *iv) +{ + sc->partial_len = 0; + memcpy(sc->h, iv, sizeof sc->h); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +static void +hamsi_small_core(sph_hamsi_small_context *sc, const void *data, size_t len) +{ + if (sc->partial_len != 0) { + size_t mlen; + + mlen = 4 - sc->partial_len; + if (len < mlen) { + memcpy(sc->partial + sc->partial_len, data, len); + sc->partial_len += len; + return; + } else { + memcpy(sc->partial + sc->partial_len, data, mlen); + len -= mlen; + data = (const unsigned char *)data + mlen; + hamsi_small(sc, sc->partial, 1); + sc->partial_len = 0; + } + } + + hamsi_small(sc, data, (len >> 2)); + data = (const unsigned char *)data + (len & ~(size_t)3); + len &= (size_t)3; + memcpy(sc->partial, data, len); + sc->partial_len = len; +} + +static void +hamsi_small_close(sph_hamsi_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char pad[12]; + size_t ptr, u; + unsigned z; + unsigned char *out; + + ptr = sc->partial_len; + memcpy(pad, sc->partial, ptr); +#if SPH_64 + sph_enc64be(pad + 4, sc->count + (ptr << 3) + n); +#else + sph_enc32be(pad + 4, sc->count_high); + sph_enc32be(pad + 8, sc->count_low + (ptr << 3) + n); +#endif + z = 0x80 >> n; + pad[ptr ++] = ((ub & -z) | z) & 0xFF; + while (ptr < 4) + pad[ptr ++] = 0; + hamsi_small(sc, pad, 2); + hamsi_small_final(sc, pad + 8); + out = dst; + for (u = 0; u < out_size_w32; u ++) + sph_enc32be(out + (u << 2), sc->h[u]); +} + +#define DECL_STATE_BIG \ + sph_u32 c0, c1, c2, c3, c4, c5, c6, c7; \ + sph_u32 c8, c9, cA, cB, cC, cD, cE, cF; + +#define READ_STATE_BIG(sc) do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ + c8 = sc->h[0x8]; \ + c9 = sc->h[0x9]; \ + cA = sc->h[0xA]; \ + cB = sc->h[0xB]; \ + cC = sc->h[0xC]; \ + cD = sc->h[0xD]; \ + cE = sc->h[0xE]; \ + cF = sc->h[0xF]; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ + sc->h[0x8] = c8; \ + sc->h[0x9] = c9; \ + sc->h[0xA] = cA; \ + sc->h[0xB] = cB; \ + sc->h[0xC] = cC; \ + sc->h[0xD] = cD; \ + sc->h[0xE] = cE; \ + sc->h[0xF] = cF; \ + } while (0) + +#define s00 m0 +#define s01 m1 +#define s02 c0 +#define s03 c1 +#define s04 m2 +#define s05 m3 +#define s06 c2 +#define s07 c3 +#define s08 c4 +#define s09 c5 +#define s0A m4 +#define s0B m5 +#define s0C c6 +#define s0D c7 +#define s0E m6 +#define s0F m7 +#define s10 m8 +#define s11 m9 +#define s12 c8 +#define s13 c9 +#define s14 mA +#define s15 mB +#define s16 cA +#define s17 cB +#define s18 cC +#define s19 cD +#define s1A mC +#define s1B mD +#define s1C cE +#define s1D cF +#define s1E mE +#define s1F mF + +#define ROUND_BIG(rc, alpha) 
do { \ + s00 ^= alpha[0x00]; \ + s01 ^= alpha[0x01] ^ (sph_u32)(rc); \ + s02 ^= alpha[0x02]; \ + s03 ^= alpha[0x03]; \ + s04 ^= alpha[0x04]; \ + s05 ^= alpha[0x05]; \ + s06 ^= alpha[0x06]; \ + s07 ^= alpha[0x07]; \ + s08 ^= alpha[0x08]; \ + s09 ^= alpha[0x09]; \ + s0A ^= alpha[0x0A]; \ + s0B ^= alpha[0x0B]; \ + s0C ^= alpha[0x0C]; \ + s0D ^= alpha[0x0D]; \ + s0E ^= alpha[0x0E]; \ + s0F ^= alpha[0x0F]; \ + s10 ^= alpha[0x10]; \ + s11 ^= alpha[0x11]; \ + s12 ^= alpha[0x12]; \ + s13 ^= alpha[0x13]; \ + s14 ^= alpha[0x14]; \ + s15 ^= alpha[0x15]; \ + s16 ^= alpha[0x16]; \ + s17 ^= alpha[0x17]; \ + s18 ^= alpha[0x18]; \ + s19 ^= alpha[0x19]; \ + s1A ^= alpha[0x1A]; \ + s1B ^= alpha[0x1B]; \ + s1C ^= alpha[0x1C]; \ + s1D ^= alpha[0x1D]; \ + s1E ^= alpha[0x1E]; \ + s1F ^= alpha[0x1F]; \ + SBOX(s00, s08, s10, s18); \ + SBOX(s01, s09, s11, s19); \ + SBOX(s02, s0A, s12, s1A); \ + SBOX(s03, s0B, s13, s1B); \ + SBOX(s04, s0C, s14, s1C); \ + SBOX(s05, s0D, s15, s1D); \ + SBOX(s06, s0E, s16, s1E); \ + SBOX(s07, s0F, s17, s1F); \ + L(s00, s09, s12, s1B); \ + L(s01, s0A, s13, s1C); \ + L(s02, s0B, s14, s1D); \ + L(s03, s0C, s15, s1E); \ + L(s04, s0D, s16, s1F); \ + L(s05, s0E, s17, s18); \ + L(s06, s0F, s10, s19); \ + L(s07, s08, s11, s1A); \ +/*if (rc == 0 ) { \ +printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \ +}*/ \ + L(s00, s02, s05, s07); \ + L(s10, s13, s15, s16); \ +/*if (rc == 0 ) { \ +printf("S L5 post s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \ +}*/ \ + L(s09, s0B, s0C, s0E); \ + L(s19, s1A, s1C, s1F); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_HAMSI + +#define P_BIG do { \ + unsigned r; \ + for (r = 0; r < 6; r ++) \ + ROUND_BIG(r, alpha_n); \ + } while (0) + +#define PF_BIG do { \ + unsigned r; \ + for (r = 0; r < 12; r ++) \ + ROUND_BIG(r, alpha_f); \ + } while (0) + +#else + +#define P_BIG do { \ + ROUND_BIG(0, alpha_n); \ +/*printf("S R0 s00 %08lx s01 %08lx s02 %08lx s03 %08lx\n",s00,s01,s02,s03); \ +printf("S R0 s04 %08lx s05 %08lx s06 %08lx s07 %08lx\n",s04,s05,s06,s07); \ +printf("S R0 s08 %08lx s09 %08lx s0A %08lx s0B %08lx\n",s08,s09,s0A,s0B); \ +printf("S R0 s0C %08lx s0D %08lx s0E %08lx s0F %08lx\n",s0C,s0D,s0E,s0F); \ +printf("S R0 s10 %08lx s11 %08lx s12 %08lx s13 %08lx\n",s10,s11,s12,s13); \ +printf("S R0 s14 %08lx s15 %08lx s16 %08lx s17 %08lx\n",s14,s15,s16,s17); \ +printf("S R0 s18 %08lx s19 %08lx s1A %08lx s1B %08lx\n",s18,s19,s1A,s1B); \ +printf("S R0 s1C %08lx s1D %08lx s1E %08lx s1F %08lx\n",s1C,s1D,s1E,s1F); \ +*/\ + ROUND_BIG(1, alpha_n); \ + ROUND_BIG(2, alpha_n); \ + ROUND_BIG(3, alpha_n); \ + ROUND_BIG(4, alpha_n); \ + ROUND_BIG(5, alpha_n); \ + } while (0) + +#define PF_BIG do { \ + ROUND_BIG(0, alpha_f); \ + ROUND_BIG(1, alpha_f); \ + ROUND_BIG(2, alpha_f); \ + ROUND_BIG(3, alpha_f); \ + ROUND_BIG(4, alpha_f); \ + ROUND_BIG(5, alpha_f); \ + ROUND_BIG(6, alpha_f); \ + ROUND_BIG(7, alpha_f); \ + ROUND_BIG(8, alpha_f); \ + ROUND_BIG(9, alpha_f); \ + ROUND_BIG(10, alpha_f); \ + ROUND_BIG(11, alpha_f); \ + } while (0) + +#endif + +#define T_BIG do { \ + /* order is important */ \ + cF = (sc->h[0xF] ^= s17); \ + cE = (sc->h[0xE] ^= s16); \ + cD = (sc->h[0xD] ^= s15); \ + cC = (sc->h[0xC] ^= s14); \ + cB = (sc->h[0xB] ^= s13); \ + cA = (sc->h[0xA] ^= s12); \ + c9 = (sc->h[0x9] ^= s11); \ + c8 = (sc->h[0x8] ^= s10); \ + c7 = (sc->h[0x7] ^= s07); \ + c6 = (sc->h[0x6] ^= s06); \ + c5 = (sc->h[0x5] ^= s05); \ + c4 = (sc->h[0x4] ^= s04); \ + c3 = (sc->h[0x3] ^= s03); \ + c2 = (sc->h[0x2] ^= s02); \ + c1 = (sc->h[0x1] ^= s01); \ 
+ c0 = (sc->h[0x0] ^= s00); \ + } while (0) + +static void +hamsi_big(sph_hamsi_big_context *sc, const unsigned char *buf, size_t num) +{ + DECL_STATE_BIG +#if !SPH_64 + sph_u32 tmp; +#endif + +#if SPH_64 + sc->count += (sph_u64)num << 6; +#else + tmp = SPH_T32((sph_u32)num << 6); + sc->count_low = SPH_T32(sc->count_low + tmp); + sc->count_high += (sph_u32)((num >> 13) >> 13); + if (sc->count_low < tmp) + sc->count_high ++; +#endif + READ_STATE_BIG(sc); +/* +uint32_t* b = (uint32_t*)buf; +//printf("S s64: %016llx\n",*ss); +//printf("S buf: %08lx %08lx\n",b[0], b[1]); + +int n1 = 1; +int n2 = 1; +*/ + while (num -- > 0) { + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + + INPUT_BIG; +/*if ( n1 ) +{ +n1 = 0; +printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m0,m1,m2,m3 ); +printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m4,m5,m6,m7); +printf("S INPUT m: %08lx %08lx %08lx %08lx\n",m8,m9,mA,mB ); +printf("S INPUT m: %08lx %08lx %08lx %08lx\n",mC,mD,mE,mF); +} +*/ + + P_BIG; + +/*if ( n2 ) +{ +n2 = 0; +printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s00,s01,s02,s03 ); +printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s04,s05,s07,s07); +printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s08,s09,s0A,s0B ); +printf("S P_BIG s: %08lx %08lx %08lx %08lx\n",s0C,s0D,s0E,s0F); +} +*/ + + T_BIG; + buf += 8; + } + WRITE_STATE_BIG(sc); +} + +static void +hamsi_big_final(sph_hamsi_big_context *sc, const unsigned char *buf) +{ + sph_u32 m0, m1, m2, m3, m4, m5, m6, m7; + sph_u32 m8, m9, mA, mB, mC, mD, mE, mF; + DECL_STATE_BIG + + READ_STATE_BIG(sc); + INPUT_BIG; + PF_BIG; + T_BIG; + WRITE_STATE_BIG(sc); +} + +static void +hamsi_big_init(sph_hamsi_big_context *sc, const sph_u32 *iv) +{ + sc->partial_len = 0; + memcpy(sc->h, iv, sizeof sc->h); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +static void +hamsi_big_core(sph_hamsi_big_context *sc, const void *data, size_t len) +{ +uint64_t* d = (uint64_t*)data; +uint64_t* h = (uint64_t*)sc->h; +/* +printf("S core1 len = %d\n",len); +printf("S data: %016llx %016llx %016llx %016llx\n",d[0],d[1],d[2],d[3]); +printf("S data: %016llx %016llx %016llx %016llx\n",d[4],d[5],d[6],d[7]); +printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); +*/ + if (sc->partial_len != 0) { +//printf("WARNING partial_len != 0\n"); + + size_t mlen; + + mlen = 8 - sc->partial_len; + if (len < mlen) { + memcpy(sc->partial + sc->partial_len, data, len); + sc->partial_len += len; + return; + } else { + memcpy(sc->partial + sc->partial_len, data, mlen); + len -= mlen; + data = (const unsigned char *)data + mlen; + hamsi_big(sc, sc->partial, 1); + sc->partial_len = 0; + } + } + + hamsi_big(sc, data, (len >> 3)); +/* +printf("S core2\n"); +printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); +*/ + data = (const unsigned char *)data + (len & ~(size_t)7); + len &= (size_t)7; + memcpy(sc->partial, data, len); + sc->partial_len = len; +} + +static void +hamsi_big_close(sph_hamsi_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char pad[8]; + size_t ptr, u; + unsigned z; + unsigned char *out; +//uint64_t* h = (uint64_t*)sc->h; + + ptr = sc->partial_len; +#if SPH_64 + sph_enc64be(pad, sc->count + (ptr << 3) + n); +#else + sph_enc32be(pad, sc->count_high); + sph_enc32be(pad + 4, sc->count_low + (ptr << 3) + n); +#endif + z = 0x80 >> n; + sc->partial[ptr ++] = ((ub & -z) | z) & 0xFF; + while (ptr < 8) + sc->partial[ptr ++] = 0; + +//printf("S close1\n"); +//printf("S H: 
%016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); + + hamsi_big(sc, sc->partial, 1); + +//printf("S close2\n"); +//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); + + + hamsi_big_final(sc, pad); + +//printf("S close3\n"); +//printf("S H: %016llx %016llx %016llx %016llx\n",h[0],h[1],h[2],h[3]); + + + out = dst; + if (out_size_w32 == 12) { + sph_enc32be(out + 0, sc->h[ 0]); + sph_enc32be(out + 4, sc->h[ 1]); + sph_enc32be(out + 8, sc->h[ 3]); + sph_enc32be(out + 12, sc->h[ 4]); + sph_enc32be(out + 16, sc->h[ 5]); + sph_enc32be(out + 20, sc->h[ 6]); + sph_enc32be(out + 24, sc->h[ 8]); + sph_enc32be(out + 28, sc->h[ 9]); + sph_enc32be(out + 32, sc->h[10]); + sph_enc32be(out + 36, sc->h[12]); + sph_enc32be(out + 40, sc->h[13]); + sph_enc32be(out + 44, sc->h[15]); + } else { + for (u = 0; u < 16; u ++) + sph_enc32be(out + (u << 2), sc->h[u]); + } +} + +/* see sph_hamsi.h */ +void +sph_hamsi224_init(void *cc) +{ + hamsi_small_init(cc, IV224); +} + +/* see sph_hamsi.h */ +void +sph_hamsi224(void *cc, const void *data, size_t len) +{ + hamsi_small_core(cc, data, len); +} + +/* see sph_hamsi.h */ +void +sph_hamsi224_close(void *cc, void *dst) +{ + hamsi_small_close(cc, 0, 0, dst, 7); +// hamsi_small_init(cc, IV224); +} + +/* see sph_hamsi.h */ +void +sph_hamsi224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + hamsi_small_close(cc, ub, n, dst, 7); +// hamsi_small_init(cc, IV224); +} + +/* see sph_hamsi.h */ +void +sph_hamsi256_init(void *cc) +{ + hamsi_small_init(cc, IV256); +} + +/* see sph_hamsi.h */ +void +sph_hamsi256(void *cc, const void *data, size_t len) +{ + hamsi_small_core(cc, data, len); +} + +/* see sph_hamsi.h */ +void +sph_hamsi256_close(void *cc, void *dst) +{ + hamsi_small_close(cc, 0, 0, dst, 8); +// hamsi_small_init(cc, IV256); +} + +/* see sph_hamsi.h */ +void +sph_hamsi256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + hamsi_small_close(cc, ub, n, dst, 8); +// hamsi_small_init(cc, IV256); +} + +/* see sph_hamsi.h */ +void +sph_hamsi384_init(void *cc) +{ + hamsi_big_init(cc, IV384); +} + +/* see sph_hamsi.h */ +void +sph_hamsi384(void *cc, const void *data, size_t len) +{ + hamsi_big_core(cc, data, len); +} + +/* see sph_hamsi.h */ +void +sph_hamsi384_close(void *cc, void *dst) +{ + hamsi_big_close(cc, 0, 0, dst, 12); +// hamsi_big_init(cc, IV384); +} + +/* see sph_hamsi.h */ +void +sph_hamsi384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + hamsi_big_close(cc, ub, n, dst, 12); +// hamsi_big_init(cc, IV384); +} + +/* see sph_hamsi.h */ +void +sph_hamsi512_init(void *cc) +{ + hamsi_big_init(cc, IV512); +} + +/* see sph_hamsi.h */ +void +sph_hamsi512(void *cc, const void *data, size_t len) +{ + hamsi_big_core(cc, data, len); +} + +/* see sph_hamsi.h */ +void +sph_hamsi512_close(void *cc, void *dst) +{ + hamsi_big_close(cc, 0, 0, dst, 16); +// hamsi_big_init(cc, IV512); +} + +/* see sph_hamsi.h */ +void +sph_hamsi512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + hamsi_big_close(cc, ub, n, dst, 16); +// hamsi_big_init(cc, IV512); +} + +#ifdef __cplusplus +} +#endif diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c index 87de1de..c9e7ad8 100644 --- a/algo/haval/haval-4way-helper.c +++ b/algo/haval/haval-4way-helper.c @@ -83,7 +83,7 @@ SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc, current = (unsigned)sc->count_low & 127UL; - sc->buf[ current>>2 ] = mm_one_32; + sc->buf[ current>>2 ] = m128_one_32; current += 4; RSTATE; 
if ( current > 116UL ) diff --git a/algo/heavy/bastion.c b/algo/heavy/bastion.c index 1ca2c2d..fd12b2e 100644 --- a/algo/heavy/bastion.c +++ b/algo/heavy/bastion.c @@ -15,7 +15,7 @@ #include "algo/shabal/sph_shabal.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/skein/sse2/skein.c" #ifndef NO_AES_NI diff --git a/algo/hodl/hodl-gate.c b/algo/hodl/hodl-gate.c index ba065c8..7fa6791 100644 --- a/algo/hodl/hodl-gate.c +++ b/algo/hodl/hodl-gate.c @@ -99,6 +99,7 @@ int hodl_scanhash( int thr_id, struct work* work, uint32_t max_nonce, pthread_barrier_wait( &hodl_barrier ); return scanhash_hodl_wolf( thr_id, work, max_nonce, hashes_done ); #endif + return false; } bool register_hodl_algo( algo_gate_t* gate ) diff --git a/algo/jh/jha-4way.c b/algo/jh/jha-4way.c index 83029a2..4749472 100644 --- a/algo/jh/jha-4way.c +++ b/algo/jh/jha-4way.c @@ -44,7 +44,7 @@ void jha_hash_4way( void *out, const void *input ) for ( int round = 0; round < 3; round++ ) { vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( - vh[0], _mm256_set1_epi64x( 1 ) ), mm256_zero ); + vh[0], _mm256_set1_epi64x( 1 ) ), m256_zero ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); init_groestl( &ctx_groestl, 64 ); diff --git a/algo/keccak/keccak-hash-4way.c b/algo/keccak/keccak-hash-4way.c index 7f4f473..dcf4079 100644 --- a/algo/keccak/keccak-hash-4way.c +++ b/algo/keccak/keccak-hash-4way.c @@ -59,7 +59,7 @@ static const sph_u64 RC[] = { #define XOR64(d, a, b) (d = _mm256_xor_si256(a,b)) #define AND64(d, a, b) (d = _mm256_and_si256(a,b)) #define OR64(d, a, b) (d = _mm256_or_si256(a,b)) -#define NOT64(d, s) (d = _mm256_xor_si256(s,mm256_neg1)) +#define NOT64(d, s) (d = _mm256_xor_si256(s,m256_neg1)) #define ROL64(d, v, n) (d = mm256_rotl_64(v, n)) #define XOR64_IOTA XOR64 @@ -375,12 +375,12 @@ static void keccak64_init( keccak64_ctx_m256i *kc, unsigned out_size ) kc->w[i] = _mm256_setzero_si256(); // Initialization for the "lane complement". - kc->w[ 1] = mm256_neg1; - kc->w[ 2] = mm256_neg1; - kc->w[ 8] = mm256_neg1; - kc->w[12] = mm256_neg1; - kc->w[17] = mm256_neg1; - kc->w[20] = mm256_neg1; + kc->w[ 1] = m256_neg1; + kc->w[ 2] = m256_neg1; + kc->w[ 8] = m256_neg1; + kc->w[12] = m256_neg1; + kc->w[17] = m256_neg1; + kc->w[20] = m256_neg1; kc->ptr = 0; kc->lim = 200 - (out_size >> 2); } diff --git a/algo/luffa/luffa-hash-2way.c b/algo/luffa/luffa-hash-2way.c new file mode 100644 index 0000000..ea490a0 --- /dev/null +++ b/algo/luffa/luffa-hash-2way.c @@ -0,0 +1,568 @@ +/* + * luffa_for_sse2.c + * Version 2.0 (Sep 15th 2009) + * + * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. + * + * Hitachi, Ltd. is the owner of this software and hereby grant + * the U.S. Government and any interested party the right to use + * this software for the purposes of the SHA-3 evaluation process, + * notwithstanding that this software is copyrighted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include "luffa-hash-2way.h" + +#if defined(__AVX2__) + +#include "avxdefs.h" + +#define MASK _mm256_set_epi32( 0UL, 0UL, 0UL, 0xffffffffUL, \ + 0UL, 0UL, 0UL, 0xffffffffUL ) + +#define ADD_CONSTANT(a,b,c0,c1)\ + a = _mm256_xor_si256(a,c0);\ + b = _mm256_xor_si256(b,c1);\ + +#define MULT2(a0,a1) \ +do { \ + __m256i b = _mm256_xor_si256( a0, \ + _mm256_shuffle_epi32( _mm256_and_si256(a1,MASK), 16 ) ); \ + a0 = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a1,12) ); \ + a1 = _mm256_or_si256( _mm256_srli_si256(a1,4), _mm256_slli_si256(b,12) ); \ +} while(0) + +// confirm pointer arithmetic +// ok but use array indexes +#define STEP_PART(x,c,t)\ + SUBCRUMB(*x,*(x+1),*(x+2),*(x+3),*t);\ + SUBCRUMB(*(x+5),*(x+6),*(x+7),*(x+4),*t);\ + MIXWORD(*x,*(x+4),*t,*(t+1));\ + MIXWORD(*(x+1),*(x+5),*t,*(t+1));\ + MIXWORD(*(x+2),*(x+6),*t,*(t+1));\ + MIXWORD(*(x+3),*(x+7),*t,*(t+1));\ + ADD_CONSTANT(*x, *(x+4), *c, *(c+1)); + +#define SUBCRUMB(a0,a1,a2,a3,t)\ + t = _mm256_load_si256(&a0);\ + a0 = _mm256_or_si256(a0,a1);\ + a2 = _mm256_xor_si256(a2,a3);\ + a1 = _mm256_andnot_si256(a1, m256_neg1 );\ + a0 = _mm256_xor_si256(a0,a3);\ + a3 = _mm256_and_si256(a3,t);\ + a1 = _mm256_xor_si256(a1,a3);\ + a3 = _mm256_xor_si256(a3,a2);\ + a2 = _mm256_and_si256(a2,a0);\ + a0 = _mm256_andnot_si256(a0, m256_neg1 );\ + a2 = _mm256_xor_si256(a2,a1);\ + a1 = _mm256_or_si256(a1,a3);\ + t = _mm256_xor_si256(t,a1);\ + a3 = _mm256_xor_si256(a3,a2);\ + a2 = _mm256_and_si256(a2,a1);\ + a1 = _mm256_xor_si256(a1,a0);\ + a0 = _mm256_load_si256(&t);\ + +#define MIXWORD(a,b,t1,t2)\ + b = _mm256_xor_si256(a,b);\ + t1 = _mm256_slli_epi32(a,2);\ + t2 = _mm256_srli_epi32(a,30);\ + a = _mm256_or_si256(t1,t2);\ + a = _mm256_xor_si256(a,b);\ + t1 = _mm256_slli_epi32(b,14);\ + t2 = _mm256_srli_epi32(b,18);\ + b = _mm256_or_si256(t1,t2);\ + b = _mm256_xor_si256(a,b);\ + t1 = _mm256_slli_epi32(a,10);\ + t2 = _mm256_srli_epi32(a,22);\ + a = _mm256_or_si256(t1,t2);\ + a = _mm256_xor_si256(a,b);\ + t1 = _mm256_slli_epi32(b,1);\ + t2 = _mm256_srli_epi32(b,31);\ + b = _mm256_or_si256(t1,t2); + +#define STEP_PART2(a0,a1,t0,t1,c0,c1,tmp0,tmp1)\ + a1 = _mm256_shuffle_epi32(a1,147);\ + t0 = _mm256_load_si256(&a1);\ + a1 = _mm256_unpacklo_epi32(a1,a0);\ + t0 = _mm256_unpackhi_epi32(t0,a0);\ + t1 = _mm256_shuffle_epi32(t0,78);\ + a0 = _mm256_shuffle_epi32(a1,78);\ + SUBCRUMB(t1,t0,a0,a1,tmp0);\ + t0 = _mm256_unpacklo_epi32(t0,t1);\ + a1 = _mm256_unpacklo_epi32(a1,a0);\ + a0 = _mm256_load_si256(&a1);\ + a0 = _mm256_unpackhi_epi64(a0,t0);\ + a1 = _mm256_unpacklo_epi64(a1,t0);\ + a1 = _mm256_shuffle_epi32(a1,57);\ + MIXWORD(a0,a1,tmp0,tmp1);\ + ADD_CONSTANT(a0,a1,c0,c1); + +#define NMLTOM768(r0,r1,r2,s0,s1,s2,s3,p0,p1,p2,q0,q1,q2,q3)\ + s2 = _mm256_load_si256(&r1);\ + q2 = _mm256_load_si256(&p1);\ + r2 = _mm256_shuffle_epi32(r2,216);\ + p2 = _mm256_shuffle_epi32(p2,216);\ + r1 = _mm256_unpacklo_epi32(r1,r0);\ + p1 = _mm256_unpacklo_epi32(p1,p0);\ + s2 = _mm256_unpackhi_epi32(s2,r0);\ + q2 = _mm256_unpackhi_epi32(q2,p0);\ + s0 = _mm256_load_si256(&r2);\ + q0 = _mm256_load_si256(&p2);\ + r2 = _mm256_unpacklo_epi64(r2,r1);\ + p2 = _mm256_unpacklo_epi64(p2,p1);\ + s1 = _mm256_load_si256(&s0);\ + q1 = _mm256_load_si256(&q0);\ + s0 = _mm256_unpackhi_epi64(s0,r1);\ + q0 = _mm256_unpackhi_epi64(q0,p1);\ + r2 = _mm256_shuffle_epi32(r2,225);\ + p2 = _mm256_shuffle_epi32(p2,225);\ + r0 = _mm256_load_si256(&s1);\ + p0 = _mm256_load_si256(&q1);\ + s0 = _mm256_shuffle_epi32(s0,225);\ + q0 = _mm256_shuffle_epi32(q0,225);\ + s1 = 
_mm256_unpacklo_epi64(s1,s2);\ + q1 = _mm256_unpacklo_epi64(q1,q2);\ + r0 = _mm256_unpackhi_epi64(r0,s2);\ + p0 = _mm256_unpackhi_epi64(p0,q2);\ + s2 = _mm256_load_si256(&r0);\ + q2 = _mm256_load_si256(&p0);\ + s3 = _mm256_load_si256(&r2);\ + q3 = _mm256_load_si256(&p2);\ + +#define MIXTON768(r0,r1,r2,r3,s0,s1,s2,p0,p1,p2,p3,q0,q1,q2)\ + s0 = _mm256_load_si256(&r0);\ + q0 = _mm256_load_si256(&p0);\ + s1 = _mm256_load_si256(&r2);\ + q1 = _mm256_load_si256(&p2);\ + r0 = _mm256_unpackhi_epi32(r0,r1);\ + p0 = _mm256_unpackhi_epi32(p0,p1);\ + r2 = _mm256_unpackhi_epi32(r2,r3);\ + p2 = _mm256_unpackhi_epi32(p2,p3);\ + s0 = _mm256_unpacklo_epi32(s0,r1);\ + q0 = _mm256_unpacklo_epi32(q0,p1);\ + s1 = _mm256_unpacklo_epi32(s1,r3);\ + q1 = _mm256_unpacklo_epi32(q1,p3);\ + r1 = _mm256_load_si256(&r0);\ + p1 = _mm256_load_si256(&p0);\ + r0 = _mm256_unpackhi_epi64(r0,r2);\ + p0 = _mm256_unpackhi_epi64(p0,p2);\ + s0 = _mm256_unpackhi_epi64(s0,s1);\ + q0 = _mm256_unpackhi_epi64(q0,q1);\ + r1 = _mm256_unpacklo_epi64(r1,r2);\ + p1 = _mm256_unpacklo_epi64(p1,p2);\ + s2 = _mm256_load_si256(&r0);\ + q2 = _mm256_load_si256(&p0);\ + s1 = _mm256_load_si256(&r1);\ + q1 = _mm256_load_si256(&p1);\ + +#define NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ + s1 = _mm256_load_si256(&r3);\ + q1 = _mm256_load_si256(&p3);\ + s3 = _mm256_load_si256(&r3);\ + q3 = _mm256_load_si256(&p3);\ + s1 = _mm256_unpackhi_epi32(s1,r2);\ + q1 = _mm256_unpackhi_epi32(q1,p2);\ + s3 = _mm256_unpacklo_epi32(s3,r2);\ + q3 = _mm256_unpacklo_epi32(q3,p2);\ + s0 = _mm256_load_si256(&s1);\ + q0 = _mm256_load_si256(&q1);\ + s2 = _mm256_load_si256(&s3);\ + q2 = _mm256_load_si256(&q3);\ + r3 = _mm256_load_si256(&r1);\ + p3 = _mm256_load_si256(&p1);\ + r1 = _mm256_unpacklo_epi32(r1,r0);\ + p1 = _mm256_unpacklo_epi32(p1,p0);\ + r3 = _mm256_unpackhi_epi32(r3,r0);\ + p3 = _mm256_unpackhi_epi32(p3,p0);\ + s0 = _mm256_unpackhi_epi64(s0,r3);\ + q0 = _mm256_unpackhi_epi64(q0,p3);\ + s1 = _mm256_unpacklo_epi64(s1,r3);\ + q1 = _mm256_unpacklo_epi64(q1,p3);\ + s2 = _mm256_unpackhi_epi64(s2,r1);\ + q2 = _mm256_unpackhi_epi64(q2,p1);\ + s3 = _mm256_unpacklo_epi64(s3,r1);\ + q3 = _mm256_unpacklo_epi64(q3,p1); + +#define MIXTON1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3)\ + NMLTOM1024(r0,r1,r2,r3,s0,s1,s2,s3,p0,p1,p2,p3,q0,q1,q2,q3); + +/* initial values of chaining variables */ +static const uint32 IV[40] __attribute((aligned(32))) = { + 0xdbf78465,0x4eaa6fb4,0x44b051e0,0x6d251e69, + 0xdef610bb,0xee058139,0x90152df4,0x6e292011, + 0xde099fa3,0x70eee9a0,0xd9d2f256,0xc3b44b95, + 0x746cd581,0xcf1ccf0e,0x8fc944b3,0x5d9b0557, + 0xad659c05,0x04016ce5,0x5dba5781,0xf7efc89d, + 0x8b264ae7,0x24aa230a,0x666d1836,0x0306194f, + 0x204b1f67,0xe571f7d7,0x36d79cce,0x858075d5, + 0x7cde72ce,0x14bcb808,0x57e9e923,0x35870c6a, + 0xaffb4363,0xc825b7c7,0x5ec41e22,0x6c68e9be, + 0x03e86cea,0xb07224cc,0x0fc688f1,0xf5df3999 +}; + +/* Round Constants */ +static const uint32 CNS_INIT[128] __attribute((aligned(32))) = { + 0xb213afa5,0xfc20d9d2,0xb6de10ed,0x303994a6, + 0xe028c9bf,0xe25e72c1,0x01685f3d,0xe0337818, + 0xc84ebe95,0x34552e25,0x70f47aae,0xc0e65299, + 0x44756f91,0xe623bb72,0x05a17cf4,0x441ba90d, + 0x4e608a22,0x7ad8818f,0x0707a3d4,0x6cc33a12, + 0x7e8fce32,0x5c58a4a4,0xbd09caca,0x7f34d442, + 0x56d858fe,0x8438764a,0x1c1e8f51,0xdc56983e, + 0x956548be,0x1e38e2e7,0xf4272b28,0x9389217f, + 0x343b138f,0xbb6de032,0x707a3d45,0x1e00108f, + 0xfe191be2,0x78e38b9d,0x144ae5cc,0xe5a8bce6, + 0xd0ec4e3d,0xedb780c8,0xaeb28562,0x7800423d, + 
0x3cb226e5,0x27586719,0xfaa7ae2b,0x5274baf4, + 0x2ceb4882,0xd9847356,0xbaca1589,0x8f5b7882, + 0x5944a28e,0x36eda57f,0x2e48f1c1,0x26889ba7, + 0xb3ad2208,0xa2c78434,0x40a46f3e,0x96e1db12, + 0xa1c4c355,0x703aace7,0xb923c704,0x9a226e9d, + 0x00000000,0x00000000,0x00000000,0xf0d2e9e3, + 0x00000000,0x00000000,0x00000000,0x5090d577, + 0x00000000,0x00000000,0x00000000,0xac11d7fa, + 0x00000000,0x00000000,0x00000000,0x2d1925ab, + 0x00000000,0x00000000,0x00000000,0x1bcb66f2, + 0x00000000,0x00000000,0x00000000,0xb46496ac, + 0x00000000,0x00000000,0x00000000,0x6f2d9bc9, + 0x00000000,0x00000000,0x00000000,0xd1925ab0, + 0x00000000,0x00000000,0x00000000,0x78602649, + 0x00000000,0x00000000,0x00000000,0x29131ab6, + 0x00000000,0x00000000,0x00000000,0x8edae952, + 0x00000000,0x00000000,0x00000000,0x0fc053c3, + 0x00000000,0x00000000,0x00000000,0x3b6ba548, + 0x00000000,0x00000000,0x00000000,0x3f014f0c, + 0x00000000,0x00000000,0x00000000,0xedae9520, + 0x00000000,0x00000000,0x00000000,0xfc053c31 +}; + +__m256i CNS[32]; + +/***************************************************/ +/* Round function */ +/* state: hash context */ + +static void rnd512_2way( luffa_2way_context *state, __m256i msg1, __m256i msg0 ) +{ + __m256i t[2]; + __m256i *chainv = state->chainv; + __m256i tmp[2]; + __m256i x[8]; + + t[0] = chainv[0]; + t[1] = chainv[1]; + + t[0] = _mm256_xor_si256( t[0], chainv[2] ); + t[1] = _mm256_xor_si256( t[1], chainv[3] ); + t[0] = _mm256_xor_si256( t[0], chainv[4] ); + t[1] = _mm256_xor_si256( t[1], chainv[5] ); + t[0] = _mm256_xor_si256( t[0], chainv[6] ); + t[1] = _mm256_xor_si256( t[1], chainv[7] ); + t[0] = _mm256_xor_si256( t[0], chainv[8] ); + t[1] = _mm256_xor_si256( t[1], chainv[9] ); + + MULT2( t[0], t[1] ); + + msg0 = _mm256_shuffle_epi32( msg0, 27 ); + msg1 = _mm256_shuffle_epi32( msg1, 27 ); + + chainv[0] = _mm256_xor_si256( chainv[0], t[0] ); + chainv[1] = _mm256_xor_si256( chainv[1], t[1] ); + chainv[2] = _mm256_xor_si256( chainv[2], t[0] ); + chainv[3] = _mm256_xor_si256( chainv[3], t[1] ); + chainv[4] = _mm256_xor_si256( chainv[4], t[0] ); + chainv[5] = _mm256_xor_si256( chainv[5], t[1] ); + chainv[6] = _mm256_xor_si256( chainv[6], t[0] ); + chainv[7] = _mm256_xor_si256( chainv[7], t[1] ); + chainv[8] = _mm256_xor_si256( chainv[8], t[0] ); + chainv[9] = _mm256_xor_si256( chainv[9], t[1] ); + + t[0] = chainv[0]; + t[1] = chainv[1]; + + MULT2( chainv[0], chainv[1]); + chainv[0] = _mm256_xor_si256( chainv[0], chainv[2] ); + chainv[1] = _mm256_xor_si256( chainv[1], chainv[3] ); + + MULT2( chainv[2], chainv[3]); + chainv[2] = _mm256_xor_si256(chainv[2], chainv[4]); + chainv[3] = _mm256_xor_si256(chainv[3], chainv[5]); + + MULT2( chainv[4], chainv[5]); + chainv[4] = _mm256_xor_si256(chainv[4], chainv[6]); + chainv[5] = _mm256_xor_si256(chainv[5], chainv[7]); + + MULT2( chainv[6], chainv[7]); + chainv[6] = _mm256_xor_si256(chainv[6], chainv[8]); + chainv[7] = _mm256_xor_si256(chainv[7], chainv[9]); + + MULT2( chainv[8], chainv[9]); + chainv[8] = _mm256_xor_si256( chainv[8], t[0] ); + chainv[9] = _mm256_xor_si256( chainv[9], t[1] ); + + t[0] = chainv[8]; + t[1] = chainv[9]; + + MULT2( chainv[8], chainv[9]); + chainv[8] = _mm256_xor_si256( chainv[8], chainv[6] ); + chainv[9] = _mm256_xor_si256( chainv[9], chainv[7] ); + + MULT2( chainv[6], chainv[7]); + chainv[6] = _mm256_xor_si256( chainv[6], chainv[4] ); + chainv[7] = _mm256_xor_si256( chainv[7], chainv[5] ); + + MULT2( chainv[4], chainv[5]); + chainv[4] = _mm256_xor_si256( chainv[4], chainv[2] ); + chainv[5] = _mm256_xor_si256( chainv[5], chainv[3] ); + 
+ MULT2( chainv[2], chainv[3] ); + chainv[2] = _mm256_xor_si256( chainv[2], chainv[0] ); + chainv[3] = _mm256_xor_si256( chainv[3], chainv[1] ); + + MULT2( chainv[0], chainv[1] ); + chainv[0] = _mm256_xor_si256( _mm256_xor_si256( chainv[0], t[0] ), msg0 ); + chainv[1] = _mm256_xor_si256( _mm256_xor_si256( chainv[1], t[1] ), msg1 ); + + MULT2( msg0, msg1); + chainv[2] = _mm256_xor_si256( chainv[2], msg0 ); + chainv[3] = _mm256_xor_si256( chainv[3], msg1 ); + + MULT2( msg0, msg1); + chainv[4] = _mm256_xor_si256( chainv[4], msg0 ); + chainv[5] = _mm256_xor_si256( chainv[5], msg1 ); + + MULT2( msg0, msg1); + chainv[6] = _mm256_xor_si256( chainv[6], msg0 ); + chainv[7] = _mm256_xor_si256( chainv[7], msg1 ); + + MULT2( msg0, msg1); + chainv[8] = _mm256_xor_si256( chainv[8], msg0 ); + chainv[9] = _mm256_xor_si256( chainv[9], msg1 ); + + MULT2( msg0, msg1); + + chainv[3] = _mm256_or_si256( _mm256_slli_epi32( chainv[3], 1 ), + _mm256_srli_epi32( chainv[3], 31 ) ); + chainv[5] = _mm256_or_si256( _mm256_slli_epi32( chainv[5], 2 ), + _mm256_srli_epi32( chainv[5], 30 ) ); + chainv[7] = _mm256_or_si256( _mm256_slli_epi32( chainv[7], 3 ), + _mm256_srli_epi32( chainv[7], 29 ) ); + chainv[9] = _mm256_or_si256( _mm256_slli_epi32( chainv[9], 4 ), + _mm256_srli_epi32( chainv[9], 28 ) ); + + + NMLTOM1024( chainv[0], chainv[2], chainv[4], chainv[6], + x[0], x[1], x[2], x[3], + chainv[1],chainv[3],chainv[5],chainv[7], + x[4], x[5], x[6], x[7] ); + + STEP_PART( &x[0], &CNS[ 0], &tmp[0] ); + STEP_PART( &x[0], &CNS[ 2], &tmp[0] ); + STEP_PART( &x[0], &CNS[ 4], &tmp[0] ); + STEP_PART( &x[0], &CNS[ 6], &tmp[0] ); + STEP_PART( &x[0], &CNS[ 8], &tmp[0] ); + STEP_PART( &x[0], &CNS[10], &tmp[0] ); + STEP_PART( &x[0], &CNS[12], &tmp[0] ); + STEP_PART( &x[0], &CNS[14], &tmp[0] ); + + MIXTON1024( x[0], x[1], x[2], x[3], + chainv[0], chainv[2], chainv[4],chainv[6], + x[4], x[5], x[6], x[7], + chainv[1],chainv[3],chainv[5],chainv[7]); + + /* Process last 256-bit block */ + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[16], CNS[17], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[18], CNS[19], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[20], CNS[21], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[22], CNS[23], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[24], CNS[25], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[26], CNS[27], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[28], CNS[29], + tmp[0], tmp[1] ); + STEP_PART2( chainv[8], chainv[9], t[0], t[1], CNS[30], CNS[31], + tmp[0], tmp[1] ); +} + + +/***************************************************/ +/* Finalization function */ +/* state: hash context */ +/* b[8]: hash values */ + +static void finalization512_2way( luffa_2way_context *state, uint32 *b ) +{ + uint32 hash[8] __attribute((aligned(64))); + __m256i* chainv = state->chainv; + __m256i t[2]; + + /*---- blank round with m=0 ----*/ + rnd512_2way( state, m256_zero, m256_zero ); + + t[0] = chainv[0]; + t[1] = chainv[1]; + + t[0] = _mm256_xor_si256( t[0], chainv[2] ); + t[1] = _mm256_xor_si256( t[1], chainv[3] ); + t[0] = _mm256_xor_si256( t[0], chainv[4] ); + t[1] = _mm256_xor_si256( t[1], chainv[5] ); + t[0] = _mm256_xor_si256( t[0], chainv[6] ); + t[1] = _mm256_xor_si256( t[1], chainv[7] ); + t[0] = _mm256_xor_si256( t[0], chainv[8] ); + t[1] = _mm256_xor_si256( t[1], chainv[9] ); + + t[0] = _mm256_shuffle_epi32( t[0], 27 ); + t[1] = _mm256_shuffle_epi32( t[1], 
27 ); + + _mm256_store_si256( (__m256i*)&hash[0], t[0] ); + _mm256_store_si256( (__m256i*)&hash[8], t[1] ); + + casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); + casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); + + rnd512_2way( state, m256_zero, m256_zero ); + + t[0] = chainv[0]; + t[1] = chainv[1]; + t[0] = _mm256_xor_si256( t[0], chainv[2] ); + t[1] = _mm256_xor_si256( t[1], chainv[3] ); + t[0] = _mm256_xor_si256( t[0], chainv[4] ); + t[1] = _mm256_xor_si256( t[1], chainv[5] ); + t[0] = _mm256_xor_si256( t[0], chainv[6] ); + t[1] = _mm256_xor_si256( t[1], chainv[7] ); + t[0] = _mm256_xor_si256( t[0], chainv[8] ); + t[1] = _mm256_xor_si256( t[1], chainv[9] ); + + t[0] = _mm256_shuffle_epi32( t[0], 27 ); + t[1] = _mm256_shuffle_epi32( t[1], 27 ); + + _mm256_store_si256( (__m256i*)&hash[0], t[0] ); + _mm256_store_si256( (__m256i*)&hash[8], t[1] ); + + casti_m256i( b, 2 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); + casti_m256i( b, 3 ) = mm256_bswap_32( casti_m256i( hash, 1 ) ); +} + +int luffa_2way_init( luffa_2way_context *state, int hashbitlen ) +{ + int i; + state->hashbitlen = hashbitlen; + + for ( i=0; i<32; i++ ) CNS[i] = + _mm256_set_epi32( CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ], + CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ], + CNS_INIT[ (i<<2) + 3 ], CNS_INIT[ (i<<2) +2 ], + CNS_INIT[ (i<<2) + 1 ], CNS_INIT[ (i<<2) ] ); + + for ( i=0; i<10; i++ ) state->chainv[i] = + _mm256_set_epi32( IV[ (i<<2) +3 ], IV[ (i<<2) +2 ], + IV[ (i<<2) +1 ], IV[ (i<<2) ], + IV[ (i<<2) +3 ], IV[ (i<<2) +2 ], + IV[ (i<<2) +1 ], IV[ (i<<2) ] ); + + ((__m256i*)state->buffer)[0] = m256_zero; + ((__m256i*)state->buffer)[1] = m256_zero; + + return 0; +} + +// Do not call luffa_update_close after having called luffa_update. +// Once luffa_update has been called only call luffa_update or luffa_close. 
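
A minimal usage sketch (not part of the patch) of the two supported call
sequences, assuming two 80-byte inputs already interleaved 2x128 the way
scanhash_qubit_2way prepares them with mm256_interleave_2x128; names and
buffer sizes are illustrative:

   #include <stdint.h>
   #include "algo/luffa/luffa-hash-2way.h"

   // data: two 80-byte messages interleaved 2x128 (160 bytes total).
   // hash: two 64-byte digests, also 2x128 interleaved (128 bytes total),
   //       to be split afterwards with mm256_deinterleave_2x128.
   void luffa512_2way_example( uint32_t hash[32], const uint32_t data[40] )
   {
      luffa_2way_context ctx;

      // One-shot path: init followed only by update_close.
      luffa_2way_init( &ctx, 512 );
      luffa_2way_update_close( &ctx, hash, data, 80 );

      // Streaming path: once luffa_2way_update has been used, finish with
      // luffa_2way_close. The miner hashes the first 64 bytes once as a
      // reusable midstate, then feeds the final 16 bytes for each nonce.
      luffa_2way_init( &ctx, 512 );
      luffa_2way_update( &ctx, data, 64 );        // midstate, first 64 bytes per lane
      luffa_2way_update( &ctx, data + 32, 16 );   // last 16 bytes per lane
      luffa_2way_close( &ctx, hash );
   }
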
+int luffa_2way_update( luffa_2way_context *state, const void *data, + size_t len ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buffer = (__m256i*)state->buffer; + int i; + int blocks = (int)len / 32; + state-> rembytes = (int)len % 32; + + // full blocks + for ( i = 0; i < blocks; i++, vdata+=2 ) + { + rnd512_2way( state, mm256_bswap_32( vdata[1] ) , + mm256_bswap_32( vdata[0] ) ); + } + + // 16 byte partial block exists for 80 byte len + // store in buffer for transform in final for midstate to work + if ( state->rembytes ) + { + // remaining data bytes + buffer[0] = mm256_bswap_32( vdata[0] ); + buffer[1] = _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); + } + return 0; +} + +int luffa_2way_close( luffa_2way_context *state, void *hashval ) +{ + __m256i *buffer = (__m256i*)state->buffer; + + // transform pad block + if ( state->rembytes ) + // not empty, data is in buffer + rnd512_2way( state, buffer[1], buffer[0] ); + else + // empty pad block, constant data + rnd512_2way( state, m256_zero, + _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); + + finalization512_2way( state, (uint32*)hashval ); + + if ( state->hashbitlen > 512 ) + finalization512_2way( state, (uint32*)( hashval+128 ) ); + return 0; +} + +int luffa_2way_update_close( luffa_2way_context *state, + void *output, const void *data, size_t inlen ) +{ +// Optimized for integrals of 16 bytes, good for 64 and 80 byte len + __m256i *vdata = (__m256i*)data; + int i; + int blocks = (int)( inlen / 32 ); + state->rembytes = inlen % 32; + + // full blocks + for ( i = 0; i < blocks; i++, vdata+=2 ) + rnd512_2way( state, mm256_bswap_32( vdata[1] ), + mm256_bswap_32( vdata[0] ) ); + + // 16 byte partial block exists for 80 byte len + if ( state->rembytes ) + // padding of partial block + rnd512_2way( state, + _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), + mm256_bswap_32( vdata[0] ) ); + else + // empty pad block + rnd512_2way( state, m256_zero, + _mm256_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0, + 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ) ); + + finalization512_2way( state, (uint32*)output ); + if ( state->hashbitlen > 512 ) + finalization512_2way( state, (uint32*)( output+128 ) ); + + return 0; +} + +#endif diff --git a/algo/luffa/luffa-hash-2way.h b/algo/luffa/luffa-hash-2way.h new file mode 100644 index 0000000..4ce84eb --- /dev/null +++ b/algo/luffa/luffa-hash-2way.h @@ -0,0 +1,69 @@ +#if !defined(LUFFA_HASH_2WAY_H__) +#define LUFFA_HASH_2WAY_H__ 1 +/* + * luffa_for_sse2.h + * Version 2.0 (Sep 15th 2009) + * + * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. + * + * Hitachi, Ltd. is the owner of this software and hereby grant + * the U.S. Government and any interested party the right to use + * this software for the purposes of the SHA-3 evaluation process, + * notwithstanding that this software is copyrighted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#if defined(__AVX2__) + +#include +#include "algo/sha/sha3-defs.h" +#include "avxdefs.h" + +/* The length of digests*/ +#define DIGEST_BIT_LEN_224 224 +#define DIGEST_BIT_LEN_256 256 +#define DIGEST_BIT_LEN_384 384 +#define DIGEST_BIT_LEN_512 512 + +/*********************************/ +/* The parameters of Luffa */ +#define MSG_BLOCK_BIT_LEN 256 /*The bit length of a message block*/ +#define MSG_BLOCK_BYTE_LEN (MSG_BLOCK_BIT_LEN >> 3) /* The byte length + * of a message block*/ + +/* The number of blocks in Luffa */ +#define WIDTH_224 3 +#define WIDTH_256 3 +#define WIDTH_384 4 +#define WIDTH_512 5 + +/* The limit of the length of message */ +#define LIMIT_224 64 +#define LIMIT_256 64 +#define LIMIT_384 128 +#define LIMIT_512 128 +/*********************************/ + +typedef struct { + uint32 buffer[8*2] __attribute((aligned(64))); + __m256i chainv[10] __attribute((aligned(32))); /* Chaining values */ + int hashbitlen; + int rembytes; +} luffa_2way_context; + +int luffa_2way_init( luffa_2way_context *state, int hashbitlen ); +int luffa_2way_update( luffa_2way_context *state, const void *data, + size_t len ); +int luffa_2way_close( luffa_2way_context *state, void *hashval ); +int luffa_2way_update_close( luffa_2way_context *state, void *output, + const void *data, size_t inlen ); + +#endif +#endif diff --git a/algo/luffa/sse2/luffa_for_sse2.c b/algo/luffa/luffa_for_sse2.c similarity index 96% rename from algo/luffa/sse2/luffa_for_sse2.c rename to algo/luffa/luffa_for_sse2.c index 12024f8..5491aa6 100644 --- a/algo/luffa/sse2/luffa_for_sse2.c +++ b/algo/luffa/luffa_for_sse2.c @@ -272,8 +272,8 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ), - mm_byteswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ), + mm_bswap_32( casti_m128i( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } @@ -282,7 +282,7 @@ HashReturn update_luffa( hashState_luffa *state, const BitSequence *data, if ( state->rembytes ) { // remaining data bytes - casti_m128i( state->buffer, 0 ) = mm_byteswap_32( cast_m128i( data ) ); + casti_m128i( state->buffer, 0 ) = mm_bswap_32( cast_m128i( data ) ); // padding of partial block casti_m128i( state->buffer, 1 ) = _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ); @@ -324,8 +324,8 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, // full blocks for ( i = 0; i < blocks; i++ ) { - rnd512( state, mm_byteswap_32( casti_m128i( data, 1 ) ), - mm_byteswap_32( casti_m128i( data, 0 ) ) ); + rnd512( state, mm_bswap_32( casti_m128i( data, 1 ) ), + mm_bswap_32( casti_m128i( data, 0 ) ) ); data += MSG_BLOCK_BYTE_LEN; } @@ -334,7 +334,7 @@ HashReturn update_and_final_luffa( hashState_luffa *state, BitSequence* output, { // padding of partial block rnd512( state, _mm_set_epi8( 0,0,0,0, 0,0,0,0, 0,0,0,0, 0x80,0,0,0 ), - mm_byteswap_32( cast_m128i( data ) ) ); + mm_bswap_32( cast_m128i( data ) ) ); } else { @@ -542,7 +542,7 @@ static void finalization512( hashState_luffa *state, uint32 *b ) _mm256_store_si256( (__m256i*)hash, t ); - casti_m256i( b, 0 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) ); + casti_m256i( b, 0 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); rnd512( state, zero, zero ); @@ -555,7 +555,7 @@ static void finalization512( hashState_luffa *state, uint32 *b ) _mm256_store_si256( (__m256i*)hash, t ); - casti_m256i( b, 1 ) = mm256_byteswap_32( casti_m256i( hash, 0 ) ); + 
casti_m256i( b, 1 ) = mm256_bswap_32( casti_m256i( hash, 0 ) ); } #else @@ -587,8 +587,8 @@ static void finalization512( hashState_luffa *state, uint32 *b ) _mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[4], t[1]); - casti_m128i( b, 0 ) = mm_byteswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 1 ) = mm_byteswap_32( casti_m128i( hash, 1 ) ); + casti_m128i( b, 0 ) = mm_bswap_32( casti_m128i( hash, 0 ) ); + casti_m128i( b, 1 ) = mm_bswap_32( casti_m128i( hash, 1 ) ); rnd512( state, zero, zero ); @@ -609,8 +609,8 @@ static void finalization512( hashState_luffa *state, uint32 *b ) _mm_store_si128((__m128i*)&hash[0], t[0]); _mm_store_si128((__m128i*)&hash[4], t[1]); - casti_m128i( b, 2 ) = mm_byteswap_32( casti_m128i( hash, 0 ) ); - casti_m128i( b, 3 ) = mm_byteswap_32( casti_m128i( hash, 1 ) ); + casti_m128i( b, 2 ) = mm_bswap_32( casti_m128i( hash, 0 ) ); + casti_m128i( b, 3 ) = mm_bswap_32( casti_m128i( hash, 1 ) ); } #endif diff --git a/algo/luffa/sse2/luffa_for_sse2.h b/algo/luffa/luffa_for_sse2.h similarity index 100% rename from algo/luffa/sse2/luffa_for_sse2.h rename to algo/luffa/luffa_for_sse2.h diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c index e6678bc..77a8411 100644 --- a/algo/quark/anime-4way.c +++ b/algo/quark/anime-4way.c @@ -60,7 +60,7 @@ void anime_4way_hash( void *state, const void *input ) blake512_4way_close( &ctx.blake, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, @@ -97,7 +97,7 @@ void anime_4way_hash( void *state, const void *input ) jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, 64 ); @@ -118,7 +118,7 @@ void anime_4way_hash( void *state, const void *input ) skein512_4way_close( &ctx.skein, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, 64 ); diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 1e6aecc..09a4abb 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -60,7 +60,7 @@ void quark_4way_hash( void *state, const void *input ) bmw512_4way_close( &ctx.bmw, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); update_and_final_groestl( &ctx.groestl, (char*)hash0, @@ -97,7 +97,7 @@ void quark_4way_hash( void *state, const void *input ) jh512_4way_close( &ctx.jh, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); blake512_4way_init( &ctx.blake ); blake512_4way( &ctx.blake, vhash, 64 ); @@ -118,7 +118,7 @@ void quark_4way_hash( void *state, const void *input ) skein512_4way_close( &ctx.skein, vhash ); vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), - mm256_zero ); + m256_zero ); keccak512_4way_init( &ctx.keccak ); keccak512_4way( &ctx.keccak, vhash, 64 ); diff --git a/algo/qubit/deep-2way.c b/algo/qubit/deep-2way.c new file mode 100644 index 0000000..b912e47 --- /dev/null +++ b/algo/qubit/deep-2way.c @@ -0,0 +1,130 @@ +#include "deep-gate.h" + +#if defined(DEEP_2WAY) + +#include +#include +#include +#include +#include 
"algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct +{ + luffa_2way_context luffa; + cubehashParam cube; + sph_shavite512_context shavite; + hashState_echo echo; +} deep_2way_ctx_holder; + +deep_2way_ctx_holder deep_2way_ctx; + +void init_deep_2way_ctx() +{ + luffa_2way_init( &deep_2way_ctx.luffa, 512 ); + cubehashInit(&deep_2way_ctx.cube,512,16,32); + sph_shavite512_init(&deep_2way_ctx.shavite); + init_echo(&deep_2way_ctx.echo, 512); +}; + +void deep_2way_hash( void *output, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*2] __attribute__ ((aligned (64))); + deep_2way_ctx_holder ctx; + + memcpy( &ctx, &deep_2way_ctx, sizeof(deep_2way_ctx) ); + luffa_2way_update( &ctx.luffa, input + (64<<1), 16 ); + luffa_2way_close( &ctx.luffa, vhash ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &deep_2way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &deep_2way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &deep_2way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); +} + +int scanhash_deep_2way( int thr_id, struct work *work,uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3 + uint32_t *noncep1 = vdata + 32+7; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 ); + + luffa_2way_init( &deep_2way_ctx.luffa, 512 ); + luffa_2way_update( &deep_2way_ctx.luffa, vdata, 64 ); + + for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + deep_2way_hash( hash, vdata ); + pdata[19] = n; + + if ( !( hash[7] & mask ) && fulltest( hash, ptarget) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+64 ); + } + n += 2; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && 
!work_restart[thr_id].restart ); + break; + } + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/qubit/deep-gate.c b/algo/qubit/deep-gate.c new file mode 100644 index 0000000..c91655f --- /dev/null +++ b/algo/qubit/deep-gate.c @@ -0,0 +1,17 @@ +#include "deep-gate.h" + +bool register_deep_algo( algo_gate_t* gate ) +{ +#if defined (DEEP_2WAY) + init_deep_2way_ctx(); + gate->scanhash = (void*)&scanhash_deep_2way; + gate->hash = (void*)&deep_2way_hash; +#else + init_deep_ctx(); + gate->scanhash = (void*)&scanhash_deep; + gate->hash = (void*)&deep_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + return true; +}; + diff --git a/algo/qubit/deep-gate.h b/algo/qubit/deep-gate.h new file mode 100644 index 0000000..b91f968 --- /dev/null +++ b/algo/qubit/deep-gate.h @@ -0,0 +1,32 @@ +#ifndef DEEP_GATE_H__ +#define DEEP_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__AVX2__) && defined(__AES__) + #define DEEP_2WAY +#endif + +bool register_deep_algo( algo_gate_t* gate ); + +#if defined(DEEP_2WAY) + +void deep_2way_hash( void *state, const void *input ); + +int scanhash_deep_2way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_deep_2way_ctx(); + +#endif + +void deep_hash( void *state, const void *input ); + +int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_deep_ctx(); + +#endif + diff --git a/algo/qubit/deep.c b/algo/qubit/deep.c index 6c82aad..eaa4b85 100644 --- a/algo/qubit/deep.c +++ b/algo/qubit/deep.c @@ -1,9 +1,9 @@ -#include "algo-gate-api.h" +#include "deep-gate.h" #include #include #include #include -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #ifndef NO_AES_NI #include "algo/echo/aes_ni/hash_api.h" @@ -139,12 +139,3 @@ int scanhash_deep( int thr_id, struct work *work, uint32_t max_nonce, return 0; } -bool register_deep_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_deep_ctx(); - gate->scanhash = (void*)&scanhash_deep; - gate->hash = (void*)&deep_hash; - return true; -}; - diff --git a/algo/qubit/qubit-2way.c b/algo/qubit/qubit-2way.c new file mode 100644 index 0000000..537f0ce --- /dev/null +++ b/algo/qubit/qubit-2way.c @@ -0,0 +1,138 @@ +#include "qubit-gate.h" + +#if defined(QUBIT_2WAY) + +#include +#include +#include +#include +#include "algo/luffa/luffa-hash-2way.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/simd/simd-hash-2way.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/echo/aes_ni/hash_api.h" + +typedef struct +{ + luffa_2way_context luffa; + cubehashParam cube; + sph_shavite512_context shavite; + simd_2way_context simd; + hashState_echo echo; +} qubit_2way_ctx_holder; + +qubit_2way_ctx_holder qubit_2way_ctx; + +void init_qubit_2way_ctx() +{ + luffa_2way_init( &qubit_2way_ctx.luffa, 512 ); + cubehashInit(&qubit_2way_ctx.cube,512,16,32); + sph_shavite512_init(&qubit_2way_ctx.shavite); + simd_2way_init( &qubit_2way_ctx.simd, 512 ); + init_echo(&qubit_2way_ctx.echo, 512); +}; + +void qubit_2way_hash( void *output, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*2] __attribute__ ((aligned (64))); + qubit_2way_ctx_holder ctx; + + memcpy( &ctx, &qubit_2way_ctx, sizeof(qubit_2way_ctx) ); + luffa_2way_update( &ctx.luffa, input + (64<<1), 
16 ); + luffa_2way_close( &ctx.luffa, vhash ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + + cubehashUpdateDigest( &ctx.cube, (byte*)hash0, + (const byte*) hash0, 64 ); + memcpy( &ctx.cube, &qubit_2way_ctx.cube, sizeof(cubehashParam) ); + cubehashUpdateDigest( &ctx.cube, (byte*)hash1, (const byte*) hash1, 64 ); + + sph_shavite512( &ctx.shavite, hash0, 64 ); + sph_shavite512_close( &ctx.shavite, hash0 ); + memcpy( &ctx.shavite, &qubit_2way_ctx.shavite, + sizeof(sph_shavite512_context) ); + sph_shavite512( &ctx.shavite, hash1, 64 ); + sph_shavite512_close( &ctx.shavite, hash1 ); + + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + + update_final_echo( &ctx.echo, (BitSequence *)hash0, + (const BitSequence *) hash0, 512 ); + memcpy( &ctx.echo, &qubit_2way_ctx.echo, sizeof(hashState_echo) ); + update_final_echo( &ctx.echo, (BitSequence *)hash1, + (const BitSequence *) hash1, 512 ); + + memcpy( output, hash0, 32 ); + memcpy( output+32, hash1, 32 ); +} + +int scanhash_qubit_2way( int thr_id, struct work *work,uint32_t max_nonce, + uint64_t *hashes_done ) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 32+3; // 4*8 + 3 + uint32_t *noncep1 = vdata + 32+7; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { 0, 0xF, 0xFF, + 0xFFF, 0xFFFF, 0x10000000 }; + uint32_t masks[] = { 0xFFFFFFFF, 0xFFFFFFF0, 0xFFFFFF00, + 0xFFFFF000, 0xFFFF0000, 0 }; + + // big endian encode 0..18 uint32_t, 64 bits at a time + swab32_array( endiandata, pdata, 20 ); + + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_2x128( (uint64_t*)vdata, edata, edata, 640 ); + + luffa_2way_init( &qubit_2way_ctx.luffa, 512 ); + luffa_2way_update( &qubit_2way_ctx.luffa, vdata, 64 ); + + for ( int m=0; m < 6; m++ ) if ( Htarg <= htmax[m] ) + { + uint32_t mask = masks[m]; + do + { + found[0] = found[1] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + qubit_2way_hash( hash, vdata ); + pdata[19] = n; + + if ( !( hash[7] & mask ) && fulltest( hash, ptarget) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( !( (hash+8)[7] & mask ) && fulltest( hash+8, ptarget) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash+8 ); + } + n += 2; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/qubit/qubit-gate.c b/algo/qubit/qubit-gate.c new file mode 100644 index 0000000..e0e23bb --- /dev/null +++ b/algo/qubit/qubit-gate.c @@ -0,0 +1,17 @@ +#include "qubit-gate.h" + +bool register_qubit_algo( algo_gate_t* gate ) +{ +#if defined (QUBIT_2WAY) + init_qubit_2way_ctx(); + gate->scanhash = (void*)&scanhash_qubit_2way; + gate->hash = (void*)&qubit_2way_hash; +#else + init_qubit_ctx(); + gate->scanhash = (void*)&scanhash_qubit; + gate->hash = (void*)&qubit_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + return true; +}; + diff --git a/algo/qubit/qubit-gate.h 
b/algo/qubit/qubit-gate.h new file mode 100644 index 0000000..953c1cb --- /dev/null +++ b/algo/qubit/qubit-gate.h @@ -0,0 +1,32 @@ +#ifndef QUBIT_GATE_H__ +#define QUBIT_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__AVX2__) && defined(__AES__) + #define QUBIT_2WAY +#endif + +bool register_qubit_algo( algo_gate_t* gate ); + +#if defined(QUBIT_2WAY) + +void qubit_2way_hash( void *state, const void *input ); + +int scanhash_qubit_2way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_qubit_2way_ctx(); + +#endif + +void qubit_hash( void *state, const void *input ); + +int scanhash_qubit( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_qubit_ctx(); + +#endif + diff --git a/algo/qubit/qubit.c b/algo/qubit/qubit.c index 4310f9a..bc71cf0 100644 --- a/algo/qubit/qubit.c +++ b/algo/qubit/qubit.c @@ -1,11 +1,11 @@ -#include "algo-gate-api.h" +#include "qubit-gate.h" #include #include #include #include -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/shavite/sph_shavite.h" #ifndef NO_AES_NI #include "algo/echo/aes_ni/hash_api.h" @@ -48,7 +48,7 @@ void qubit_luffa_midstate( const void* input ) update_luffa( &qubit_luffa_mid, input, 64 ); } -void qubithash(void *output, const void *input) +void qubit_hash(void *output, const void *input) { unsigned char hash[128] __attribute((aligned(64))); #define hashB hash+64 @@ -115,7 +115,7 @@ int scanhash_qubit(int thr_id, struct work *work, { pdata[19] = ++n; be32enc(&endiandata[19], n); - qubithash(hash64, endiandata); + qubit_hash(hash64, endiandata); #ifndef DEBUG_ALGO if (!(hash64[7] & mask)) { @@ -151,12 +151,3 @@ int scanhash_qubit(int thr_id, struct work *work, return 0; } -bool register_qubit_algo( algo_gate_t* gate ) -{ - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; - init_qubit_ctx(); - gate->scanhash = (void*)&scanhash_qubit; - gate->hash = (void*)&qubithash; - return true; -}; - diff --git a/algo/scrypt.c b/algo/scrypt.c index 0e268e7..369bcd5 100644 --- a/algo/scrypt.c +++ b/algo/scrypt.c @@ -778,6 +778,7 @@ bool scrypt_miner_thread_init( int thr_id ) bool register_scrypt_algo( algo_gate_t* gate ) { + gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT; gate->miner_thread_init =(void*)&scrypt_miner_thread_init; gate->scanhash = (void*)&scanhash_scrypt; // gate->hash = (void*)&scrypt_1024_1_1_256_24way; diff --git a/algo/sha/md-helper-4way.c b/algo/sha/md-helper-4way.c index 8ffac8e..eb5c05c 100644 --- a/algo/sha/md-helper-4way.c +++ b/algo/sha/md-helper-4way.c @@ -215,18 +215,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, #if defined BE64 #if defined PLW1 sc->buf[ SPH_MAXPAD>>3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #elif defined PLW4 memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #else sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + 
mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #endif // PLW #else // LE64 #if defined PLW1 @@ -255,7 +255,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, for ( u = 0; u < rnum; u ++ ) { #if defined BE64 - ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); + ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] ); #else // LE64 ((__m256i*)dst)[u] = sc->val[u]; #endif diff --git a/algo/sha/sha2-hash-4way.c b/algo/sha/sha2-hash-4way.c index c23bb9f..7a9dd2d 100644 --- a/algo/sha/sha2-hash-4way.c +++ b/algo/sha/sha2-hash-4way.c @@ -129,7 +129,7 @@ sha512_4way_round( __m256i *in, __m256i r[8] ) __m256i W[80]; for ( i = 0; i < 16; i++ ) - W[i] = mm256_byteswap_64( in[i] ); + W[i] = mm256_bswap_64( in[i] ); for ( i = 16; i < 80; i++ ) W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] ); @@ -224,13 +224,13 @@ void sha512_4way_close( sha512_4way_context *sc, void *dst ) memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); sc->buf[ pad >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ ( pad+8 ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); sha512_4way_round( sc->buf, sc->val ); for ( u = 0; u < 8; u ++ ) - ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); + ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] ); } #endif diff --git a/algo/shavite/sph-shavite-aesni.c b/algo/shavite/sph-shavite-aesni.c index 326f469..5560279 100644 --- a/algo/shavite/sph-shavite-aesni.c +++ b/algo/shavite/sph-shavite-aesni.c @@ -74,6 +74,18 @@ static const sph_u32 IV512[] = { C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) }; +// Return hi 128 bits with elements shifted one lane with vacated lane filled +// with data rotated from lo. +// Partially rotate elements in two 128 bit vectors as one 256 bit vector +// and return the rotated high 128 bits. +// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not +// completed. It's faster than a full rotation. 
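
For reference, a scalar sketch (not part of the patch) of the lane movement
performed by the helper defined just below; lane 0 is the least significant
32-bit element and 1 <= n <= 3:

   #include <stdint.h>

   // Result lane i takes hi lane i+n, and the lanes vacated at the top are
   // filled from the low lanes of lo, i.e. only the high half of a 256-bit
   // rotate right by n 32-bit lanes is produced.
   static inline void rotr256hi_1x32_ref( uint32_t out[4], const uint32_t hi[4],
                                          const uint32_t lo[4], int n )
   {
      for ( int i = 0; i < 4; i++ )
         out[i] = ( i + n < 4 ) ? hi[ i + n ] : lo[ i + n - 4 ];
   }
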
+ +static inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n ) +{ return _mm_or_si128( _mm_srli_si128( hi, n<<2 ), + _mm_slli_si128( lo, 16 - (n<<2) ) ); +} + #define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ sph_u32 t0 = (x0); \ sph_u32 t1 = (x1); \ @@ -284,42 +296,42 @@ c512( sph_shavite_big_context *sc, const void *msg ) // round k00 = m[0]; x = _mm_xor_si128( p1, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k01 = m[1]; x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k02 = m[2]; x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k03 = m[3]; x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p0 = _mm_xor_si128( p0, x ); k10 = m[4]; x = _mm_xor_si128( p3, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k11 = m[5]; x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k12 = m[6]; x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k13 = m[7]; x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p2 = _mm_xor_si128( p2, x ); for ( r = 0; r < 3; r ++ ) { // round 1, 5, 9 - k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = _mm_xor_si128( k00, k13 ); if ( r == 0 ) @@ -327,8 +339,8 @@ c512( sph_shavite_big_context *sc, const void *msg ) ~sc->count3, sc->count2, sc->count1, sc->count0 ) ); x = _mm_xor_si128( p0, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); - k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = _mm_xor_si128( k01, k00 ); if ( r == 1 ) @@ -336,34 +348,34 @@ c512( sph_shavite_big_context *sc, const void *msg ) ~sc->count0, sc->count1, sc->count2, sc->count3 ) ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); - k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); - k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); - k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); - k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 = _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); - k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + x = 
_mm_aesenc_si128( x, m128_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = _mm_xor_si128( k13, k12 ); if ( r == 2 ) @@ -371,89 +383,89 @@ c512( sph_shavite_big_context *sc, const void *msg ) ~sc->count1, sc->count0, sc->count3, sc->count2 ) ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p1 = _mm_xor_si128( p1, x ); // round 2, 6, 10 k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); x = _mm_xor_si128( p3, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p2 = _mm_xor_si128( p2, x ); k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); x = _mm_xor_si128( p1, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p0 = _mm_xor_si128( p0, x ); // round 3, 7, 11 - k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p2, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); - k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); - k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); - k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p1 = _mm_xor_si128( p1, x ); - k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p0, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); - k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); - k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 
= _mm_xor_si128( k12, k11 ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); - k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p3 = _mm_xor_si128( p3, x ); // round 4, 8, 12 @@ -461,83 +473,83 @@ c512( sph_shavite_big_context *sc, const void *msg ) k00 = _mm_xor_si128( k00, mm_rotr256hi_1x32( k12, k13, 1 ) ); x = _mm_xor_si128( p1, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k01 = _mm_xor_si128( k01, mm_rotr256hi_1x32( k13, k00, 1 ) ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k02 = _mm_xor_si128( k02, mm_rotr256hi_1x32( k00, k01, 1 ) ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k03 = _mm_xor_si128( k03, mm_rotr256hi_1x32( k01, k02, 1 ) ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p0 = _mm_xor_si128( p0, x ); k10 = _mm_xor_si128( k10, mm_rotr256hi_1x32( k02, k03, 1 ) ); x = _mm_xor_si128( p3, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k11 = _mm_xor_si128( k11, mm_rotr256hi_1x32( k03, k10, 1 ) ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k12 = _mm_xor_si128( k12, mm_rotr256hi_1x32( k10, k11, 1 ) ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); k13 = _mm_xor_si128( k13, mm_rotr256hi_1x32( k11, k12, 1 ) ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p2 = _mm_xor_si128( p2, x ); } // round 13 - k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, mm_zero ) ); + k00 = mm_rotr_1x32( _mm_aesenc_si128( k00, m128_zero ) ); k00 = _mm_xor_si128( k00, k13 ); x = _mm_xor_si128( p0, k00 ); - x = _mm_aesenc_si128( x, mm_zero ); - k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k01 = mm_rotr_1x32( _mm_aesenc_si128( k01, m128_zero ) ); k01 = _mm_xor_si128( k01, k00 ); x = _mm_xor_si128( x, k01 ); - x = _mm_aesenc_si128( x, mm_zero ); - k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k02 = mm_rotr_1x32( _mm_aesenc_si128( k02, m128_zero ) ); k02 = _mm_xor_si128( k02, k01 ); x = _mm_xor_si128( x, k02 ); - x = _mm_aesenc_si128( x, mm_zero ); - k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k03 = mm_rotr_1x32( _mm_aesenc_si128( k03, m128_zero ) ); k03 = _mm_xor_si128( k03, k02 ); x = _mm_xor_si128( x, k03 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p3 = _mm_xor_si128( p3, x ); - k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, mm_zero ) ); + k10 = mm_rotr_1x32( _mm_aesenc_si128( k10, m128_zero ) ); k10 = _mm_xor_si128( k10, k03 ); x = _mm_xor_si128( p2, k10 ); - x = _mm_aesenc_si128( x, mm_zero ); - k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k11 = mm_rotr_1x32( _mm_aesenc_si128( k11, m128_zero ) ); k11 = _mm_xor_si128( k11, k10 ); x = _mm_xor_si128( x, k11 ); - x = _mm_aesenc_si128( x, mm_zero ); - k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, mm_zero ) ); + x = 
_mm_aesenc_si128( x, m128_zero ); + k12 = mm_rotr_1x32( _mm_aesenc_si128( k12, m128_zero ) ); k12 = _mm_xor_si128( k12, _mm_xor_si128( k11, _mm_set_epi32( ~sc->count2, sc->count3, sc->count0, sc->count1 ) ) ); x = _mm_xor_si128( x, k12 ); - x = _mm_aesenc_si128( x, mm_zero ); - k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, mm_zero ) ); + x = _mm_aesenc_si128( x, m128_zero ); + k13 = mm_rotr_1x32( _mm_aesenc_si128( k13, m128_zero ) ); k13 = _mm_xor_si128( k13, k12 ); x = _mm_xor_si128( x, k13 ); - x = _mm_aesenc_si128( x, mm_zero ); + x = _mm_aesenc_si128( x, m128_zero ); p1 = _mm_xor_si128( p1, x ); h[0] = _mm_xor_si128( h[0], p2 ); diff --git a/algo/simd/sse2/nist.c b/algo/simd/nist.c similarity index 100% rename from algo/simd/sse2/nist.c rename to algo/simd/nist.c diff --git a/algo/simd/sse2/nist.h b/algo/simd/nist.h similarity index 100% rename from algo/simd/sse2/nist.h rename to algo/simd/nist.h diff --git a/algo/simd/sse2/simd-compat.h b/algo/simd/simd-compat.h similarity index 100% rename from algo/simd/sse2/simd-compat.h rename to algo/simd/simd-compat.h diff --git a/algo/simd/simd-hash-2way.c b/algo/simd/simd-hash-2way.c new file mode 100644 index 0000000..0c2d063 --- /dev/null +++ b/algo/simd/simd-hash-2way.c @@ -0,0 +1,853 @@ +#include +#include +#include + +#include "simd-hash-2way.h" + +#if defined (__AVX2__) + +// imported from simd_iv.h + +uint32_t SIMD_IV_512[] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, + 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, + 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, + 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, + 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, + 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, + 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, + 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 }; + +/* Twiddle tables */ + +static const m256_v16 FFT64_Twiddle[] = +{ + {{ 1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128 }}, + {{ 1, 60, 2, 120, 4, -17, 8, -34, + 1, 60, 2, 120, 4, -17, 8, -34 }}, + {{ 1, 120, 8, -68, 64, -30, -2, 17, + 1, 120, 8, -68, 64, -30, -2, 17 }}, + {{ 1, 46, 60, -67, 2, 92, 120, 123, + 1, 46, 60, -67, 2, 92, 120, 123 }}, + {{ 1, 92, -17, -22, 32, 117, -30, 67, + 1, 92, -17, -22, 32, 117, -30, 67 }}, + {{ 1, -67, 120, -73, 8, -22, -68, -70, + 1, -67, 120, -73, 8, -22, -68, -70 }}, + {{ 1, 123, -34, -70, 128, 67, 17, 35, + 1, 123, -34, -70, 128, 67, 17, 35 }}, +}; + +static const m256_v16 FFT128_Twiddle[] = +{ + {{ 1, -118, 46, -31, 60, 116, -67, -61, + 1, -118, 46, -31, 60, 116, -67, -61 }}, + {{ 2, 21, 92, -62, 120, -25, 123, -122, + 2, 21, 92, -62, 120, -25, 123, -122 }}, + {{ 4, 42, -73, -124, -17, -50, -11, 13, + 4, 42, -73, -124, -17, -50, -11, 13 }}, + {{ 8, 84, 111, 9, -34, -100, -22, 26, + 8, 84, 111, 9, -34, -100, -22, 26 }}, + {{ 16, -89, -35, 18, -68, 57, -44, 52, + 16, -89, -35, 18, -68, 57, -44, 52 }}, + {{ 32, 79, -70, 36, 121, 114, -88, 104, + 32, 79, -70, 36, 121, 114, -88, 104 }}, + {{ 64, -99, 117, 72, -15, -29, 81, -49, + 64, -99, 117, 72, -15, -29, 81, -49 }}, + {{ 128, 59, -23, -113, -30, -58, -95, -98, + 128, 59, -23, -113, -30, -58, -95, -98 }}, +}; + +static const m256_v16 FFT256_Twiddle[] = +{ + {{ 1, 41, -118, 45, 46, 87, -31, 14, + 1, 41, -118, 45, 46, 87, -31, 14 }}, + {{ 60, -110, 116, -127, -67, 80, -61, 69, + 60, -110, 116, -127, -67, 80, -61, 69 }}, + {{ 2, 82, 21, 90, 92, -83, -62, 28, + 2, 82, 21, 90, 92, -83, -62, 28 }}, + {{ 120, 37, -25, 3, 123, -97, -122, -119, + 120, 37, -25, 3, 123, -97, -122, -119 }}, + {{ 4, -93, 42, -77, -73, 91, -124, 56, + 
4, -93, 42, -77, -73, 91, -124, 56 }}, + {{ -17, 74, -50, 6, -11, 63, 13, 19, + -17, 74, -50, 6, -11, 63, 13, 19 }}, + {{ 8, 71, 84, 103, 111, -75, 9, 112, + 8, 71, 84, 103, 111, -75, 9, 112 }}, + {{ -34, -109, -100, 12, -22, 126, 26, 38, + -34, -109, -100, 12, -22, 126, 26, 38 }}, + {{ 16, -115, -89, -51, -35, 107, 18, -33, + 16, -115, -89, -51, -35, 107, 18, -33 }}, + {{ -68, 39, 57, 24, -44, -5, 52, 76, + -68, 39, 57, 24, -44, -5, 52, 76 }}, + {{ 32, 27, 79, -102, -70, -43, 36, -66, + 32, 27, 79, -102, -70, -43, 36, -66 }}, + {{ 121, 78, 114, 48, -88, -10, 104, -105, + 121, 78, 114, 48, -88, -10, 104, -105 }}, + {{ 64, 54, -99, 53, 117, -86, 72, 125, + 64, 54, -99, 53, 117, -86, 72, 125 }}, + {{ -15, -101, -29, 96, 81, -20, -49, 47, + -15, -101, -29, 96, 81, -20, -49, 47 }}, + {{ 128, 108, 59, 106, -23, 85, -113, -7, + 128, 108, 59, 106, -23, 85, -113, -7 }}, + {{ -30, 55, -58, -65, -95, -40, -98, 94, + -30, 55, -58, -65, -95, -40, -98, 94 }} +}; + +#define SHUFXOR_1 0xb1 /* 0b10110001 */ +#define SHUFXOR_2 0x4e /* 0b01001110 */ +#define SHUFXOR_3 0x1b /* 0b00011011 */ + +#define CAT(x, y) x##y +#define XCAT(x,y) CAT(x,y) + +#define shufxor(x,s) _mm256_shuffle_epi32( x, XCAT( SHUFXOR_, s )) + +// imported from vector.c + +#define REDUCE(x) \ + _mm256_sub_epi16( _mm256_and_si256( x, _mm256_set1_epi16( 255 ) ), \ + _mm256_srai_epi16( x, 8 ) ) + +#define EXTRA_REDUCE_S(x)\ + _mm256_sub_epi16( x, \ + _mm256_and_si256( _mm256_set1_epi16( 257 ), \ + _mm256_cmpgt_epi16( x, _mm256_set1_epi16( 128 ) ) ) ) + +#define REDUCE_FULL_S( x ) EXTRA_REDUCE_S( REDUCE (x ) ) + +#define DO_REDUCE( i ) X(i) = REDUCE( X(i) ) + +#define DO_REDUCE_FULL_S(i) \ +do { \ + X(i) = REDUCE( X(i) ); \ + X(i) = EXTRA_REDUCE_S( X(i) ); \ +} while(0) + +void fft64_2way( void *a ) +{ + __m256i* const A = a; + register __m256i X0, X1, X2, X3, X4, X5, X6, X7; + +#define X(i) X##i + + X0 = A[0]; + X1 = A[1]; + X2 = A[2]; + X3 = A[3]; + X4 = A[4]; + X5 = A[5]; + X6 = A[6]; + X7 = A[7]; + +#define DO_REDUCE(i) X(i) = REDUCE( X(i) ) + + // Begin with 8 parallels DIF FFT_8 + // + // FFT_8 using w=4 as 8th root of unity + // Unrolled decimation in frequency (DIF) radix-2 NTT. + // Output data is in revbin_permuted order. 
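+ // Arithmetic note: the NTT here works modulo 257, where 256 == -1.
+ // REDUCE(x) = (x & 255) - (x >> 8) is therefore congruent to x mod 257,
+ // and EXTRA_REDUCE_S folds the result into a small signed range around 0.
+ // Since 4^4 = 256 == -1 (mod 257), w = 4 is an 8th root of unity, so a
+ // multiplication by 4^n is just a left shift by w[n] = 2*n bits, which is
+ // what the BUTTERFLY_N macros below do.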
+ + static const int w[] = {0, 2, 4, 6}; +// __m256i *Twiddle = (__m256i*)FFT64_Twiddle; + + +#define BUTTERFLY_0( i,j ) \ +do { \ + __m256i v = X(j); \ + X(j) = _mm256_add_epi16( X(i), X(j) ); \ + X(i) = _mm256_sub_epi16( X(i), v ); \ +} while(0) + +#define BUTTERFLY_N( i,j,n ) \ +do { \ + __m256i v = X(j); \ + X(j) = _mm256_add_epi16( X(i), X(j) ); \ + X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \ +} while(0) + + BUTTERFLY_0( 0, 4 ); + BUTTERFLY_N( 1, 5, 1 ); + BUTTERFLY_N( 2, 6, 2 ); + BUTTERFLY_N( 3, 7, 3 ); + + DO_REDUCE( 2 ); + DO_REDUCE( 3 ); + + BUTTERFLY_0( 0, 2 ); + BUTTERFLY_0( 4, 6 ); + BUTTERFLY_N( 1, 3, 2 ); + BUTTERFLY_N( 5, 7, 2 ); + + DO_REDUCE( 1 ); + + BUTTERFLY_0( 0, 1 ); + BUTTERFLY_0( 2, 3 ); + BUTTERFLY_0( 4, 5 ); + BUTTERFLY_0( 6, 7 ); + + /* We don't need to reduce X(7) */ + DO_REDUCE_FULL_S( 0 ); + DO_REDUCE_FULL_S( 1 ); + DO_REDUCE_FULL_S( 2 ); + DO_REDUCE_FULL_S( 3 ); + DO_REDUCE_FULL_S( 4 ); + DO_REDUCE_FULL_S( 5 ); + DO_REDUCE_FULL_S( 6 ); + +#undef BUTTERFLY_0 +#undef BUTTERFLY_N + + // Multiply by twiddle factors + X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i ); + X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i ); + X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i ); + X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i ); + X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i ); + X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i ); + X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i ); + + // Transpose the FFT state with a revbin order permutation + // on the rows and the column. + // This will make the full FFT_64 in order. +#define INTERLEAVE(i,j) \ + do { \ + __m256i t1= X(i); \ + __m256i t2= X(j); \ + X(i) = _mm256_unpacklo_epi16( t1, t2 ); \ + X(j) = _mm256_unpackhi_epi16( t1, t2 ); \ + } while(0) + + INTERLEAVE( 1, 0 ); + INTERLEAVE( 3, 2 ); + INTERLEAVE( 5, 4 ); + INTERLEAVE( 7, 6 ); + + INTERLEAVE( 2, 0 ); + INTERLEAVE( 3, 1 ); + INTERLEAVE( 6, 4 ); + INTERLEAVE( 7, 5 ); + + INTERLEAVE( 4, 0 ); + INTERLEAVE( 5, 1 ); + INTERLEAVE( 6, 2 ); + INTERLEAVE( 7, 3 ); + +#undef INTERLEAVE + + //Finish with 8 parallels DIT FFT_8 + //FFT_8 using w=4 as 8th root of unity + // Unrolled decimation in time (DIT) radix-2 NTT. + // Input data is in revbin_permuted order. 
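+ // Unlike the DIF butterflies above, which shift the difference after the
+ // add/sub, these DIT butterflies apply the twiddle shift to X(i) first
+ // and then add/subtract; both passes share the same shift table w[].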
+ +#define BUTTERFLY_0( i,j ) \ +do { \ + __m256i u = X(j); \ + X(j) = _mm256_sub_epi16( X(j), X(i) ); \ + X(i) = _mm256_add_epi16( u, X(i) ); \ +} while(0) + + +#define BUTTERFLY_N( i,j,n ) \ +do { \ + __m256i u = X(j); \ + X(i) = _mm256_slli_epi16( X(i), w[n] ); \ + X(j) = _mm256_sub_epi16( X(j), X(i) ); \ + X(i) = _mm256_add_epi16( u, X(i) ); \ +} while(0) + + DO_REDUCE( 0 ); + DO_REDUCE( 1 ); + DO_REDUCE( 2 ); + DO_REDUCE( 3 ); + DO_REDUCE( 4 ); + DO_REDUCE( 5 ); + DO_REDUCE( 6 ); + DO_REDUCE( 7 ); + + BUTTERFLY_0( 0, 1 ); + BUTTERFLY_0( 2, 3 ); + BUTTERFLY_0( 4, 5 ); + BUTTERFLY_0( 6, 7 ); + + BUTTERFLY_0( 0, 2 ); + BUTTERFLY_0( 4, 6 ); + BUTTERFLY_N( 1, 3, 2 ); + BUTTERFLY_N( 5, 7, 2 ); + + DO_REDUCE( 3 ); + + BUTTERFLY_0( 0, 4 ); + BUTTERFLY_N( 1, 5, 1 ); + BUTTERFLY_N( 2, 6, 2 ); + BUTTERFLY_N( 3, 7, 3 ); + + DO_REDUCE_FULL_S( 0 ); + DO_REDUCE_FULL_S( 1 ); + DO_REDUCE_FULL_S( 2 ); + DO_REDUCE_FULL_S( 3 ); + DO_REDUCE_FULL_S( 4 ); + DO_REDUCE_FULL_S( 5 ); + DO_REDUCE_FULL_S( 6 ); + DO_REDUCE_FULL_S( 7 ); + +#undef BUTTERFLY + + A[0] = X0; + A[1] = X1; + A[2] = X2; + A[3] = X3; + A[4] = X4; + A[5] = X5; + A[6] = X6; + A[7] = X7; + +#undef X +} + +void fft128_2way( void *a ) +{ + int i; + // Temp space to help for interleaving in the end + __m256i B[8]; + __m256i *A = (__m256i*) a; +// __m256i *Twiddle = (__m256i*)FFT128_Twiddle; + + /* Size-2 butterflies */ + for ( i = 0; i<8; i++ ) + { + B[ i ] = _mm256_add_epi16( A[ i ], A[ i+8 ] ); + B[ i ] = REDUCE_FULL_S( B[ i ] ); + A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] ); + A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] ); + A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i ); + A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] ); + } + + fft64_2way( B ); + fft64_2way( A+8 ); + + /* Transpose (i.e. interleave) */ + for ( i = 0; i < 8; i++ ) + { + A[ 2*i ] = _mm256_unpacklo_epi16( B[ i ], A[ i+8 ] ); + A[ 2*i+1 ] = _mm256_unpackhi_epi16( B[ i ], A[ i+8 ] ); + } +} + +void fft128_2way_msg( uint16_t *a, const uint8_t *x, int final ) +{ + static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,1, }}; + static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }}; + + __m256i *X = (__m256i*)x; + __m256i *A = (__m256i*)a; +// __m256i *Twiddle = (__m256i*)FFT128_Twiddle; + +#define UNPACK( i ) \ +do { \ + __m256i t = X[i]; \ + A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \ + A[2*i+8] = _mm256_mullo_epi16( A[2*i], FFT128_Twiddle[2*i].m256i ); \ + A[2*i+8] = REDUCE(A[2*i+8]); \ + A[2*i+1] = _mm256_unpackhi_epi8( t, m256_zero ); \ + A[2*i+9] = _mm256_mullo_epi16(A[2*i+1], FFT128_Twiddle[2*i+1].m256i ); \ + A[2*i+9] = REDUCE(A[2*i+9]); \ +} while(0) + + // This allows to tweak the last butterflies to introduce X^127 +#define UNPACK_TWEAK( i,tw ) \ +do { \ + __m256i t = X[i]; \ + __m256i tmp; \ + A[2*i] = _mm256_unpacklo_epi8( t, m256_zero ); \ + A[2*i+8] = _mm256_mullo_epi16( A[ 2*i ], FFT128_Twiddle[ 2*i ].m256i ); \ + A[2*i+8] = REDUCE( A[ 2*i+8 ] ); \ + tmp = _mm256_unpackhi_epi8( t, m256_zero ); \ + A[2*i+1] = _mm256_add_epi16( tmp, tw ); \ + A[2*i+9] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \ + FFT128_Twiddle[ 2*i+1 ].m256i );\ + A[2*i+9] = REDUCE( A[ 2*i+9 ] ); \ +} while(0) + + UNPACK( 0 ); + UNPACK( 1 ); + UNPACK( 2 ); + if ( final ) + UNPACK_TWEAK( 3, FinalTweak.m256i ); + else + UNPACK_TWEAK( 3, Tweak.m256i ); + +#undef UNPACK +#undef UNPACK_TWEAK + + fft64_2way( a ); + fft64_2way( a+128 ); +} + +void fft256_2way_msg( uint16_t *a, const uint8_t *x, int final ) +{ + static const m256_v16 Tweak = {{ 0,0,0,0,0,0,0,1, 
0,0,0,0,0,0,0,1, }}; + static const m256_v16 FinalTweak = {{ 0,0,0,0,0,1,0,1, 0,0,0,0,0,1,0,1, }}; + + __m256i *X = (__m256i*)x; + __m256i *A = (__m256i*)a; +// __m256i *Twiddle = (__m256i*)FFT256_Twiddle; + +#define UNPACK( i ) \ +do { \ + __m256i t = X[i]; \ + A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \ + A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \ + FFT256_Twiddle[ 2*i ].m256i ); \ + A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \ + A[ 2*i + 1 ] = _mm256_unpackhi_epi8( t, m256_zero ); \ + A[ 2*i + 17 ] = _mm256_mullo_epi16( A[ 2*i + 1 ], \ + FFT256_Twiddle[ 2*i + 1 ].m256i ); \ + A[ 2*i + 17 ] = REDUCE( A[ 2*i + 17 ] ); \ +} while(0) + + // This allows to tweak the last butterflies to introduce X^127 +#define UNPACK_TWEAK( i,tw ) \ +do { \ + __m256i t = X[i]; \ + __m256i tmp; \ + A[ 2*i ] = _mm256_unpacklo_epi8( t, m256_zero ); \ + A[ 2*i + 16 ] = _mm256_mullo_epi16( A[ 2*i ], \ + FFT256_Twiddle[ 2*i ].m256i ); \ + A[ 2*i + 16 ] = REDUCE( A[ 2*i + 16 ] ); \ + tmp = _mm256_unpackhi_epi8( t, m256_zero ); \ + A[ 2*i + 1 ] = _mm256_add_epi16( tmp, tw ); \ + A[ 2*i + 17 ] = _mm256_mullo_epi16( _mm256_sub_epi16( tmp, tw ), \ + FFT256_Twiddle[ 2*i + 1 ].m256i ); \ + } while(0) + + UNPACK( 0 ); + UNPACK( 1 ); + UNPACK( 2 ); + UNPACK( 3 ); + UNPACK( 4 ); + UNPACK( 5 ); + UNPACK( 6 ); + if ( final ) + UNPACK_TWEAK( 7, FinalTweak.m256i ); + else + UNPACK_TWEAK( 7, Tweak.m256i ); + +#undef UNPACK +#undef UNPACK_TWEAK + + fft128_2way( a ); + fft128_2way( a+256 ); +} + +void rounds512_2way( uint32_t *state, const uint8_t *msg, uint16_t *fft ) +{ + register __m256i S0l, S1l, S2l, S3l; + register __m256i S0h, S1h, S2h, S3h; + __m256i *S = (__m256i*) state; + __m256i *M = (__m256i*) msg; + __m256i *W = (__m256i*) fft; + static const m256_v16 code[] = { mm256_setc1_16(185), mm256_setc1_16(233) }; + + S0l = _mm256_xor_si256( S[0], M[0] ); + S0h = _mm256_xor_si256( S[1], M[1] ); + S1l = _mm256_xor_si256( S[2], M[2] ); + S1h = _mm256_xor_si256( S[3], M[3] ); + S2l = _mm256_xor_si256( S[4], M[4] ); + S2h = _mm256_xor_si256( S[5], M[5] ); + S3l = _mm256_xor_si256( S[6], M[6] ); + S3h = _mm256_xor_si256( S[7], M[7] ); + +#define S(i) S##i + +#define F_0(B, C, D) \ + _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( C,D ), B ), D ) +#define F_1(B, C, D) \ + _mm256_or_si256( _mm256_and_si256( D, C ),\ + _mm256_and_si256( _mm256_or_si256( D,C ), B ) ) + +#define Fl(a,b,c,fun) F_##fun (a##l,b##l,c##l) +#define Fh(a,b,c,fun) F_##fun (a##h,b##h,c##h) + + // We split the round function in two halfes + // so as to insert some independent computations in between + +#define SUM7_00 0 +#define SUM7_01 1 +#define SUM7_02 2 +#define SUM7_03 3 +#define SUM7_04 4 +#define SUM7_05 5 +#define SUM7_06 6 + +#define SUM7_10 1 +#define SUM7_11 2 +#define SUM7_12 3 +#define SUM7_13 4 +#define SUM7_14 5 +#define SUM7_15 6 +#define SUM7_16 0 + +#define SUM7_20 2 +#define SUM7_21 3 +#define SUM7_22 4 +#define SUM7_23 5 +#define SUM7_24 6 +#define SUM7_25 0 +#define SUM7_26 1 + +#define SUM7_30 3 +#define SUM7_31 4 +#define SUM7_32 5 +#define SUM7_33 6 +#define SUM7_34 0 +#define SUM7_35 1 +#define SUM7_36 2 + +#define SUM7_40 4 +#define SUM7_41 5 +#define SUM7_42 6 +#define SUM7_43 0 +#define SUM7_44 1 +#define SUM7_45 2 +#define SUM7_46 3 + +#define SUM7_50 5 +#define SUM7_51 6 +#define SUM7_52 0 +#define SUM7_53 1 +#define SUM7_54 2 +#define SUM7_55 3 +#define SUM7_56 4 + +#define SUM7_60 6 +#define SUM7_61 0 +#define SUM7_62 1 +#define SUM7_63 2 +#define SUM7_64 3 +#define SUM7_65 4 +#define SUM7_66 5 + +#define 
PERM(z,d,a) XCAT(PERM_,XCAT(SUM7_##z,PERM_START))(d,a) + +#define PERM_0(d,a) /* XOR 1 */ \ +do { \ + d##l = shufxor( a##l, 1 ); \ + d##h = shufxor( a##h, 1 ); \ + } while(0) + +#define PERM_1(d,a) /* XOR 6 */ \ +do { \ + d##l = shufxor( a##h, 2 ); \ + d##h = shufxor( a##l, 2 ); \ +} while(0) + +#define PERM_2(d,a) /* XOR 2 */ \ +do { \ + d##l = shufxor( a##l, 2 ); \ + d##h = shufxor( a##h, 2 ); \ +} while(0) + +#define PERM_3(d,a) /* XOR 3 */ \ +do { \ + d##l = shufxor( a##l, 3 ); \ + d##h = shufxor( a##h, 3 ); \ +} while(0) + +#define PERM_4(d,a) /* XOR 5 */ \ +do { \ + d##l = shufxor( a##h, 1 ); \ + d##h = shufxor( a##l, 1 ); \ +} while(0) + +#define PERM_5(d,a) /* XOR 7 */ \ +do { \ + d##l = shufxor( a##h, 3 ); \ + d##h = shufxor( a##l, 3 ); \ +} while(0) + +#define PERM_6(d,a) /* XOR 4 */ \ +do { \ + d##l = a##h; \ + d##h = a##l; \ +} while(0) + +#define STEP_1_(a,b,c,d,w,fun,r,s,z) \ +do { \ + TTl = Fl( a,b,c,fun ); \ + TTh = Fh( a,b,c,fun ); \ + a##l = mm256_rotl_32( a##l, r ); \ + a##h = mm256_rotl_32( a##h, r ); \ + w##l = _mm256_add_epi32( w##l, d##l ); \ + w##h = _mm256_add_epi32( w##h, d##h ); \ + TTl = _mm256_add_epi32( TTl, w##l ); \ + TTh = _mm256_add_epi32( TTh, w##h ); \ + TTl = mm256_rotl_32( TTl, s ); \ + TTh = mm256_rotl_32( TTh, s ); \ + PERM( z,d,a ); \ +} while(0) + +#define STEP_1( a,b,c,d,w,fun,r,s,z ) STEP_1_( a,b,c,d,w,fun,r,s,z ) + +#define STEP_2_( a,b,c,d,w,fun,r,s ) \ +do { \ + d##l = _mm256_add_epi32( d##l, TTl ); \ + d##h = _mm256_add_epi32( d##h, TTh ); \ +} while(0) + +#define STEP_2( a,b,c,d,w,fun,r,s ) STEP_2_( a,b,c,d,w,fun,r,s ) + +#define STEP( a,b,c,d,w1,w2,fun,r,s,z ) \ +do { \ + register __m256i TTl, TTh, Wl=w1, Wh=w2; \ + STEP_1( a,b,c,d,W,fun,r,s,z ); \ + STEP_2( a,b,c,d,W,fun,r,s ); \ +} while(0); + +#define MSG_l(x) (2*(x)) +#define MSG_h(x) (2*(x)+1) + +#define MSG( w,hh,ll,u,z ) \ +do { \ + int a = MSG_##u(hh); \ + int b = MSG_##u(ll); \ + w##l = _mm256_unpacklo_epi16( W[a], W[b] ); \ + w##l = _mm256_mullo_epi16( w##l, code[z].m256i ); \ + w##h = _mm256_unpackhi_epi16( W[a], W[b]) ; \ + w##h = _mm256_mullo_epi16( w##h, code[z].m256i ); \ +} while(0) + +#define ROUND( h0,l0,u0,h1,l1,u1,h2,l2,u2,h3,l3,u3,fun,r,s,t,u,z ) \ +do { \ + register __m256i W0l, W1l, W2l, W3l, TTl; \ + register __m256i W0h, W1h, W2h, W3h, TTh; \ + MSG( W0, h0, l0, u0, z ); \ + STEP_1( S(0), S(1), S(2), S(3), W0, fun, r, s, 0 ); \ + MSG( W1, h1, l1, u1, z ); \ + STEP_2( S(0), S(1), S(2), S(3), W0, fun, r, s ); \ + STEP_1( S(3), S(0), S(1), S(2), W1, fun, s, t, 1 ); \ + MSG( W2,h2,l2,u2,z ); \ + STEP_2( S(3), S(0), S(1), S(2), W1, fun, s, t ); \ + STEP_1( S(2), S(3), S(0), S(1), W2, fun, t, u, 2 ); \ + MSG( W3,h3,l3,u3,z ); \ + STEP_2( S(2), S(3), S(0), S(1), W2, fun, t, u ); \ + STEP_1( S(1), S(2), S(3), S(0), W3, fun, u, r, 3 ); \ + STEP_2( S(1), S(2), S(3), S(0), W3, fun, u, r ); \ +} while(0) + + // 4 rounds with code 185 +#define PERM_START 0 + ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); +#undef PERM_START +#define PERM_START 4 + ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0); +#undef PERM_START +#define PERM_START 1 + ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0); +#undef PERM_START +#define PERM_START 5 + ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0); +#undef PERM_START + + // 4 rounds with code 233 +#define PERM_START 2 + ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); +#undef PERM_START +#define PERM_START 6 + ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 
15, 5, 1); +#undef PERM_START +#define PERM_START 3 + ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1); +#undef PERM_START +#define PERM_START 0 + ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1); +#undef PERM_START + + // 1 round as feed-forward +#define PERM_START 4 + STEP( S(0), S(1), S(2), S(3), S[0], S[1], 0, 4, 13, 0 ); + STEP( S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1 ); + STEP( S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2 ); + STEP( S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3 ); + + S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; + S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; + +#undef PERM_START +#undef STEP_1 +#undef STEP_2 +#undef STEP +#undef ROUND +} + +void SIMD_2way_Compress( simd_2way_context *state, const void *m, int final ) +{ + m256_v16 Y[32]; + uint16_t *y = (uint16_t*) Y[0].u16; + fft256_2way_msg( y, m, final ); + rounds512_2way( state->A, m, y ); +} + +// imported from nist.c + +int simd_2way_init( simd_2way_context *state, int hashbitlen ) +{ + __m256i *A = (__m256i*)state->A; + int n = 8; + + state->hashbitlen = hashbitlen; + state->n_feistels = n; + state->blocksize = 128*8; + state->count = 0; + + for ( int i = 0; i < 8; i++ ) + A[i] = _mm256_set_epi32( SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0], + SIMD_IV_512[4*i+3], SIMD_IV_512[4*i+2], + SIMD_IV_512[4*i+1], SIMD_IV_512[4*i+0] ); + return 0; +} + +int simd_2way_update( simd_2way_context *state, const void *data, + int databitlen ) +{ + int bs = state->blocksize; + int current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. + SIMD_2way_Compress( state, data, 0 ); + databitlen -= bs; + data += 2*(bs/8); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer + 2*(current/8), data, 2*((databitlen+7)/8) ); + state->count += databitlen; + return 0; + } + else + { + memcpy( state->buffer + 2*(current/8), data, 2*(len/8) ); + state->count += len; + databitlen -= len; + data += 2*(len/8); + current = 0; + SIMD_2way_Compress( state, state->buffer, 0 ); + } + } + } + return 0; +} + +int simd_2way_close( simd_2way_context *state, void *hashval ) +{ + uint64_t l; + int current = state->count & (state->blocksize - 1); + int i; + int isshort = 1; + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = ( current+7 ) / 8; + memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current ) ); + SIMD_2way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 2*(state->blocksize / 8) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_2way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 2*(state->hashbitlen / 8) ); + + return 0; +} + +int simd_2way_update_close( simd_2way_context *state, void *hashval, + const void *data, int databitlen ) +{ + int current, i; + int bs = state->blocksize; // bits in one lane + int isshort = 1; + uint64_t l; + + current = state->count & (bs - 1); + + while ( databitlen > 0 ) + { + if ( current == 0 && databitlen >= bs ) + { + // We can hash the data directly from the input buffer. 
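+ // Note: databitlen and count are per-lane bit counts; the 2-way input
+ // carries two interleaved message lanes, hence the 2*(bs/8) byte stride
+ // used below when advancing the data pointer.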
+ SIMD_2way_Compress( state, data, 0 ); + databitlen -= bs; + data += 2*( bs/8 ); + state->count += bs; + } + else + { + // Copy a chunk of data to the buffer + int len = bs - current; + if ( databitlen < len ) + { + memcpy( state->buffer + 2*( current/8 ), data, 2*( (databitlen+7)/8 ) ); + state->count += databitlen; + break; + } + else + { + memcpy( state->buffer + 2*(current/8), data, 2*(len/8) ); + state->count += len; + databitlen -= len; + data += 2*( len/8 ); + current = 0; + SIMD_2way_Compress( state, state->buffer, 0 ); + } + } + } + + current = state->count & (state->blocksize - 1); + + // If there is still some data in the buffer, hash it + if ( current ) + { + current = ( current+7 ) / 8; + memset( state->buffer + 2*current, 0, 2*( state->blocksize/8 - current) ); + SIMD_2way_Compress( state, state->buffer, 0 ); + } + + //* Input the message length as the last block + memset( state->buffer, 0, 2*( state->blocksize/8 ) ); + l = state->count; + for ( i = 0; i < 8; i++ ) + { + state->buffer[ i ] = l & 0xff; + state->buffer[ i+16 ] = l & 0xff; + l >>= 8; + } + if ( state->count < 16384 ) + isshort = 2; + + SIMD_2way_Compress( state, state->buffer, isshort ); + memcpy( hashval, state->A, 2*( state->hashbitlen / 8 ) ); + return 0; +} + +#endif diff --git a/algo/simd/simd-hash-2way.h b/algo/simd/simd-hash-2way.h new file mode 100644 index 0000000..d8f80c1 --- /dev/null +++ b/algo/simd/simd-hash-2way.h @@ -0,0 +1,27 @@ +#ifndef SIMD_HASH_2WAY_H__ +#define SIMD_HASH_2WAY_H__ 1 + +#include "simd-compat.h" + +#if defined(__AVX2__) + +#include "avxdefs.h" + +typedef struct { + uint32_t A[ 32*2 ] __attribute__((aligned(64))); + uint8_t buffer[ 128*2 ] __attribute__((aligned(64))); + uint64_t count; + unsigned int hashbitlen; + unsigned int blocksize; + unsigned int n_feistels; + +} simd_2way_context; + +int simd_2way_init( simd_2way_context *state, int hashbitlen ); +int simd_2way_update( simd_2way_context *state, const void *data, + int databitlen ); +int simd_2way_close( simd_2way_context *state, void *hashval ); +int simd_2way_update_close( simd_2way_context *state, void *hashval, + const void *data, int databitlen ); +#endif +#endif diff --git a/algo/simd/sse2/simd_iv.h b/algo/simd/simd_iv.h similarity index 95% rename from algo/simd/sse2/simd_iv.h rename to algo/simd/simd_iv.h index c9b4a4e..ef68900 100644 --- a/algo/simd/sse2/simd_iv.h +++ b/algo/simd/simd_iv.h @@ -1,3 +1,6 @@ +#if !defined(SIMD_IV_H__) +#define SIMD_IV_H__ + u32 IV_224[] = { 0x33586e9f, 0x12fff033, 0xb2d9f64d, 0x6f8fea53, 0xde943106, 0x2742e439, 0x4fbab5ac, 0x62b9ff96, @@ -25,3 +28,5 @@ u32 IV_512[] = { 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 }; + +#endif diff --git a/algo/simd/sse2/defs_x5.h b/algo/simd/sse2/defs_x5.h deleted file mode 100644 index 7ffbde0..0000000 --- a/algo/simd/sse2/defs_x5.h +++ /dev/null @@ -1,23 +0,0 @@ - -#ifndef DEFS_X5_H__ -#define DEFS_X5_H__ -#include -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; -typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHBITLEN = 2} HashReturn; - -typedef unsigned char uint8; -typedef unsigned int uint32; -typedef unsigned long long uint64; - -typedef struct { - uint32 buffer[8]; /* Buffer to be hashed */ - __m128i chainv[10]; /* Chaining values */ - uint64 bitlen[2]; /* Message length in bits */ - uint32 rembitlen; /* Length of buffer data to be hashed */ - int hashbitlen; -} 
hashState_luffa; - - -typedef unsigned char byte; -#endif \ No newline at end of file diff --git a/algo/simd/sse2/sph_types.h b/algo/simd/sse2/sph_types.h deleted file mode 100644 index 7295b0b..0000000 --- a/algo/simd/sse2/sph_types.h +++ /dev/null @@ -1,1976 +0,0 @@ -/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */ -/** - * Basic type definitions. - * - * This header file defines the generic integer types that will be used - * for the implementation of hash functions; it also contains helper - * functions which encode and decode multi-byte integer values, using - * either little-endian or big-endian conventions. - * - * This file contains a compile-time test on the size of a byte - * (the unsigned char C type). If bytes are not octets, - * i.e. if they do not have a size of exactly 8 bits, then compilation - * is aborted. Architectures where bytes are not octets are relatively - * rare, even in the embedded devices market. We forbid non-octet bytes - * because there is no clear convention on how octet streams are encoded - * on such systems. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2007-2010 Projet RNRT SAPHIR - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @file sph_types.h - * @author Thomas Pornin - */ - -#ifndef SPH_TYPES_H__ -#define SPH_TYPES_H__ - -#include - -/* - * All our I/O functions are defined over octet streams. We do not know - * how to handle input data if bytes are not octets. - */ -#if CHAR_BIT != 8 -#error This code requires 8-bit bytes -#endif - -/* ============= BEGIN documentation block for Doxygen ============ */ - -#ifdef DOXYGEN_IGNORE - -/** @mainpage sphlib C code documentation - * - * @section overview Overview - * - * sphlib is a library which contains implementations of - * various cryptographic hash functions. These pages have been generated - * with doxygen and - * document the API for the C implementations. - * - * The API is described in appropriate header files, which are available - * in the "Files" section. Each hash function family has its own header, - * whose name begins with "sph_" and contains the family - * name. For instance, the API for the RIPEMD hash functions is available - * in the header file sph_ripemd.h. 
- * - * @section principles API structure and conventions - * - * @subsection io Input/output conventions - * - * In all generality, hash functions operate over strings of bits. - * Individual bits are rarely encountered in C programming or actual - * communication protocols; most protocols converge on the ubiquitous - * "octet" which is a group of eight bits. Data is thus expressed as a - * stream of octets. The C programming language contains the notion of a - * "byte", which is a data unit managed under the type "unsigned - * char". The C standard prescribes that a byte should hold at - * least eight bits, but possibly more. Most modern architectures, even - * in the embedded world, feature eight-bit bytes, i.e. map bytes to - * octets. - * - * Nevertheless, for some of the implemented hash functions, an extra - * API has been added, which allows the input of arbitrary sequences of - * bits: when the computation is about to be closed, 1 to 7 extra bits - * can be added. The functions for which this API is implemented include - * the SHA-2 functions and all SHA-3 candidates. - * - * sphlib defines hash function which may hash octet streams, - * i.e. streams of bits where the number of bits is a multiple of eight. - * The data input functions in the sphlib API expect data - * as anonymous pointers ("const void *") with a length - * (of type "size_t") which gives the input data chunk length - * in bytes. A byte is assumed to be an octet; the sph_types.h - * header contains a compile-time test which prevents compilation on - * architectures where this property is not met. - * - * The hash function output is also converted into bytes. All currently - * implemented hash functions have an output width which is a multiple of - * eight, and this is likely to remain true for new designs. - * - * Most hash functions internally convert input data into 32-bit of 64-bit - * words, using either little-endian or big-endian conversion. The hash - * output also often consists of such words, which are encoded into output - * bytes with a similar endianness convention. Some hash functions have - * been only loosely specified on that subject; when necessary, - * sphlib has been tested against published "reference" - * implementations in order to use the same conventions. - * - * @subsection shortname Function short name - * - * Each implemented hash function has a "short name" which is used - * internally to derive the identifiers for the functions and context - * structures which the function uses. For instance, MD5 has the short - * name "md5". Short names are listed in the next section, - * for the implemented hash functions. In subsequent sections, the - * short name will be assumed to be "XXX": replace with the - * actual hash function name to get the C identifier. - * - * Note: some functions within the same family share the same core - * elements, such as update function or context structure. Correspondingly, - * some of the defined types or functions may actually be macros which - * transparently evaluate to another type or function name. - * - * @subsection context Context structure - * - * Each implemented hash fonction has its own context structure, available - * under the type name "sph_XXX_context" for the hash function - * with short name "XXX". This structure holds all needed - * state for a running hash computation. - * - * The contents of these structures are meant to be opaque, and private - * to the implementation. 
However, these contents are specified in the - * header files so that application code which uses sphlib - * may access the size of those structures. - * - * The caller is responsible for allocating the context structure, - * whether by dynamic allocation (malloc() or equivalent), - * static allocation (a global permanent variable), as an automatic - * variable ("on the stack"), or by any other mean which ensures proper - * structure alignment. sphlib code performs no dynamic - * allocation by itself. - * - * The context must be initialized before use, using the - * sph_XXX_init() function. This function sets the context - * state to proper initial values for hashing. - * - * Since all state data is contained within the context structure, - * sphlib is thread-safe and reentrant: several hash - * computations may be performed in parallel, provided that they do not - * operate on the same context. Moreover, a running computation can be - * cloned by copying the context (with a simple memcpy()): - * the context and its clone are then independant and may be updated - * with new data and/or closed without interfering with each other. - * Similarly, a context structure can be moved in memory at will: - * context structures contain no pointer, in particular no pointer to - * themselves. - * - * @subsection dataio Data input - * - * Hashed data is input with the sph_XXX() fonction, which - * takes as parameters a pointer to the context, a pointer to the data - * to hash, and the number of data bytes to hash. The context is updated - * with the new data. - * - * Data can be input in one or several calls, with arbitrary input lengths. - * However, it is best, performance wise, to input data by relatively big - * chunks (say a few kilobytes), because this allows sphlib to - * optimize things and avoid internal copying. - * - * When all data has been input, the context can be closed with - * sph_XXX_close(). The hash output is computed and written - * into the provided buffer. The caller must take care to provide a - * buffer of appropriate length; e.g., when using SHA-1, the output is - * a 20-byte word, therefore the output buffer must be at least 20-byte - * long. - * - * For some hash functions, the sph_XXX_addbits_and_close() - * function can be used instead of sph_XXX_close(). This - * function can take a few extra bits to be added at - * the end of the input message. This allows hashing messages with a - * bit length which is not a multiple of 8. The extra bits are provided - * as an unsigned integer value, and a bit count. The bit count must be - * between 0 and 7, inclusive. The extra bits are provided as bits 7 to - * 0 (bits of numerical value 128, 64, 32... downto 0), in that order. - * For instance, to add three bits of value 1, 1 and 0, the unsigned - * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count - * will be 3. - * - * The SPH_SIZE_XXX macro is defined for each hash function; - * it evaluates to the function output size, expressed in bits. For instance, - * SPH_SIZE_sha1 evaluates to 160. - * - * When closed, the context is automatically reinitialized and can be - * immediately used for another computation. It is not necessary to call - * sph_XXX_init() after a close. Note that - * sph_XXX_init() can still be called to "reset" a context, - * i.e. forget previously input data, and get back to the initial state. 
- * - * @subsection alignment Data alignment - * - * "Alignment" is a property of data, which is said to be "properly - * aligned" when its emplacement in memory is such that the data can - * be optimally read by full words. This depends on the type of access; - * basically, some hash functions will read data by 32-bit or 64-bit - * words. sphlib does not mandate such alignment for input - * data, but using aligned data can substantially improve performance. - * - * As a rule, it is best to input data by chunks whose length (in bytes) - * is a multiple of eight, and which begins at "generally aligned" - * addresses, such as the base address returned by a call to - * malloc(). - * - * @section functions Implemented functions - * - * We give here the list of implemented functions. They are grouped by - * family; to each family corresponds a specific header file. Each - * individual function has its associated "short name". Please refer to - * the documentation for that header file to get details on the hash - * function denomination and provenance. - * - * Note: the functions marked with a '(64)' in the list below are - * available only if the C compiler provides an integer type of length - * 64 bits or more. Such a type is mandatory in the latest C standard - * (ISO 9899:1999, aka "C99") and is present in several older compilers - * as well, so chances are that such a type is available. - * - * - HAVAL family: file sph_haval.h - * - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3 - * - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4 - * - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5 - * - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3 - * - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4 - * - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5 - * - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3 - * - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4 - * - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5 - * - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3 - * - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4 - * - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5 - * - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3 - * - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4 - * - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5 - * - MD2: file sph_md2.h, short name: md2 - * - MD4: file sph_md4.h, short name: md4 - * - MD5: file sph_md5.h, short name: md5 - * - PANAMA: file sph_panama.h, short name: panama - * - RadioGatun family: file sph_radiogatun.h - * - RadioGatun[32]: short name: radiogatun32 - * - RadioGatun[64]: short name: radiogatun64 (64) - * - RIPEMD family: file sph_ripemd.h - * - RIPEMD: short name: ripemd - * - RIPEMD-128: short name: ripemd128 - * - RIPEMD-160: short name: ripemd160 - * - SHA-0: file sph_sha0.h, short name: sha0 - * - SHA-1: file sph_sha1.h, short name: sha1 - * - SHA-2 family, 32-bit hashes: file sph_sha2.h - * - SHA-224: short name: sha224 - * - SHA-256: short name: sha256 - * - SHA-384: short name: sha384 (64) - * - SHA-512: short name: sha512 (64) - * - Tiger family: file sph_tiger.h - * - Tiger: short name: tiger (64) - * - Tiger2: short name: tiger2 (64) - * - WHIRLPOOL family: file sph_whirlpool.h - * - WHIRLPOOL-0: short name: whirlpool0 (64) - * - WHIRLPOOL-1: short name: whirlpool1 (64) - * - WHIRLPOOL: short name: whirlpool (64) - * - * The fourteen second-round SHA-3 candidates are also implemented; - * when applicable, the 
implementations follow the "final" specifications - * as published for the third round of the SHA-3 competition (BLAKE, - * Groestl, JH, Keccak and Skein have been tweaked for third round). - * - * - BLAKE family: file sph_blake.h - * - BLAKE-224: short name: blake224 - * - BLAKE-256: short name: blake256 - * - BLAKE-384: short name: blake384 - * - BLAKE-512: short name: blake512 - * - BMW (Blue Midnight Wish) family: file sph_bmw.h - * - BMW-224: short name: bmw224 - * - BMW-256: short name: bmw256 - * - BMW-384: short name: bmw384 (64) - * - BMW-512: short name: bmw512 (64) - * - CubeHash family: file sph_cubehash.h (specified as - * CubeHash16/32 in the CubeHash specification) - * - CubeHash-224: short name: cubehash224 - * - CubeHash-256: short name: cubehash256 - * - CubeHash-384: short name: cubehash384 - * - CubeHash-512: short name: cubehash512 - * - ECHO family: file sph_echo.h - * - ECHO-224: short name: echo224 - * - ECHO-256: short name: echo256 - * - ECHO-384: short name: echo384 - * - ECHO-512: short name: echo512 - * - Fugue family: file sph_fugue.h - * - Fugue-224: short name: fugue224 - * - Fugue-256: short name: fugue256 - * - Fugue-384: short name: fugue384 - * - Fugue-512: short name: fugue512 - * - Groestl family: file sph_groestl.h - * - Groestl-224: short name: groestl224 - * - Groestl-256: short name: groestl256 - * - Groestl-384: short name: groestl384 - * - Groestl-512: short name: groestl512 - * - Hamsi family: file sph_hamsi.h - * - Hamsi-224: short name: hamsi224 - * - Hamsi-256: short name: hamsi256 - * - Hamsi-384: short name: hamsi384 - * - Hamsi-512: short name: hamsi512 - * - JH family: file sph_jh.h - * - JH-224: short name: jh224 - * - JH-256: short name: jh256 - * - JH-384: short name: jh384 - * - JH-512: short name: jh512 - * - Keccak family: file sph_keccak.h - * - Keccak-224: short name: keccak224 - * - Keccak-256: short name: keccak256 - * - Keccak-384: short name: keccak384 - * - Keccak-512: short name: keccak512 - * - Luffa family: file sph_luffa.h - * - Luffa-224: short name: luffa224 - * - Luffa-256: short name: luffa256 - * - Luffa-384: short name: luffa384 - * - Luffa-512: short name: luffa512 - * - Shabal family: file sph_shabal.h - * - Shabal-192: short name: shabal192 - * - Shabal-224: short name: shabal224 - * - Shabal-256: short name: shabal256 - * - Shabal-384: short name: shabal384 - * - Shabal-512: short name: shabal512 - * - SHAvite-3 family: file sph_shavite.h - * - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"): - * short name: shabal224 - * - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"): - * short name: shabal256 - * - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"): - * short name: shabal384 - * - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"): - * short name: shabal512 - * - SIMD family: file sph_simd.h - * - SIMD-224: short name: simd224 - * - SIMD-256: short name: simd256 - * - SIMD-384: short name: simd384 - * - SIMD-512: short name: simd512 - * - Skein family: file sph_skein.h - * - Skein-224 (nominally specified as Skein-512-224): short name: - * skein224 (64) - * - Skein-256 (nominally specified as Skein-512-256): short name: - * skein256 (64) - * - Skein-384 (nominally specified as Skein-512-384): short name: - * skein384 (64) - * - Skein-512 (nominally specified as Skein-512-512): short name: - * skein512 (64) - * - * For the second-round SHA-3 candidates, the functions are as specified - * for round 2, i.e. with the "tweaks" that some candidates added - * between round 1 and round 2. 
Also, some of the submitted packages for - * round 2 contained errors, in the specification, reference code, or - * both. sphlib implements the corrected versions. - */ - -/** @hideinitializer - * Unsigned integer type whose length is at least 32 bits; on most - * architectures, it will have a width of exactly 32 bits. Unsigned C - * types implement arithmetics modulo a power of 2; use the - * SPH_T32() macro to ensure that the value is truncated - * to exactly 32 bits. Unless otherwise specified, all macros and - * functions which accept sph_u32 values assume that these - * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures - * where sph_u32 is larger than that. - */ -typedef __arch_dependant__ sph_u32; - -/** @hideinitializer - * Signed integer type corresponding to sph_u32; it has - * width 32 bits or more. - */ -typedef __arch_dependant__ sph_s32; - -/** @hideinitializer - * Unsigned integer type whose length is at least 64 bits; on most - * architectures which feature such a type, it will have a width of - * exactly 64 bits. C99-compliant platform will have this type; it - * is also defined when the GNU compiler (gcc) is used, and on - * platforms where unsigned long is large enough. If this - * type is not available, then some hash functions which depends on - * a 64-bit type will not be available (most notably SHA-384, SHA-512, - * Tiger and WHIRLPOOL). - */ -typedef __arch_dependant__ sph_u64; - -/** @hideinitializer - * Signed integer type corresponding to sph_u64; it has - * width 64 bits or more. - */ -typedef __arch_dependant__ sph_s64; - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u32. Depending on - * how this type is defined, a suffix such as UL may - * be appended to the argument. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C32(x) - -/** - * Truncate a 32-bit value to exactly 32 bits. On most systems, this is - * a no-op, recognized as such by the compiler. - * - * @param x the value to truncate (of type sph_u32) - */ -#define SPH_T32(x) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTL32(x, n) - -/** - * Rotate a 32-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 31. This macro assumes that its - * first argument fits in 32 bits (no extra bit allowed on machines where - * sph_u32 is wider); both arguments may be evaluated - * several times. - * - * @param x the value to rotate (of type sph_u32) - * @param n the rotation count (between 1 and 31, inclusive) - */ -#define SPH_ROTR32(x, n) - -/** - * This macro is defined on systems for which a 64-bit type has been - * detected, and is used for sph_u64. - */ -#define SPH_64 - -/** - * This macro is defined on systems for the "native" integer size is - * 64 bits (64-bit values fit in one register). - */ -#define SPH_64_TRUE - -/** - * This macro expands the token x into a suitable - * constant expression of type sph_u64. Depending on - * how this type is defined, a suffix such as ULL may - * be appended to the argument. 
This macro is defined only if a - * 64-bit type was detected and used for sph_u64. - * - * @param x the token to expand into a suitable constant expression - */ -#define SPH_C64(x) - -/** - * Truncate a 64-bit value to exactly 64 bits. On most systems, this is - * a no-op, recognized as such by the compiler. This macro is defined only - * if a 64-bit type was detected and used for sph_u64. - * - * @param x the value to truncate (of type sph_u64) - */ -#define SPH_T64(x) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTL64(x, n) - -/** - * Rotate a 64-bit value by a number of bits to the left. The rotate - * count must reside between 1 and 63. This macro assumes that its - * first argument fits in 64 bits (no extra bit allowed on machines where - * sph_u64 is wider); both arguments may be evaluated - * several times. This macro is defined only if a 64-bit type was detected - * and used for sph_u64. - * - * @param x the value to rotate (of type sph_u64) - * @param n the rotation count (between 1 and 63, inclusive) - */ -#define SPH_ROTR64(x, n) - -/** - * This macro evaluates to inline or an equivalent construction, - * if available on the compilation platform, or to nothing otherwise. This - * is used to declare inline functions, for which the compiler should - * endeavour to include the code directly in the caller. Inline functions - * are typically defined in header files as replacement for macros. - */ -#define SPH_INLINE - -/** - * This macro is defined if the platform has been detected as using - * little-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_LITTLE_ENDIAN - -/** - * This macro is defined if the platform has been detected as using - * big-endian convention. This implies that the sph_u32 - * type (and the sph_u64 type also, if it is defined) has - * an exact width (i.e. exactly 32-bit, respectively 64-bit). - */ -#define SPH_BIG_ENDIAN - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in little-endian - * convention. This is the case for little-endian platforms, and also - * for the big-endian platforms which have special little-endian access - * opcodes (e.g. Ultrasparc). - */ -#define SPH_LITTLE_FAST - -/** - * This macro is defined if 32-bit words (and 64-bit words, if defined) - * can be read from and written to memory efficiently in big-endian - * convention. This is the case for little-endian platforms, and also - * for the little-endian platforms which have special big-endian access - * opcodes. - */ -#define SPH_BIG_FAST - -/** - * On some platforms, this macro is defined to an unsigned integer type - * into which pointer values may be cast. The resulting value can then - * be tested for being a multiple of 2, 4 or 8, indicating an aligned - * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses. 
- */ -#define SPH_UPTR - -/** - * When defined, this macro indicates that unaligned memory accesses - * are possible with only a minor penalty, and thus should be prefered - * over strategies which first copy data to an aligned buffer. - */ -#define SPH_UNALIGNED - -/** - * Byte-swap a 32-bit word (i.e. 0x12345678 becomes - * 0x78563412). This is an inline function which resorts - * to inline assembly on some platforms, for better performance. - * - * @param x the 32-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u32 sph_bswap32(sph_u32 x); - -/** - * Byte-swap a 64-bit word. This is an inline function which resorts - * to inline assembly on some platforms, for better performance. This - * function is defined only if a suitable 64-bit type was found for - * sph_u64 - * - * @param x the 64-bit value to byte-swap - * @return the byte-swapped value - */ -static inline sph_u64 sph_bswap64(sph_u64 x); - -/** - * Decode a 16-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16le(const void *src); - -/** - * Encode a 16-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16le(void *dst, unsigned val); - -/** - * Decode a 16-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline unsigned sph_dec16be(const void *src); - -/** - * Encode a 16-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc16be(void *dst, unsigned val); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le(const void *src); - -/** - * Decode a 32-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32le() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32le_aligned(const void *src); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le(void *dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32le() function. 
- * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32le_aligned(void *dst, sph_u32 val); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be(const void *src); - -/** - * Decode a 32-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec32be() function. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u32 sph_dec32be_aligned(const void *src); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be(void *dst, sph_u32 val); - -/** - * Encode a 32-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc32be() function. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc32be_aligned(void *dst, sph_u32 val); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le(const void *src); - -/** - * Decode a 64-bit unsigned value from memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64le() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64le_aligned(const void *src); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le(void *dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in little-endian convention - * (least significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64le() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. 
- * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64le_aligned(void *dst, sph_u64 val); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be(const void *src); - -/** - * Decode a 64-bit unsigned value from memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * source address is suitably aligned for a direct access, if the platform - * supports such things; it can thus be marginally faster than the generic - * sph_dec64be() function. This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param src the source address - * @return the decoded value - */ -static inline sph_u64 sph_dec64be_aligned(const void *src); - -/** - * Encode a 64-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function is defined only - * if a suitable 64-bit type was detected and used for sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64be(void *dst, sph_u64 val); - -/** - * Encode a 64-bit unsigned value into memory, in big-endian convention - * (most significant byte comes first). This function assumes that the - * destination address is suitably aligned for a direct access, if the - * platform supports such things; it can thus be marginally faster than - * the generic sph_enc64be() function. This function is defined - * only if a suitable 64-bit type was detected and used for - * sph_u64. - * - * @param dst the destination buffer - * @param val the value to encode - */ -static inline void sph_enc64be_aligned(void *dst, sph_u64 val); - -#endif - -/* ============== END documentation block for Doxygen ============= */ - -#ifndef DOXYGEN_IGNORE - -/* - * We want to define the types "sph_u32" and "sph_u64" which hold - * unsigned values of at least, respectively, 32 and 64 bits. These - * tests should select appropriate types for most platforms. The - * macro "SPH_64" is defined if the 64-bit is supported. - */ - -#undef SPH_64 -#undef SPH_64_TRUE - -#if defined __STDC__ && __STDC_VERSION__ >= 199901L - -/* - * On C99 implementations, we can use to get an exact 64-bit - * type, if any, or otherwise use a wider type (which must exist, for - * C99 conformance). - */ - -#include - -#ifdef UINT32_MAX -typedef uint32_t sph_u32; -typedef int32_t sph_s32; -#else -typedef uint_fast32_t sph_u32; -typedef int_fast32_t sph_s32; -#endif -#if !SPH_NO_64 -#ifdef UINT64_MAX -typedef uint64_t sph_u64; -typedef int64_t sph_s64; -#else -typedef uint_fast64_t sph_u64; -typedef int_fast64_t sph_s64; -#endif -#endif - -#define SPH_C32(x) ((sph_u32)(x)) -#if !SPH_NO_64 -#define SPH_C64(x) ((sph_u64)(x)) -#define SPH_64 1 -#endif - -#else - -/* - * On non-C99 systems, we use "unsigned int" if it is wide enough, - * "unsigned long" otherwise. This supports all "reasonable" architectures. - * We have to be cautious: pre-C99 preprocessors handle constants - * differently in '#if' expressions. Hence the shifts to test UINT_MAX. 
- */ - -#if ((UINT_MAX >> 11) >> 11) >= 0x3FF - -typedef unsigned int sph_u32; -typedef int sph_s32; - -#define SPH_C32(x) ((sph_u32)(x ## U)) - -#else - -typedef unsigned long sph_u32; -typedef long sph_s32; - -#define SPH_C32(x) ((sph_u32)(x ## UL)) - -#endif - -#if !SPH_NO_64 - -/* - * We want a 64-bit type. We use "unsigned long" if it is wide enough (as - * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9), - * "unsigned long long" otherwise, if available. We use ULLONG_MAX to - * test whether "unsigned long long" is available; we also know that - * gcc features this type, even if the libc header do not know it. - */ - -#if ((ULONG_MAX >> 31) >> 31) >= 3 - -typedef unsigned long sph_u64; -typedef long sph_s64; - -#define SPH_C64(x) ((sph_u64)(x ## UL)) - -#define SPH_64 1 - -#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ - -typedef unsigned long long sph_u64; -typedef long long sph_s64; - -#define SPH_C64(x) ((sph_u64)(x ## ULL)) - -#define SPH_64 1 - -#else - -/* - * No 64-bit type... - */ - -#endif - -#endif - -#endif - -/* - * If the "unsigned long" type has length 64 bits or more, then this is - * a "true" 64-bit architectures. This is also true with Visual C on - * amd64, even though the "long" type is limited to 32 bits. - */ -#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64) -#define SPH_64_TRUE 1 -#endif - -/* - * Implementation note: some processors have specific opcodes to perform - * a rotation. Recent versions of gcc recognize the expression above and - * use the relevant opcodes, when appropriate. - */ - -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -#if SPH_64 - -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) -#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) -#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) - -#endif - -#ifndef DOXYGEN_IGNORE -/* - * Define SPH_INLINE to be an "inline" qualifier, if available. We define - * some small macro-like functions which benefit greatly from being inlined. - */ -#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__ -#define SPH_INLINE inline -#elif defined _MSC_VER -#define SPH_INLINE __inline -#else -#define SPH_INLINE -#endif -#endif - -/* - * We define some macros which qualify the architecture. These macros - * may be explicit set externally (e.g. as compiler parameters). The - * code below sets those macros if they are not already defined. - * - * Most macros are boolean, thus evaluate to either zero or non-zero. - * The SPH_UPTR macro is special, in that it evaluates to a C type, - * or is not defined. - * - * SPH_UPTR if defined: unsigned type to cast pointers into - * - * SPH_UNALIGNED non-zero if unaligned accesses are efficient - * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian - * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian - * SPH_LITTLE_FAST non-zero if little-endian decoding is fast - * SPH_BIG_FAST non-zero if big-endian decoding is fast - * - * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit - * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN - * _must_ be non-zero in those situations. The 32-bit and 64-bit types - * _must_ also have an exact width. 
- * - * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode - * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode - * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc - * SPH_I386_GCC x86-compatible (32-bit) with gcc - * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C - * SPH_AMD64_GCC x86-compatible (64-bit) with gcc - * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C - * SPH_PPC32_GCC PowerPC, 32-bit, with gcc - * SPH_PPC64_GCC PowerPC, 64-bit, with gcc - * - * TODO: enhance automatic detection, for more architectures and compilers. - * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with - * some very fast functions (e.g. MD4) when using unaligned input data. - * The CPU-specific-with-GCC macros are useful only for inline assembly, - * normally restrained to this header file. - */ - -/* - * 32-bit x86, aka "i386 compatible". - */ -#if defined __i386__ || defined _M_IX86 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#ifdef __GNUC__ -#define SPH_DETECT_I386_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_I386_MSVC 1 -#endif - -/* - * 64-bit x86, hereafter known as "amd64". - */ -#elif defined __x86_64 || defined _M_X64 - -#define SPH_DETECT_UNALIGNED 1 -#define SPH_DETECT_LITTLE_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_AMD64_GCC 1 -#endif -#ifdef _MSC_VER -#define SPH_DETECT_AMD64_MSVC 1 -#endif - -/* - * 64-bit Sparc architecture (implies v9). - */ -#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \ - || defined __sparcv9 - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u64 -#ifdef __GNUC__ -#define SPH_DETECT_SPARCV9_GCC_64 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * 32-bit Sparc. - */ -#elif (defined __sparc__ || defined __sparc) \ - && !(defined __sparcv9 || defined __arch64__) - -#define SPH_DETECT_BIG_ENDIAN 1 -#define SPH_DETECT_UPTR sph_u32 -#if defined __GNUC__ && defined __sparc_v9__ -#define SPH_DETECT_SPARCV9_GCC_32 1 -#define SPH_DETECT_LITTLE_FAST 1 -#endif - -/* - * ARM, little-endian. - */ -#elif defined __arm__ && __ARMEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, little-endian. - */ -#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__ - -#define SPH_DETECT_LITTLE_ENDIAN 1 - -/* - * MIPS, big-endian. - */ -#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__ - -#define SPH_DETECT_BIG_ENDIAN 1 - -/* - * PowerPC. - */ -#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \ - || defined _ARCH_PPC - -/* - * Note: we do not declare cross-endian access to be "fast": even if - * using inline assembly, implementation should still assume that - * keeping the decoded word in a temporary is faster than decoding - * it again. - */ -#if defined __GNUC__ -#if SPH_64_TRUE -#define SPH_DETECT_PPC64_GCC 1 -#else -#define SPH_DETECT_PPC32_GCC 1 -#endif -#endif - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif - -/* - * Itanium, 64-bit. 
- */ -#elif defined __ia64 || defined __ia64__ \ - || defined __itanium__ || defined _M_IA64 - -#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN -#define SPH_DETECT_BIG_ENDIAN 1 -#else -#define SPH_DETECT_LITTLE_ENDIAN 1 -#endif -#if defined __LP64__ || defined _LP64 -#define SPH_DETECT_UPTR sph_u64 -#else -#define SPH_DETECT_UPTR sph_u32 -#endif - -#endif - -#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64 -#define SPH_DETECT_SPARCV9_GCC 1 -#endif - -#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED -#define SPH_UNALIGNED SPH_DETECT_UNALIGNED -#endif -#if defined SPH_DETECT_UPTR && !defined SPH_UPTR -#define SPH_UPTR SPH_DETECT_UPTR -#endif -#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN -#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN -#endif -#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN -#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN -#endif -#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST -#endif -#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST -#define SPH_BIG_FAST SPH_DETECT_BIG_FAST -#endif -#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32 -#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32 -#endif -#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64 -#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64 -#endif -#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC -#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC -#endif -#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC -#define SPH_I386_GCC SPH_DETECT_I386_GCC -#endif -#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC -#define SPH_I386_MSVC SPH_DETECT_I386_MSVC -#endif -#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC -#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC -#endif -#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC -#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC -#endif -#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC -#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC -#endif -#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC -#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC -#endif - -#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST -#define SPH_LITTLE_FAST 1 -#endif -#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST -#define SPH_BIG_FAST 1 -#endif - -#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN) -#error SPH_UPTR defined, but endianness is not known. -#endif - -#if SPH_I386_GCC && !SPH_NO_ASM - -/* - * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * values. - */ - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - -#elif SPH_AMD64_GCC && !SPH_NO_ASM - -/* - * On x86 64-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit - * and 64-bit values. - */ - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - __asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x)); - return x; -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - __asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x)); - return x; -} - -#endif - -/* - * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough - * to generate proper opcodes for endianness swapping with the pure C - * implementation below. 
- * - -#elif SPH_I386_MSVC && !SPH_NO_ASM - -static __inline sph_u32 __declspec(naked) __fastcall -sph_bswap32(sph_u32 x) -{ - __asm { - bswap ecx - mov eax,ecx - ret - } -} - -#if SPH_64 - -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - return ((sph_u64)sph_bswap32((sph_u32)x) << 32) - | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); -} - -#endif - - * - * [end of disabled code] - */ - -#else - -static SPH_INLINE sph_u32 -sph_bswap32(sph_u32 x) -{ - x = SPH_T32((x << 16) | (x >> 16)); - x = ((x & SPH_C32(0xFF00FF00)) >> 8) - | ((x & SPH_C32(0x00FF00FF)) << 8); - return x; -} - -#if SPH_64 - -/** - * Byte-swap a 64-bit value. - * - * @param x the input value - * @return the byte-swapped value - */ -static SPH_INLINE sph_u64 -sph_bswap64(sph_u64 x) -{ - x = SPH_T64((x << 32) | (x >> 32)); - x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) - | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); - x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) - | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); - return x; -} - -#endif - -#endif - -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - -/* - * On UltraSPARC systems, native ordering is big-endian, but it is - * possible to perform little-endian read accesses by specifying the - * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use - * the opcode "lda [%reg]0x88,%dst", where %reg is the register which - * contains the source address and %dst is the destination register, - * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register - * to get the address space name. The latter format is better since it - * combines an addition and the actual access in a single opcode; but - * it requires the setting (and subsequent resetting) of %asi, which is - * slow. Some operations (i.e. MD5 compression function) combine many - * successive little-endian read accesses, which may share the same - * %asi setting. The macros below contain the appropriate inline - * assembly. - */ - -#define SPH_SPARCV9_SET_ASI \ - sph_u32 sph_sparcv9_asi; \ - __asm__ __volatile__ ( \ - "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi)); - -#define SPH_SPARCV9_RESET_ASI \ - __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi)); - -#define SPH_SPARCV9_DEC32LE(base, idx) ({ \ - sph_u32 sph_sparcv9_tmp; \ - __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \ - : "=r" (sph_sparcv9_tmp) : "r" (base)); \ - sph_sparcv9_tmp; \ - }) - -#endif - -static SPH_INLINE void -sph_enc16be(void *dst, unsigned val) -{ - ((unsigned char *)dst)[0] = (val >> 8); - ((unsigned char *)dst)[1] = val; -} - -static SPH_INLINE unsigned -sph_dec16be(const void *src) -{ - return ((unsigned)(((const unsigned char *)src)[0]) << 8) - | (unsigned)(((const unsigned char *)src)[1]); -} - -static SPH_INLINE void -sph_enc16le(void *dst, unsigned val) -{ - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = val >> 8; -} - -static SPH_INLINE unsigned -sph_dec16le(const void *src) -{ - return (unsigned)(((const unsigned char *)src)[0]) - | ((unsigned)(((const unsigned char *)src)[1]) << 8); -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void -sph_enc32be(void *dst, sph_u32 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; - } else { - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; - } -#endif -#else - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc32be_aligned(void *dst, sph_u32 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u32 *)dst = sph_bswap32(val); -#elif SPH_BIG_ENDIAN - *(sph_u32 *)dst = val; -#else - ((unsigned char *)dst)[0] = (val >> 24); - ((unsigned char *)dst)[1] = (val >> 16); - ((unsigned char *)dst)[2] = (val >> 8); - ((unsigned char *)dst)[3] = val; -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32be(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif - } else { - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); - } -#endif -#else - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32be_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u32 *)src; -#else - return ((sph_u32)(((const unsigned char *)src)[0]) << 24) - | ((sph_u32)(((const unsigned char *)src)[1]) << 16) - | ((sph_u32)(((const unsigned char *)src)[2]) << 8) - | (sph_u32)(((const unsigned char *)src)[3]); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). 
- * - * @param dst the destination buffer - * @param val the 32-bit value to encode - */ -static SPH_INLINE void -sph_enc32le(void *dst, sph_u32 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; -#else - if (((SPH_UPTR)dst & 3) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap32(val); -#endif - *(sph_u32 *)dst = val; - } else { - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - } -#endif -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); -#endif -} - -/** - * Encode a 32-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (32-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc32le_aligned(void *dst, sph_u32 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u32 *)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u32 *)dst = sph_bswap32(val); -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32le(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap32(*(const sph_u32 *)src); -#else - return *(const sph_u32 *)src; -#endif -#else - if (((SPH_UPTR)src & 3) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - /* - * "__volatile__" is needed here because without it, - * gcc-3.4.3 miscompiles the code and performs the - * access before the test on the address, thus triggering - * a bus error... - */ - __asm__ __volatile__ ( - "lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * On PowerPC, this turns out not to be worth the effort: the inline - * assembly makes GCC optimizer uncomfortable, which tends to nullify - * the decoding gains. - * - * For most hash functions, using this inline assembly trick changes - * hashing speed by less than 5% and often _reduces_ it. The biggest - * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is - * less then 10%. The speed gain on CubeHash is probably due to the - * chronic shortage of registers that CubeHash endures; for the other - * functions, the generic code appears to be efficient enough already. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ( - "lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32 *)src); -#endif -#else - return *(const sph_u32 *)src; -#endif - } else { - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); - } -#endif -#else - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); -#endif -} - -/** - * Decode a 32-bit value from the provided buffer (little endian convention). 
- * The source buffer must be properly aligned. - * - * @param src the source buffer (32-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u32 -sph_dec32le_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return *(const sph_u32 *)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. - * -#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM - sph_u32 tmp; - - __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap32(*(const sph_u32 *)src); -#endif -#else - return (sph_u32)(((const unsigned char *)src)[0]) - | ((sph_u32)(((const unsigned char *)src)[1]) << 8) - | ((sph_u32)(((const unsigned char *)src)[2]) << 16) - | ((sph_u32)(((const unsigned char *)src)[3]) << 24); -#endif -} - -#if SPH_64 - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void -sph_enc64be(void *dst, sph_u64 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_LITTLE_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; - } else { - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; - } -#endif -#else - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (big endian convention). - * The destination buffer must be properly aligned. - * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc64be_aligned(void *dst, sph_u64 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u64 *)dst = sph_bswap64(val); -#elif SPH_BIG_ENDIAN - *(sph_u64 *)dst = val; -#else - ((unsigned char *)dst)[0] = (val >> 56); - ((unsigned char *)dst)[1] = (val >> 48); - ((unsigned char *)dst)[2] = (val >> 40); - ((unsigned char *)dst)[3] = (val >> 32); - ((unsigned char *)dst)[4] = (val >> 24); - ((unsigned char *)dst)[5] = (val >> 16); - ((unsigned char *)dst)[6] = (val >> 8); - ((unsigned char *)dst)[7] = val; -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). 
- * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64be(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif - } else { - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); - } -#endif -#else - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (big endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64be_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#elif SPH_BIG_ENDIAN - return *(const sph_u64 *)src; -#else - return ((sph_u64)(((const unsigned char *)src)[0]) << 56) - | ((sph_u64)(((const unsigned char *)src)[1]) << 48) - | ((sph_u64)(((const unsigned char *)src)[2]) << 40) - | ((sph_u64)(((const unsigned char *)src)[3]) << 32) - | ((sph_u64)(((const unsigned char *)src)[4]) << 24) - | ((sph_u64)(((const unsigned char *)src)[5]) << 16) - | ((sph_u64)(((const unsigned char *)src)[6]) << 8) - | (sph_u64)(((const unsigned char *)src)[7]); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * - * @param dst the destination buffer - * @param val the 64-bit value to encode - */ -static SPH_INLINE void -sph_enc64le(void *dst, sph_u64 val) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; -#else - if (((SPH_UPTR)dst & 7) == 0) { -#if SPH_BIG_ENDIAN - val = sph_bswap64(val); -#endif - *(sph_u64 *)dst = val; - } else { - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); - } -#endif -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); -#endif -} - -/** - * Encode a 64-bit value into the provided buffer (little endian convention). - * The destination buffer must be properly aligned. 
- * - * @param dst the destination buffer (64-bit aligned) - * @param val the value to encode - */ -static SPH_INLINE void -sph_enc64le_aligned(void *dst, sph_u64 val) -{ -#if SPH_LITTLE_ENDIAN - *(sph_u64 *)dst = val; -#elif SPH_BIG_ENDIAN - *(sph_u64 *)dst = sph_bswap64(val); -#else - ((unsigned char *)dst)[0] = val; - ((unsigned char *)dst)[1] = (val >> 8); - ((unsigned char *)dst)[2] = (val >> 16); - ((unsigned char *)dst)[3] = (val >> 24); - ((unsigned char *)dst)[4] = (val >> 32); - ((unsigned char *)dst)[5] = (val >> 40); - ((unsigned char *)dst)[6] = (val >> 48); - ((unsigned char *)dst)[7] = (val >> 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * - * @param src the source buffer - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64le(const void *src) -{ -#if defined SPH_UPTR -#if SPH_UNALIGNED -#if SPH_BIG_ENDIAN - return sph_bswap64(*(const sph_u64 *)src); -#else - return *(const sph_u64 *)src; -#endif -#else - if (((SPH_UPTR)src & 7) == 0) { -#if SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. - * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned( - (const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ( - "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64 *)src); -#endif -#else - return *(const sph_u64 *)src; -#endif - } else { - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); - } -#endif -#else - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); -#endif -} - -/** - * Decode a 64-bit value from the provided buffer (little endian convention). - * The source buffer must be properly aligned. - * - * @param src the source buffer (64-bit aligned) - * @return the decoded value - */ -static SPH_INLINE sph_u64 -sph_dec64le_aligned(const void *src) -{ -#if SPH_LITTLE_ENDIAN - return *(const sph_u64 *)src; -#elif SPH_BIG_ENDIAN -#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); - return tmp; -/* - * Not worth it generally. 
- * -#elif SPH_PPC32_GCC && !SPH_NO_ASM - return (sph_u64)sph_dec32le_aligned(src) - | ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32); -#elif SPH_PPC64_GCC && !SPH_NO_ASM - sph_u64 tmp; - - __asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); - return tmp; - */ -#else - return sph_bswap64(*(const sph_u64 *)src); -#endif -#else - return (sph_u64)(((const unsigned char *)src)[0]) - | ((sph_u64)(((const unsigned char *)src)[1]) << 8) - | ((sph_u64)(((const unsigned char *)src)[2]) << 16) - | ((sph_u64)(((const unsigned char *)src)[3]) << 24) - | ((sph_u64)(((const unsigned char *)src)[4]) << 32) - | ((sph_u64)(((const unsigned char *)src)[5]) << 40) - | ((sph_u64)(((const unsigned char *)src)[6]) << 48) - | ((sph_u64)(((const unsigned char *)src)[7]) << 56); -#endif -} - -#endif - -#endif /* Doxygen excluded block */ - -#endif diff --git a/algo/simd/sse2/vector.c b/algo/simd/vector.c similarity index 99% rename from algo/simd/sse2/vector.c rename to algo/simd/vector.c index e6df467..12692db 100644 --- a/algo/simd/sse2/vector.c +++ b/algo/simd/vector.c @@ -63,13 +63,13 @@ MAYBE_INLINE void fft64(void *a) { v16* const A = a; register v16 X0, X1, X2, X3, X4, X5, X6, X7; - +/* #if V16_SIZE == 8 #define X(i) A[i] #elif V16_SIZE == 4 #define X(i) A[2*i] #endif - +*/ #define X(i) X##i X0 = A[0]; @@ -623,6 +623,11 @@ void rounds(u32* state, const unsigned char* msg, short* fft) { STEP(S(1), S(2), S(3), S(0), S[3], 0, 25, 4, 20); S[0] = S(0); S[1] = S(1); S[2] = S(2); S[3] = S(3); + +#undef ROUND +#undef STEP +#undef STEP_1 +#undef STEP_2 } @@ -849,24 +854,32 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) { */ #define PERM_START 0 ROUND( 2, 10, l, 3, 11, l, 0, 8, l, 1, 9, l, 0, 3, 23, 17, 27, 0); +#undef PERM_START #define PERM_START 4 ROUND( 3, 11, h, 2, 10, h, 1, 9, h, 0, 8, h, 1, 3, 23, 17, 27, 0); +#undef PERM_START #define PERM_START 1 ROUND( 7, 15, h, 5, 13, h, 6, 14, l, 4, 12, l, 0, 28, 19, 22, 7, 0); +#undef PERM_START #define PERM_START 5 ROUND( 4, 12, h, 6, 14, h, 5, 13, l, 7, 15, l, 1, 28, 19, 22, 7, 0); +#undef PERM_START /* * 4 rounds with code 233 */ #define PERM_START 2 ROUND( 0, 4, h, 1, 5, l, 3, 7, h, 2, 6, l, 0, 29, 9, 15, 5, 1); +#undef PERM_START #define PERM_START 6 ROUND( 3, 7, l, 2, 6, h, 0, 4, l, 1, 5, h, 1, 29, 9, 15, 5, 1); +#undef PERM_START #define PERM_START 3 ROUND( 11, 15, l, 8, 12, l, 8, 12, h, 11, 15, h, 0, 4, 13, 10, 25, 1); +#undef PERM_START #define PERM_START 0 ROUND( 9, 13, h, 10, 14, h, 10, 14, l, 9, 13, l, 1, 4, 13, 10, 25, 1); +#undef PERM_START /* @@ -877,9 +890,15 @@ void rounds512(u32* state, const unsigned char* msg, short* fft) { STEP(S(3), S(0), S(1), S(2), S[2], S[3], 0, 13, 10, 1); STEP(S(2), S(3), S(0), S(1), S[4], S[5], 0, 10, 25, 2); STEP(S(1), S(2), S(3), S(0), S[6], S[7], 0, 25, 4, 3); +#undef PERM_START S[0] = S0l; S[1] = S0h; S[2] = S1l; S[3] = S1h; S[4] = S2l; S[5] = S2h; S[6] = S3l; S[7] = S3h; + +#undef ROUND +#undef STEP +#undef STEP_1 +#undef STEP_2 } void SIMD_Compress(hashState_sd * state, const unsigned char *m, int final) { diff --git a/algo/simd/sse2/vector.h b/algo/simd/vector.h similarity index 100% rename from algo/simd/sse2/vector.h rename to algo/simd/vector.h diff --git a/algo/sm3/sm3-hash-4way.c b/algo/sm3/sm3-hash-4way.c index c970d54..0dc3502 100644 --- a/algo/sm3/sm3-hash-4way.c +++ b/algo/sm3/sm3-hash-4way.c @@ -125,14 +125,14 @@ void sm3_4way_close( void *cc, void *dst ) memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); } - count[0] = mm_byteswap_32( + count[0] = 
mm_bswap_32( _mm_set1_epi32( ctx->nblocks >> 23 ) ); - count[1] = mm_byteswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + + count[1] = mm_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + ( ctx->num << 3 ) ) ); sm3_4way_compress( ctx->digest, block ); for ( i = 0; i < 8 ; i++ ) - hash[i] = mm_byteswap_32( ctx->digest[i] ); + hash[i] = mm_bswap_32( ctx->digest[i] ); } #define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm_rotl_32( x, 9 ), \ @@ -165,7 +165,7 @@ void sm3_4way_compress( __m128i *digest, __m128i *block ) int j; for ( j = 0; j < 16; j++ ) - W[j] = mm_byteswap_32( block[j] ); + W[j] = mm_bswap_32( block[j] ); for ( j = 16; j < 68; j++ ) W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ], diff --git a/algo/whirlpool/md-helper-4way.c b/algo/whirlpool/md-helper-4way.c index 4e2c631..dc3ad83 100644 --- a/algo/whirlpool/md-helper-4way.c +++ b/algo/whirlpool/md-helper-4way.c @@ -229,18 +229,18 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, #if defined BE64 #if defined PLW1 sc->buf[ SPH_MAXPAD>>3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #elif defined PLW4 memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #else sc->buf[ ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); sc->buf[ ( SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = - mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + mm256_bswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); #endif // PLW #else // LE64 #if defined PLW1 @@ -276,7 +276,7 @@ SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, for ( u = 0; u < rnum; u ++ ) { #if defined BE64 - ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); + ((__m256i*)dst)[u] = mm256_bswap_64( sc->val[u] ); #else // LE64 ((__m256i*)dst)[u] = sc->val[u]; #endif diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c index 1d96fa1..dc33a95 100644 --- a/algo/x11/c11-4way.c +++ b/algo/x11/c11-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" typedef struct { @@ -25,10 +25,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; } c11_4way_ctx_holder; @@ -42,10 +42,10 @@ void init_c11_4way_ctx() skein512_4way_init( &c11_4way_ctx.skein ); jh512_4way_init( &c11_4way_ctx.jh ); keccak512_4way_init( &c11_4way_ctx.keccak ); - init_luffa( &c11_4way_ctx.luffa, 512 ); + luffa_2way_init( &c11_4way_ctx.luffa, 512 ); cubehashInit( &c11_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &c11_4way_ctx.shavite ); - init_sd( 
&c11_4way_ctx.simd, 512 ); + simd_2way_init( &c11_4way_ctx.simd, 512 ); init_echo( &c11_4way_ctx.echo, 512 ); } @@ -56,6 +56,7 @@ void c11_4way_hash( void *state, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*2] __attribute__ ((aligned (64))); c11_4way_ctx_holder ctx; memcpy( &ctx, &c11_4way_ctx, sizeof(c11_4way_ctx) ); @@ -98,17 +99,13 @@ void c11_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &c11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_interleave_2x128( vhashB, hash2, hash3, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -136,17 +133,13 @@ void c11_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &c11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_interleave_2x128( vhashB, hash2, hash3, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, diff --git a/algo/x11/c11.c b/algo/x11/c11.c index b26791d..51ee0b5 100644 --- a/algo/x11/c11.c +++ b/algo/x11/c11.c @@ -22,9 +22,9 @@ #include "algo/echo/aes_ni/hash_api.h" #endif -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/keccak/sse2/keccak.c" #include "algo/bmw/sse2/bmw.c" diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index 3538710..c1d850c 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -12,7 +12,7 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include 
"algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" static __thread uint32_t s_ntime = UINT32_MAX; @@ -25,7 +25,7 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; } tt8_4way_ctx_holder; @@ -39,7 +39,7 @@ void init_tt8_4way_ctx() skein512_4way_init( &tt8_4way_ctx.skein ); jh512_4way_init( &tt8_4way_ctx.jh ); keccak512_4way_init( &tt8_4way_ctx.keccak ); - init_luffa( &tt8_4way_ctx.luffa, 512 ); + luffa_2way_init( &tt8_4way_ctx.luffa, 512 ); cubehashInit( &tt8_4way_ctx.cube, 512, 16, 32 ); }; @@ -139,17 +139,13 @@ void timetravel_4way_hash(void *output, const void *input) case 6: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence *)hash0, dataLen ); - memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, dataLen ); - memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, dataLen ); - memcpy( &ctx.luffa, &tt8_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, dataLen ); + mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 ); + mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 ); if ( i != 7 ) mm256_interleave_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); diff --git a/algo/x11/timetravel.c b/algo/x11/timetravel.c index fdbfef1..5dc1d3e 100644 --- a/algo/x11/timetravel.c +++ b/algo/x11/timetravel.c @@ -9,7 +9,7 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #ifdef NO_AES_NI #include "algo/groestl/sph_groestl.h" diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index e2e9c1f..ec10f19 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" static __thread uint32_t s_ntime = UINT32_MAX; static __thread int permutation[TT10_FUNC_COUNT] = { 0 }; @@ -27,10 +27,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; } tt10_4way_ctx_holder; tt10_4way_ctx_holder tt10_4way_ctx __attribute__ ((aligned (64))); @@ -43,10 +43,10 @@ void init_tt10_4way_ctx() skein512_4way_init( 
&tt10_4way_ctx.skein ); jh512_4way_init( &tt10_4way_ctx.jh ); keccak512_4way_init( &tt10_4way_ctx.keccak ); - init_luffa( &tt10_4way_ctx.luffa, 512 ); + luffa_2way_init( &tt10_4way_ctx.luffa, 512 ); cubehashInit( &tt10_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &tt10_4way_ctx.shavite ); - init_sd( &tt10_4way_ctx.simd, 512 ); + simd_2way_init( &tt10_4way_ctx.simd, 512 ); }; void timetravel10_4way_hash(void *output, const void *input) @@ -145,17 +145,13 @@ void timetravel10_4way_hash(void *output, const void *input) case 6: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence *)hash0, dataLen ); - memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, dataLen ); - memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, dataLen ); - memcpy( &ctx.luffa, &tt10_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, dataLen ); + mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 ); + mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashA, vhashA, dataLen ); + mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 ); if ( i != 9 ) mm256_interleave_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); @@ -199,17 +195,13 @@ void timetravel10_4way_hash(void *output, const void *input) case 9: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhashA, dataLen<<3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, dataLen<<3 ); - memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, dataLen<<3 ); - memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, dataLen<<3 ); - memcpy( &ctx.simd, &tt10_4way_ctx.simd, sizeof ctx.simd ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, dataLen<<3 ); + mm256_interleave_2x128( vhashA, hash0, hash1, dataLen<<3 ); + simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhashA, dataLen<<3 ); + mm256_interleave_2x128( vhashA, hash2, hash3, dataLen<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashA, vhashA, dataLen<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhashA, dataLen<<3 ); if ( i != 9 ) mm256_interleave_4x64( vhashB, hash0, hash1, hash2, hash3, dataLen<<3 ); diff --git a/algo/x11/timetravel10.c b/algo/x11/timetravel10.c index 888d53e..905610c 100644 --- a/algo/x11/timetravel10.c +++ b/algo/x11/timetravel10.c @@ -8,10 +8,10 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #ifdef NO_AES_NI #include "algo/groestl/sph_groestl.h" diff --git 
a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index 35ce68e..e8718eb 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -5,17 +5,16 @@ #include #include - #include "algo/blake/blake-hash-4way.h" #include "algo/bmw/bmw-hash-4way.h" #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" typedef struct { @@ -25,10 +24,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; } x11_4way_ctx_holder; @@ -42,10 +41,10 @@ void init_x11_4way_ctx() skein512_4way_init( &x11_4way_ctx.skein ); jh512_4way_init( &x11_4way_ctx.jh ); keccak512_4way_init( &x11_4way_ctx.keccak ); - init_luffa( &x11_4way_ctx.luffa, 512 ); + luffa_2way_init( &x11_4way_ctx.luffa, 512 ); cubehashInit( &x11_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11_4way_ctx.shavite ); - init_sd( &x11_4way_ctx.simd, 512 ); + simd_2way_init( &x11_4way_ctx.simd, 512 ); init_echo( &x11_4way_ctx.echo, 512 ); } @@ -56,6 +55,8 @@ void x11_4way_hash( void *state, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*2] __attribute__ ((aligned (64))); + x11_4way_ctx_holder ctx; memcpy( &ctx, &x11_4way_ctx, sizeof(x11_4way_ctx) ); @@ -94,21 +95,16 @@ void x11_4way_hash( void *state, const void *input ) keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - // Serial mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x11_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + // 7 Luffa parallel 2 way 128 bit + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_interleave_2x128( vhashB, hash2, hash3, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhashB, vhashB, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -136,17 +132,13 @@ void x11_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence 
*)hash1, 512 ); - memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x11_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + mm256_interleave_2x128( vhashB, hash2, hash3, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhashB, vhashB, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhashB, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, diff --git a/algo/x11/x11.c b/algo/x11/x11.c index 41e4c4f..7847926 100644 --- a/algo/x11/x11.c +++ b/algo/x11/x11.c @@ -10,10 +10,8 @@ #include "algo/jh/sph_jh.h" #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" -#include "algo/luffa/sph_luffa.h" #include "algo/cubehash/sph_cubehash.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sph_simd.h" #include "algo/echo/sph_echo.h" #ifndef NO_AES_NI @@ -21,9 +19,9 @@ #include "algo/echo/aes_ni/hash_api.h" #endif -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/keccak/sse2/keccak.c" #include "algo/bmw/sse2/bmw.c" diff --git a/algo/x11/x11evo-4way.c b/algo/x11/x11evo-4way.c index e73e52c..f7b8f4a 100644 --- a/algo/x11/x11evo-4way.c +++ b/algo/x11/x11evo-4way.c @@ -11,15 +11,12 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sph_luffa.h" -#include "algo/cubehash/sph_cubehash.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sph_simd.h" #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" typedef struct { blake512_4way_context blake; @@ -28,10 +25,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; } x11evo_4way_ctx_holder; @@ -45,10 +42,11 @@ void init_x11evo_4way_ctx() skein512_4way_init( &x11evo_4way_ctx.skein ); jh512_4way_init( &x11evo_4way_ctx.jh ); keccak512_4way_init( &x11evo_4way_ctx.keccak ); + luffa_2way_init( &x11evo_4way_ctx.luffa, 512 ); init_luffa( &x11evo_4way_ctx.luffa, 512 ); cubehashInit( &x11evo_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11evo_4way_ctx.shavite ); - init_sd( &x11evo_4way_ctx.simd, 512 ); + simd_2way_init( &x11evo_4way_ctx.simd, 512 ); init_echo( &x11evo_4way_ctx.echo, 512 ); } @@ -142,20 +140,13 @@ void x11evo_4way_hash( void *state, const void *input ) case 6: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, - sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const 
BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, - sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x11evo_4way_ctx.luffa, - sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 ); if ( i < len-1 ) mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); @@ -202,17 +193,13 @@ void x11evo_4way_hash( void *state, const void *input ) case 9: mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 64<<3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x11evo_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 64<<3 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 64<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, 64<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 64<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 64<<3 ); if ( i < len-1 ) mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 64<<3 ); diff --git a/algo/x11/x11evo.c b/algo/x11/x11evo.c index 6b1f3f9..50ab9b7 100644 --- a/algo/x11/x11evo.c +++ b/algo/x11/x11evo.c @@ -22,9 +22,9 @@ #include "algo/echo/aes_ni/hash_api.h" #endif -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" typedef struct { #ifdef NO_AES_NI diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c index b22f1d6..2604456 100644 --- a/algo/x11/x11gost-4way.c +++ b/algo/x11/x11gost-4way.c @@ -13,10 +13,10 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include "algo/gost/sph_gost.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" typedef struct { @@ -27,10 +27,10 @@ typedef struct { jh512_4way_context jh; keccak512_4way_context keccak; sph_gost512_context gost; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; } x11gost_4way_ctx_holder; @@ -45,10 +45,10 @@ void init_x11gost_4way_ctx() jh512_4way_init( &x11gost_4way_ctx.jh ); keccak512_4way_init( &x11gost_4way_ctx.keccak ); sph_gost512_init( &x11gost_4way_ctx.gost ); - init_luffa( &x11gost_4way_ctx.luffa, 
512 ); + luffa_2way_init( &x11gost_4way_ctx.luffa, 512 ); cubehashInit( &x11gost_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x11gost_4way_ctx.shavite ); - init_sd( &x11gost_4way_ctx.simd, 512 ); + simd_2way_init( &x11gost_4way_ctx.simd, 512 ); init_echo( &x11gost_4way_ctx.echo, 512 ); } @@ -59,6 +59,7 @@ void x11gost_4way_hash( void *state, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); + x11gost_4way_ctx_holder ctx; memcpy( &ctx, &x11gost_4way_ctx, sizeof(x11gost_4way_ctx) ); @@ -109,17 +110,13 @@ void x11gost_4way_hash( void *state, const void *input ) sph_gost512( &ctx.gost, hash3, 64 ); sph_gost512_close( &ctx.gost, hash3 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x11gost_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); memcpy( &ctx.cube, &x11gost_4way_ctx.cube, sizeof(cubehashParam) ); @@ -144,17 +141,12 @@ void x11gost_4way_hash( void *state, const void *input ) sph_shavite512( &ctx.shavite, hash3, 64 ); sph_shavite512_close( &ctx.shavite, hash3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x11gost_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, 512 ); diff --git a/algo/x11/x11gost.c b/algo/x11/x11gost.c index 31d391b..3356e4a 100644 --- a/algo/x11/x11gost.c +++ b/algo/x11/x11gost.c @@ -10,9 +10,9 @@ #include "algo/shavite/sph_shavite.h" #include "algo/echo/sph_echo.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/keccak/sse2/keccak.c" #include 
"algo/bmw/sse2/bmw.c" diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c index 927ea33..c8304ec 100644 --- a/algo/x13/x13-4way.c +++ b/algo/x13/x13-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" @@ -27,10 +27,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -46,10 +46,10 @@ void init_x13_4way_ctx() skein512_4way_init( &x13_4way_ctx.skein ); jh512_4way_init( &x13_4way_ctx.jh ); keccak512_4way_init( &x13_4way_ctx.keccak ); - init_luffa( &x13_4way_ctx.luffa, 512 ); + luffa_2way_init( &x13_4way_ctx.luffa, 512 ); cubehashInit( &x13_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x13_4way_ctx.shavite ); - init_sd( &x13_4way_ctx.simd, 512 ); + simd_2way_init( &x13_4way_ctx.simd, 512 ); init_echo( &x13_4way_ctx.echo, 512 ); hamsi512_4way_init( &x13_4way_ctx.hamsi ); sph_fugue512_init( &x13_4way_ctx.fugue ); @@ -104,17 +104,13 @@ void x13_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x13_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -142,17 +138,13 @@ void x13_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x13_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 
); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -168,10 +160,10 @@ void x13_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // 12 Hamsi parallel 4way 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); diff --git a/algo/x13/x13.c b/algo/x13/x13.c index 8a052c3..8ba00d6 100644 --- a/algo/x13/x13.c +++ b/algo/x13/x13.c @@ -19,9 +19,9 @@ #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" #include "algo/keccak/sse2/keccak.c" diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c index 7cc18b6..c394342 100644 --- a/algo/x13/x13sm3-4way.c +++ b/algo/x13/x13sm3-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/sm3/sm3-hash-4way.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -28,10 +28,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; sm3_4way_ctx_t sm3; hamsi512_4way_context hamsi; @@ -49,10 +49,10 @@ void init_x13sm3_4way_ctx() skein512_4way_init( &x13sm3_4way_ctx.skein ); jh512_4way_init( &x13sm3_4way_ctx.jh ); keccak512_4way_init( &x13sm3_4way_ctx.keccak ); - init_luffa( &x13sm3_4way_ctx.luffa, 512 ); + luffa_2way_init( &x13sm3_4way_ctx.luffa, 512 ); cubehashInit( &x13sm3_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x13sm3_4way_ctx.shavite ); - init_sd( &x13sm3_4way_ctx.simd, 512 ); + simd_2way_init( &x13sm3_4way_ctx.simd, 512 ); init_echo( &x13sm3_4way_ctx.echo, 512 ); sm3_4way_init( &x13sm3_4way_ctx.sm3 ); hamsi512_4way_init( &x13sm3_4way_ctx.hamsi ); @@ -111,17 +111,13 @@ void x13sm3_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( 
&ctx.luffa, &x13sm3_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -149,17 +145,13 @@ void x13sm3_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x13sm3_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -190,12 +182,13 @@ void x13sm3_4way_hash( void *state, const void *input ) sm3_4way( &ctx.sm3, vhash, 64 ); sm3_4way_close( &ctx.sm3, sm3_vhash ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, sm3_vhash, 512 ); - // Hamsi parallel 32 bit - hamsi512_4way( &ctx.hamsi, sm3_vhash, 64 ); + // Hamsi parallel 4x32x2 + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); diff --git a/algo/x13/x13sm3.c b/algo/x13/x13sm3.c index 8724cef..c7674a4 100644 --- a/algo/x13/x13sm3.c +++ b/algo/x13/x13sm3.c @@ -15,9 +15,9 @@ #include "algo/fugue/sph_fugue.h" #include "algo/sm3/sph_sm3.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/echo/sse2/sph_echo.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c index bd6d392..652fcad 100644 --- a/algo/x14/polytimos-4way.c +++ b/algo/x14/polytimos-4way.c @@ -9,8 +9,7 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/fugue//sph_fugue.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" -//#include "algo/shabal/sph_shabal.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/gost/sph_gost.h" #include "algo/echo/aes_ni/hash_api.h" @@ -18,7 +17,7 @@ typedef struct { skein512_4way_context skein; shabal512_4way_context shabal; hashState_echo echo; - hashState_luffa luffa; + 
luffa_2way_context luffa; sph_fugue512_context fugue; sph_gost512_context gost; } poly_4way_ctx_holder; @@ -27,12 +26,12 @@ poly_4way_ctx_holder poly_4way_ctx; void init_polytimos_4way_ctx() { - skein512_4way_init( &poly_4way_ctx.skein ); - shabal512_4way_init( &poly_4way_ctx.shabal ); - init_echo( &poly_4way_ctx.echo, 512 ); - init_luffa( &poly_4way_ctx.luffa, 512 ); - sph_fugue512_init( &poly_4way_ctx.fugue ); - sph_gost512_init( &poly_4way_ctx.gost ); + skein512_4way_init( &poly_4way_ctx.skein ); + shabal512_4way_init( &poly_4way_ctx.shabal ); + init_echo( &poly_4way_ctx.echo, 512 ); + luffa_2way_init( &poly_4way_ctx.luffa, 512 ); + sph_fugue512_init( &poly_4way_ctx.fugue ); + sph_gost512_init( &poly_4way_ctx.gost ); } void polytimos_4way_hash( void *output, const void *input ) @@ -67,17 +66,13 @@ void polytimos_4way_hash( void *output, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &poly_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); diff --git a/algo/x14/polytimos.c b/algo/x14/polytimos.c index 6673628..d72792a 100644 --- a/algo/x14/polytimos.c +++ b/algo/x14/polytimos.c @@ -8,7 +8,7 @@ #include "algo/skein/sph_skein.h" #include "algo/echo/sph_echo.h" #include "algo/fugue//sph_fugue.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/shabal/sph_shabal.h" #include "algo/gost/sph_gost.h" #ifndef NO_AES_NI diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 0a02fa9..85d277d 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/hamsi-hash-4way.h" @@ -29,10 +29,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -45,15 +45,14 @@ void init_x14_4way_ctx() { blake512_4way_init( &x14_4way_ctx.blake ); bmw512_4way_init( &x14_4way_ctx.bmw ); - 
sph_bmw512_init( &x14_4way_ctx.bmw ); init_groestl( &x14_4way_ctx.groestl, 64 ); skein512_4way_init( &x14_4way_ctx.skein ); jh512_4way_init( &x14_4way_ctx.jh ); keccak512_4way_init( &x14_4way_ctx.keccak ); - init_luffa( &x14_4way_ctx.luffa, 512 ); + luffa_2way_init( &x14_4way_ctx.luffa, 512 ); cubehashInit( &x14_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x14_4way_ctx.shavite ); - init_sd( &x14_4way_ctx.simd, 512 ); + simd_2way_init( &x14_4way_ctx.simd, 512 ); init_echo( &x14_4way_ctx.echo, 512 ); hamsi512_4way_init( &x14_4way_ctx.hamsi ); sph_fugue512_init( &x14_4way_ctx.fugue ); @@ -109,17 +108,13 @@ void x14_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x14_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -147,17 +142,13 @@ void x14_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x14_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -173,10 +164,10 @@ void x14_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // 12 Hamsi parallel 4way 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); diff --git a/algo/x14/x14.c b/algo/x14/x14.c index 8d1c928..014966f 100644 --- 
a/algo/x14/x14.c +++ b/algo/x14/x14.c @@ -20,9 +20,9 @@ #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/echo/sse2/sph_echo.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index 56e4b55..7cd7a3d 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -12,14 +12,13 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/hamsi-hash-4way.h" -//#include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" @@ -31,13 +30,12 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; -// sph_hamsi512_context hamsi; sph_fugue512_context fugue; shabal512_4way_context shabal; sph_whirlpool_context whirlpool; @@ -53,13 +51,12 @@ void init_x15_4way_ctx() skein512_4way_init( &x15_4way_ctx.skein ); jh512_4way_init( &x15_4way_ctx.jh ); keccak512_4way_init( &x15_4way_ctx.keccak ); - init_luffa( &x15_4way_ctx.luffa, 512 ); + luffa_2way_init( &x15_4way_ctx.luffa, 512 ); cubehashInit( &x15_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x15_4way_ctx.shavite ); - init_sd( &x15_4way_ctx.simd, 512 ); + simd_2way_init( &x15_4way_ctx.simd, 512 ); init_echo( &x15_4way_ctx.echo, 512 ); hamsi512_4way_init( &x15_4way_ctx.hamsi ); -// sph_hamsi512_init( &x15_4way_ctx.hamsi ); sph_fugue512_init( &x15_4way_ctx.fugue ); shabal512_4way_init( &x15_4way_ctx.shabal ); sph_whirlpool_init( &x15_4way_ctx.whirlpool ); @@ -114,17 +111,13 @@ void x15_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x15_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ 
-152,17 +145,13 @@ void x15_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x15_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -178,24 +167,11 @@ void x15_4way_hash( void *state, const void *input ) (const BitSequence *) hash3, 512 ); // 12 Hamsi parallel 4way 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); -/* - // 12 Hamsi - sph_hamsi512( &ctx.hamsi, hash0, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash0 ); - memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash1, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash1 ); - memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash2, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash2 ); - memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash3, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash3 ); -*/ + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + // 13 Fugue sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); diff --git a/algo/x15/x15.c b/algo/x15/x15.c index f96c684..e94f015 100644 --- a/algo/x15/x15.c +++ b/algo/x15/x15.c @@ -21,9 +21,9 @@ #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" #include "algo/keccak/sse2/keccak.c" diff --git a/algo/x17/hmq1725.c b/algo/x17/hmq1725.c index 9345f0d..b03b2be 100644 --- a/algo/x17/hmq1725.c +++ b/algo/x17/hmq1725.c @@ -23,9 +23,9 @@ #include "algo/groestl/aes_ni/hash-groestl.h" #include "algo/echo/aes_ni/hash_api.h" #endif -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/jh/sse2/jh_sse2_opt64.h" typedef struct { diff --git a/algo/x17/x16r-4way.c b/algo/x17/x16r-4way.c index 6b967af..de054a8 100644 --- a/algo/x17/x16r-4way.c +++ b/algo/x17/x16r-4way.c @@ -19,9 +19,9 @@ #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" #include 
"algo/shavite/sph_shavite.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" @@ -41,10 +41,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hamsi512_4way_context hamsi; sph_fugue512_context fugue; shabal512_4way_context shabal; @@ -68,6 +68,10 @@ void x16r_4way_hash( void* output, const void* input ) uint32_t hash2[24] __attribute__ ((aligned (64))); uint32_t hash3[24] __attribute__ ((aligned (64))); uint32_t vhash[24*4] __attribute__ ((aligned (64))); +// uint32_t inp0[24] __attribute__ ((aligned (64))); +// uint32_t inp1[24] __attribute__ ((aligned (64))); +// uint32_t inp2[24] __attribute__ ((aligned (64))); +// uint32_t inp3[24] __attribute__ ((aligned (64))); x16r_4way_ctx_holder ctx; @@ -75,7 +79,6 @@ void x16r_4way_hash( void* output, const void* input ) void *in1 = (void*) hash1; void *in2 = (void*) hash2; void *in3 = (void*) hash3; - int size = 80; mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, input, 640 ); @@ -111,7 +114,7 @@ void x16r_4way_hash( void* output, const void* input ) blake512_4way( &ctx.blake, vhash, size ); } blake512_4way_close( &ctx.blake, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case BMW: bmw512_4way_init( &ctx.bmw ); @@ -123,7 +126,7 @@ void x16r_4way_hash( void* output, const void* input ) bmw512_4way( &ctx.bmw, vhash, size ); } bmw512_4way_close( &ctx.bmw, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case GROESTL: init_groestl( &ctx.groestl, 64 ); @@ -149,7 +152,7 @@ void x16r_4way_hash( void* output, const void* input ) skein512_4way( &ctx.skein, vhash, size ); } skein512_4way_close( &ctx.skein, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case JH: jh512_4way_init( &ctx.jh ); @@ -161,7 +164,7 @@ void x16r_4way_hash( void* output, const void* input ) jh512_4way( &ctx.jh, vhash, size ); } jh512_4way_close( &ctx.jh, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case KECCAK: keccak512_4way_init( &ctx.keccak ); @@ -173,21 +176,17 @@ void x16r_4way_hash( void* output, const void* input ) keccak512_4way( &ctx.keccak, vhash, size ); } keccak512_4way_close( &ctx.keccak, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case LUFFA: - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)in0, size ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)in1, size ); - init_luffa( &ctx.luffa, 512 ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)in2, size ); - init_luffa( &ctx.luffa, 512 ); - 
update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)in3, size ); + mm256_interleave_2x128( vhash, in0, in1, size<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, size ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, in2, in3, size<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, size); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); break; case CUBEHASH: cubehashReinit( &ctx.cube ); @@ -218,18 +217,14 @@ void x16r_4way_hash( void* output, const void* input ) sph_shavite512_close( &ctx.shavite, hash3 ); break; case SIMD: - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence*)in0, size<<3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence*)in1, size<<3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence*)in2, size<<3 ); - init_sd( &ctx.simd, 512 ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence*)in3, size<<3 ); + mm256_interleave_2x128( vhash, in0, in1, size<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, in2, in3, size<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, size<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); break; case ECHO: init_echo( &ctx.echo, 512 ); @@ -246,11 +241,11 @@ void x16r_4way_hash( void* output, const void* input ) (const BitSequence*)in3, size<<3 ); break; case HAMSI: - mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 ); + mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); hamsi512_4way_init( &ctx.hamsi ); hamsi512_4way( &ctx.hamsi, vhash, size ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; case FUGUE: sph_fugue512_init( &ctx.fugue ); @@ -271,7 +266,7 @@ void x16r_4way_hash( void* output, const void* input ) shabal512_4way_init( &ctx.shabal ); shabal512_4way( &ctx.shabal, vhash, size ); shabal512_4way_close( &ctx.shabal, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); break; case WHIRLPOOL: sph_whirlpool_init( &ctx.whirlpool ); @@ -292,9 +287,13 @@ void x16r_4way_hash( void* output, const void* input ) sha512_4way_init( &ctx.sha512 ); sha512_4way( &ctx.sha512, vhash, size ); sha512_4way_close( &ctx.sha512, vhash ); - mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, size<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); break; } +// in0 = (void*) hash0; +// in1 = (void*) hash1; +// in2 = (void*) hash2; +// in3 = (void*) hash3; size = 64; } memcpy( output, hash0, 32 ); @@ -351,28 +350,28 @@ int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce, x16r_4way_hash( hash, vdata ); pdata[19] = n; - if ( ( hash[7] <= Htarg ) && fulltest( hash, ptarget ) ) + if ( hash[7] <= Htarg && fulltest( hash, ptarget ) ) { found[0] = true; num_found++; nonces[0] = n; work_set_target_ratio( work, hash ); } - if ( ( (hash+8)[7] <= Htarg ) && fulltest( hash+8, ptarget ) ) + if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) { 
found[1] = true; num_found++; nonces[1] = n+1; work_set_target_ratio( work, hash+8 ); } - if ( ( (hash+16)[7] <= Htarg ) && fulltest( hash+16, ptarget ) ) + if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) ) { found[2] = true; num_found++; nonces[2] = n+2; work_set_target_ratio( work, hash+16 ); } - if ( ( (hash+24)[7] <= Htarg ) && fulltest( hash+24, ptarget ) ) + if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) ) { found[3] = true; num_found++; diff --git a/algo/x17/x16r.c b/algo/x17/x16r.c index 08b5a42..ff5e48d 100644 --- a/algo/x17/x16r.c +++ b/algo/x17/x16r.c @@ -16,9 +16,9 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/echo/sph_echo.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" @@ -117,7 +117,7 @@ void x16r_hash( void* output, const void* input ) case GROESTL: #ifdef NO_AES_NI sph_groestl512_init( &ctx.groestl ); - sph_groestl512( &ctx.groestl, in, size<<3 ); + sph_groestl512( &ctx.groestl, in, size ); sph_groestl512_close(&ctx.groestl, hash); #else init_groestl( &ctx.groestl, 64 ); diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c index 12471b4..8d4b055 100644 --- a/algo/x17/x17-4way.c +++ b/algo/x17/x17-4way.c @@ -12,10 +12,10 @@ #include "algo/skein/skein-hash-4way.h" #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/shavite/sph_shavite.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" @@ -31,10 +31,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -54,10 +54,10 @@ void init_x17_4way_ctx() skein512_4way_init( &x17_4way_ctx.skein ); jh512_4way_init( &x17_4way_ctx.jh ); keccak512_4way_init( &x17_4way_ctx.keccak ); - init_luffa( &x17_4way_ctx.luffa, 512 ); + luffa_2way_init( &x17_4way_ctx.luffa, 512 ); cubehashInit( &x17_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &x17_4way_ctx.shavite ); - init_sd( &x17_4way_ctx.simd, 512 ); + simd_2way_init( &x17_4way_ctx.simd, 512 ); init_echo( &x17_4way_ctx.echo, 512 ); hamsi512_4way_init( &x17_4way_ctx.hamsi ); sph_fugue512_init( &x17_4way_ctx.fugue ); @@ -114,18 +114,14 @@ void x17_4way_hash( void *state, const void *input ) mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - // 7 Luffa serial - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, 64 ); - memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, 64 ); - memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, 64 ); - memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, 
(BitSequence*)hash3, - (const BitSequence*)hash3, 64 ); + // 7 Luffa + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, 64 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 8 Cubehash cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, 64 ); @@ -153,17 +149,13 @@ void x17_4way_hash( void *state, const void *input ) sph_shavite512_close( &ctx.shavite, hash3 ); // 10 Simd - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, 512 ); - memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, 512 ); - memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, 512 ); - memcpy( &ctx.simd, &x17_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, 512 ); + mm256_interleave_2x128( vhash, hash0, hash1, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, 512 ); + mm256_interleave_2x128( vhash, hash2, hash3, 512 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, 512 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, 512 ); // 11 Echo update_final_echo( &ctx.echo, (BitSequence *)hash0, @@ -178,11 +170,11 @@ void x17_4way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - // 12 Hamsi parallel 4way 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + // 12 Hamsi + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); hamsi512_4way( &ctx.hamsi, vhash, 64 ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue sph_fugue512( &ctx.fugue, hash0, 64 ); diff --git a/algo/x17/x17.c b/algo/x17/x17.c index fca8a72..f190a7e 100644 --- a/algo/x17/x17.c +++ b/algo/x17/x17.c @@ -21,9 +21,9 @@ #include "algo/sha/sph_sha2.h" #include "algo/haval/sph-haval.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/blake/sse2/blake.c" #include "algo/bmw/sse2/bmw.c" #include "algo/keccak/sse2/keccak.c" diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 847dadd..1521e11 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -13,9 +13,9 @@ #include "algo/keccak/keccak-hash-4way.h" #include "algo/skein/skein-hash-4way.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa-hash-2way.h" #include "algo/cubehash/sse2/cubehash_sse2.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/simd-hash-2way.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" @@ -31,10 +31,10 @@ typedef struct { skein512_4way_context skein; jh512_4way_context jh; keccak512_4way_context keccak; - hashState_luffa luffa; + luffa_2way_context luffa; cubehashParam cube; 
sph_shavite512_context shavite; - hashState_sd simd; + simd_2way_context simd; hashState_echo echo; hamsi512_4way_context hamsi; sph_fugue512_context fugue; @@ -56,10 +56,10 @@ void init_xevan_4way_ctx() skein512_4way_init(&xevan_4way_ctx.skein); jh512_4way_init(&xevan_4way_ctx.jh); keccak512_4way_init(&xevan_4way_ctx.keccak); - init_luffa( &xevan_4way_ctx.luffa, 512 ); + luffa_2way_init( &xevan_4way_ctx.luffa, 512 ); cubehashInit( &xevan_4way_ctx.cube, 512, 16, 32 ); sph_shavite512_init( &xevan_4way_ctx.shavite ); - init_sd( &xevan_4way_ctx.simd, 512 ); + simd_2way_init( &xevan_4way_ctx.simd, 512 ); init_echo( &xevan_4way_ctx.echo, 512 ); hamsi512_4way_init( &xevan_4way_ctx.hamsi ); sph_fugue512_init( &xevan_4way_ctx.fugue ); @@ -127,20 +127,14 @@ void xevan_4way_hash( void *output, const void *input ) keccak512_4way( &ctx.keccak, vhash, dataLen ); keccak512_4way_close( &ctx.keccak, vhash ); - // Serial mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, dataLen ); + mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); + mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); + mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, dataLen ); @@ -169,17 +163,13 @@ void xevan_4way_hash( void *output, const void *input ) sph_shavite512( &ctx.shavite, hash3, dataLen ); sph_shavite512_close( &ctx.shavite, hash3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, dataLen<<3 ); @@ -192,12 +182,11 @@ void xevan_4way_hash( void *output, const void *input ) memcpy( &ctx.echo, &xevan_4way_ctx.echo, sizeof(hashState_echo) ); update_final_echo( &ctx.echo, (BitSequence 
*)hash3, (const BitSequence *) hash3, dataLen<<3 ); - - // Parallel 32 bit - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + // Parallel + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512_close( &ctx.fugue, hash0 ); @@ -278,18 +267,13 @@ void xevan_4way_hash( void *output, const void *input ) keccak512_4way_close( &ctx.keccak, vhash ); mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); - - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, - (const BitSequence*)hash0, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, - (const BitSequence*)hash1, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, - (const BitSequence*)hash2, dataLen ); - memcpy( &ctx.luffa, &xevan_4way_ctx.luffa, sizeof(hashState_luffa) ); - update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, - (const BitSequence*)hash3, dataLen ); + mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); + mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + luffa_2way_init( &ctx.luffa, 512 ); + luffa_2way_update_close( &ctx.luffa, vhash, vhash, dataLen ); + mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); cubehashUpdateDigest( &ctx.cube, (byte*)hash0, (const byte*) hash0, dataLen ); @@ -318,17 +302,13 @@ void xevan_4way_hash( void *output, const void *input ) sph_shavite512( &ctx.shavite, hash3, dataLen ); sph_shavite512_close( &ctx.shavite, hash3 ); - update_final_sd( &ctx.simd, (BitSequence *)hash0, - (const BitSequence *)hash0, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash1, - (const BitSequence *)hash1, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash2, - (const BitSequence *)hash2, dataLen<<3 ); - memcpy( &ctx.simd, &xevan_4way_ctx.simd, sizeof(hashState_sd) ); - update_final_sd( &ctx.simd, (BitSequence *)hash3, - (const BitSequence *)hash3, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash0, hash1, dataLen<<3 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); + mm256_deinterleave_2x128( hash0, hash1, vhash, dataLen<<3 ); + mm256_interleave_2x128( vhash, hash2, hash3, dataLen<<3 ); + simd_2way_init( &ctx.simd, 512 ); + simd_2way_update_close( &ctx.simd, vhash, vhash, dataLen<<3 ); + mm256_deinterleave_2x128( hash2, hash3, vhash, dataLen<<3 ); update_final_echo( &ctx.echo, (BitSequence *)hash0, (const BitSequence *) hash0, dataLen<<3 ); @@ -342,10 +322,10 @@ void xevan_4way_hash( void *output, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, dataLen<<3 ); - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); hamsi512_4way( &ctx.hamsi, vhash, dataLen ); hamsi512_4way_close( &ctx.hamsi, vhash ); - mm_deinterleave_4x32( 
hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512_close( &ctx.fugue, hash0 ); diff --git a/algo/x17/xevan.c b/algo/x17/xevan.c index f3c4f9d..c3e6918 100644 --- a/algo/x17/xevan.c +++ b/algo/x17/xevan.c @@ -11,14 +11,14 @@ #include "algo/keccak/sph_keccak.h" #include "algo/skein/sph_skein.h" #include "algo/shavite/sph_shavite.h" -#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/luffa/luffa_for_sse2.h" #include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/sph_shabal.h" #include "algo/whirlpool/sph_whirlpool.h" #include "algo/sha/sph_sha2.h" #include "algo/haval/sph-haval.h" -#include "algo/simd/sse2/nist.h" +#include "algo/simd/nist.h" #include "algo/cubehash/sse2/cubehash_sse2.h" #include #ifdef NO_AES_NI diff --git a/algo/yescrypt/yescrypt.c b/algo/yescrypt/yescrypt.c index aab95c8..78107f1 100644 --- a/algo/yescrypt/yescrypt.c +++ b/algo/yescrypt/yescrypt.c @@ -424,12 +424,17 @@ int64_t yescryptr16_get_max64() return 0xfffLL; } -bool register_yescrypt_algo( algo_gate_t* gate ) +void yescrypt_gate_base(algo_gate_t *gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT; + gate->optimizations = SSE2_OPT | AVX_OPT | SHA_OPT; gate->scanhash = (void*)&scanhash_yescrypt; gate->hash = (void*)&yescrypt_hash; gate->set_target = (void*)&scrypt_set_target; +} + +bool register_yescrypt_algo( algo_gate_t* gate ) +{ + yescrypt_gate_base( gate ); gate->get_max64 = (void*)&yescrypt_get_max64; client_key_hack = true; YESCRYPT_N = 2048; @@ -440,10 +445,7 @@ bool register_yescrypt_algo( algo_gate_t* gate ) bool register_yescryptr8_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yescrypt; - gate->hash = (void*)&yescrypt_hash; - gate->set_target = (void*)&scrypt_set_target; + yescrypt_gate_base( gate ); gate->get_max64 = (void*)&yescrypt_get_max64; client_key_hack = false; YESCRYPT_N = 2048; @@ -454,10 +456,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate ) bool register_yescryptr16_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT; - gate->scanhash = (void*)&scanhash_yescrypt; - gate->hash = (void*)&yescrypt_hash; - gate->set_target = (void*)&scrypt_set_target; + yescrypt_gate_base( gate ); gate->get_max64 = (void*)&yescryptr16_get_max64; client_key_hack = false; YESCRYPT_N = 4096; diff --git a/avxdefs.h b/avxdefs.h index fa018b4..9664beb 100644 --- a/avxdefs.h +++ b/avxdefs.h @@ -3,7 +3,7 @@ // Some tools to help using AVX and AVX2. // SSE2 is required for most 128 vector operations with the exception of -// _mm_shuffle_epi8, used by byteswap, which needs SSSE3. +// _mm_shuffle_epi8, used by bswap, which needs SSSE3. // AVX2 is required for all 256 bit vector operations. // AVX512 has more powerful 256 bit instructions but with AVX512 available // there is little reason to use them. @@ -14,133 +14,157 @@ // There exist duplicates of some functions. In general the first defined // is preferred as it is more efficient but also more restrictive and may // not be applicable. The less efficient versions are more flexible. 
+// +// Naming convention: +// +// [prefix]_[operation]_[size] +// +// prefix: +// m128: 128 bit variable vector data +// c128: 128 bit constant vector data +// mm: 128 bit intrinsic function +// m256: 256 bit variable vector data +// c256: 256 bit constant vector data +// mm256: 256 bit intrinsic function +// +// operation: +// data: variable/constant name +// function: description of operation +// +// size: size of element if applicable +// #include #include #include #include -// // 128 bit utilities and shortcuts // -// Pseudo constants, there are no real vector constants. +// Experimental code to implement compile time vector initialization +// and support for constant vectors. Useful for arrays; simple constant +// vectors should use _mm_set at run time. The supporting constant and +// function macro definitions are used only for initializing global or +// local, constant or variable vectors. +// Element size is only used for initialization; all run time references should +// use the vector overlay with any element size. +// +// Long form initialization with union member specifier: +// +// __m128i foo() +// { +// const m128_v64 x[] = { {{ 0, 0 }}, {{ 0, 0 }}, ... }; +// return x[0].m128i; +// } +// +// Short form macros with union member abstracted: +// +// __m128i foo() +// { +// const m128_v64 x_[] = { c128_zero, c128_zero, ... }; +// #define x ((const __m128i*)x_) +// return x[0]; +// #undef x +// } +// + +union m128_v64 { + uint64_t u64[2]; + __m128i m128i; +}; +typedef union m128_v64 m128_v64; + +union m128_v32 { + uint32_t u32[4]; + __m128i m128i; +}; +typedef union m128_v32 m128_v32; + +union m128_v16 { + uint16_t u16[8]; + __m128i m128i; +}; +typedef union m128_v16 m128_v16; + +union m128_v8 { + uint8_t u8[16]; + __m128i m128i; +}; +typedef union m128_v8 m128_v8; + +// Compile time definition macros, for initializing only. +// x must be a scalar constant. +#define mm_setc_64( x1, x0 ) {{ x1, x0 }} +#define mm_setc1_64( x ) {{ x, x }} + +#define mm_setc_32( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }} +#define mm_setc1_32( x ) {{ [0 ... 3] = x }} + +#define mm_setc_16( x7, x6, x5, x4, x3, x2, x1, x0 ) \ + {{ x7, x6, x5, x4, x3, x2, x1, x0 }} +#define mm_setc1_16( x ) {{ [0 ... 7] = x }} + +#define mm_setc_8( x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 ) \ + {{ x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 }} +#define mm_setc1_8( x ) {{ [0 ... 15] = x }} + +// Compile time constants, use only for initializing. +#define c128_zero mm_setc1_64( 0ULL ) +#define c128_neg1 mm_setc1_64( 0xFFFFFFFFFFFFFFFFULL ) +#define c128_one_128 mm_setc_64( 0ULL, 1ULL ) +#define c128_one_64 mm_setc1_64( 1ULL ) +#define c128_one_32 mm_setc1_32( 1UL ) +#define c128_one_16 mm_setc1_16( 1U ) +#define c128_one_8 mm_setc1_8( 1U ) + + +// compile test +static const m128_v8 yyy_ = mm_setc1_8( 3 ); +#define yyy yyy_.m128i + +static const m128_v64 zzz_[] = { c128_zero, c128_zero }; +#define zzz ((const __m128i*)zzz_) +static inline __m128i foo() +{ + m128_v64 x = mm_setc_64( 1, 2 ); + return _mm_add_epi32( zzz[0], x.m128i ); +} + +// +// Pseudo constants. // These can't be used for compile time initialization. +// These should be used for all simple vectors. Use above for +// vector array initializing.
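// Usage sketch, illustrative only: how the compile time c128_* initializers
// above and the run time pseudo constants defined below are meant to be
// combined. The names iv_, iv and v_plus_one are hypothetical.
static const m128_v64 iv_[] = { c128_one_64, c128_neg1 };  // init time only
#define iv ((const __m128i*)iv_)
static inline __m128i v_plus_one( __m128i v )
{  // m128_one_64 (below) is the run time equivalent of c128_one_64; it can be
   // used freely inside code but not in a static initializer such as iv_.
   return _mm_add_epi64( v, iv[0] );
}
#undef iv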
// Constant zero -#define mm_zero _mm_setzero_si128() +#define m128_zero _mm_setzero_si128() // Constant 1 -#define mm_one_128 _mm_set_epi64x( 0ULL, 1ULL ) -#define mm_one_64 _mm_set1_epi64x( 1ULL ) -#define mm_one_32 _mm_set1_epi32( 1UL ) -#define mm_one_16 _mm_set1_epi16( 1U ) -#define mm_one_8 _mm_set1_epi8( 1U ) +#define m128_one_128 _mm_set_epi64x( 0ULL, 1ULL ) +#define m128_one_64 _mm_set1_epi64x( 1ULL ) +#define m128_one_32 _mm_set1_epi32( 1UL ) +#define m128_one_16 _mm_set1_epi16( 1U ) +#define m128_one_8 _mm_set1_epi8( 1U ) // Constant minus 1 -#define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) +#define m128_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) // // Basic operations without equivalent SIMD intrinsic -// Bitwise not (~x) -#define mm_not( x ) _mm_xor_si128( (x), mm_neg1 ) +// Bitwise not (~v) +#define mm_not( v ) _mm_xor_si128( (v), m128_neg1 ) -// Unary negation (-a) -#define mm_negate_64( a ) _mm_sub_epi64( mm_zero, a ) -#define mm_negate_32( a ) _mm_sub_epi32( mm_zero, a ) -#define mm_negate_16( a ) _mm_sub_epi16( mm_zero, a ) +// Unary negation (-v) +#define mm_negate_64( v ) _mm_sub_epi64( m128_zero, v ) +#define mm_negate_32( v ) _mm_sub_epi32( m128_zero, v ) +#define mm_negate_16( v ) _mm_sub_epi16( m128_zero, v ) // -// Bit operations - -// Return bit n in position, all other bits zeroed. -#define mm_bitextract_64 ( x, n ) \ - _mm_and_si128( _mm_slli_epi64( mm_one_64, n ), x ) -#define mm_bitextract_32 ( x, n ) \ - _mm_and_si128( _mm_slli_epi32( mm_one_32, n ), x ) -#define mm_bitextract_16 ( x, n ) \ - _mm_and_si128( _mm_slli_epi16( mm_one_16, n ), x ) - -// Return bit n as bool -#define mm_bittest_64( x, n ) \ - _mm_and_si256( mm_one_64, _mm_srli_epi64( x, n ) ) -#define mm_bittest_32( x, n ) \ - _mm_and_si256( mm_one_32, _mm_srli_epi32( x, n ) ) -#define mm_bittest_16( x, n ) \ - _mm_and_si256( mm_one_16, _mm_srli_epi16( x, n ) ) - -// Return x with bit n set/cleared in all elements -#define mm_bitset_64( x, n ) \ - _mm_or_si128( _mm_slli_epi64( mm_one_64, n ), x ) -#define mm_bitclr_64( x, n ) \ - _mm_andnot_si128( _mm_slli_epi64( mm_one_64, n ), x ) -#define mm_bitset_32( x, n ) \ - _mm_or_si128( _mm_slli_epi32( mm_one_32, n ), x ) -#define mm_bitclr_32( x, n ) \ - _mm_andnot_si128( _mm_slli_epi32( mm_one_32, n ), x ) -#define mm_bitset_16( x, n ) \ - _mm_or_si128( _mm_slli_epi16( mm_one_16, n ), x ) -#define mm_bitclr_16( x, n ) \ - _mm_andnot_si128( _mm_slli_epi16( mm_one_16, n ), x ) - -// Return x with bit n toggled -#define mm_bitflip_64( x, n ) \ - _mm_xor_si128( _mm_slli_epi64( mm_one_64, n ), x ) -#define mm_bitflip_32( x, n ) \ - _mm_xor_si128( _mm_slli_epi32( mm_one_32, n ), x ) -#define mm_bitflip_16( x, n ) \ - _mm_xor_si128( _mm_slli_epi16( mm_one_16, n ), x ) - - -// -// Memory functions -// n = number of __m128i, bytes/16 - -inline void memset_zero_128( __m128i *dst, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = mm_zero; -} - -inline void memset_128( __m128i *dst, const __m128i a, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = a; -} - -inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) -{ - for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; -} - -// Compare data in memory, return true if different -inline bool memcmp_128( __m128i src1, __m128i src2, int n ) -{ - for ( int i = 0; i < n; i++ ) - if ( src1[i] != src2[i] ) return true; - return false; -} - -// A couple of 64 bit scalar functions -// n = bytes/8 - -inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] 
= src[i]; -} - -inline void memset_zero_64( uint64_t *src, int n ) -{ - for ( int i = 0; i < n; i++ ) src[i] = 0; -} - -inline void memset_64( uint64_t *dst, uint64_t a, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = a; -} - - -// -// Pointer cast +// Vector pointer cast // p = any aligned pointer // returns p as pointer to vector type @@ -154,26 +178,248 @@ inline void memset_64( uint64_t *dst, uint64_t a, int n ) // returns p[i] #define casti_m128i(p,i) (((__m128i*)(p))[(i)]) +// +// Memory functions +// n = number of __m128i, bytes/16 + +static inline void memset_zero_128( __m128i *dst, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = m128_zero; } + +static inline void memset_128( __m128i *dst, const __m128i a, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + +static inline void memcpy_128( __m128i *dst, const __m128i *src, int n ) +{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } + +// Compare data in memory, return true if different +static inline bool memcmp_128( __m128i src1, __m128i src2, int n ) +{ for ( int i = 0; i < n; i++ ) + if ( src1[i] != src2[i] ) return true; + return false; +} + +// A couple of 64 bit scalar functions +// n = bytes/8 + +static inline void memcpy_64( uint64_t *dst, const uint64_t *src, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = src[i]; } + +static inline void memset_zero_64( uint64_t *src, int n ) +{ for ( int i = 0; i < n; i++ ) src[i] = 0; } + +static inline void memset_64( uint64_t *dst, uint64_t a, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + + +// +// Bit operations + +// Return a vector with n bits extracted and right justified from each +// element of v starting at bit i. +static inline __m128i mm_bfextract_64( __m128i v, int i, int n ) +{ return _mm_srli_epi64( _mm_slli_epi64( v, 64 - i - n ), 64 - n ); } + +static inline __m128i mm_bfextract_32( __m128i v, int i, int n ) +{ return _mm_srli_epi32( _mm_slli_epi32( v, 32 - i - n ), 32 - n ); } + +static inline __m128i mm_bfextract_16( __m128i v, int i, int n ) +{ return _mm_srli_epi16( _mm_slli_epi16( v, 16 - i - n ), 16 - n ); } + +// Return v with n bits from a inserted starting at bit i. +static inline __m128i mm_bfinsert_64( __m128i v, __m128i a, int i, int n ) +{ return _mm_or_si128( + _mm_and_si128( v, + _mm_srli_epi64( _mm_slli_epi64( m128_neg1, 64-n ), 64-i ) ), + _mm_slli_epi64( a, i) ); +} + +static inline __m128i mm_bfinsert_32( __m128i v, __m128i a, int i, int n ) +{ return _mm_or_si128( + _mm_and_si128( v, + _mm_srli_epi32( _mm_slli_epi32( m128_neg1, 32-n ), 32-i ) ), + _mm_slli_epi32( a, i) ); +} + +static inline __m128i mm_bfinsert_16( __m128i v, __m128i a, int i, int n ) +{ return _mm_or_si128( + _mm_and_si128( v, + _mm_srli_epi16( _mm_slli_epi16( m128_neg1, 16-n ), 16-i ) ), + _mm_slli_epi16( a, i) ); +} + +// not very useful, just use a mask. +// Return vector with bit i of each element in v in position, +// all other bits zeroed. 
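+// Illustrative usage sketch (editorial, not from the upstream source): the
+// bit field extract helpers take the start bit i and field width n as plain
+// ints and return the field right justified in every lane.  For example,
+// bits [4..11] of each 64 bit lane:
+//
+//    __m128i v = _mm_set_epi64x( 0xABCDULL, 0x1234ULL );
+//    __m128i f = mm_bfextract_64( v, 4, 8 );
+//    // lane 0: (0x1234 >> 4) & 0xff = 0x23,  lane 1: (0xABCD >> 4) & 0xff = 0xbc
+//
+// The mm_bitextract_* helpers below return the selected bit in place rather
+// than right justified.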
+static inline __m128i mm_bitextract_64( __m128i v, int i ) +{ return _mm_and_si128( v, _mm_slli_epi64( m128_one_64, i ) ); } + +static inline __m128i mm_bitextract_32( __m128i v, int i ) +{ return _mm_and_si128( v, _mm_slli_epi32( m128_one_32, i ) ); } + +static inline __m128i mm_bitextract_16( __m128i v, int i ) +{ return _mm_and_si128( v, _mm_slli_epi16( m128_one_16, i ) ); } + +// obsolete, use bfextract with n = 1 +// Return vector with bit i of each element of v as a bool +// (shifted to position 0) +static inline __m128i mm_bittest_64( __m128i v, int i ) +{ return _mm_and_si128( _mm_srli_epi64( v, i ), m128_one_64 ); } + +static inline __m128i mm_bittest_32( __m128i v, int i ) +{ return _mm_and_si128( _mm_srli_epi32( v, i ), m128_one_64 ); } + +static inline __m128i mm_bittest_16( __m128i v, int i ) +{ return _mm_and_si128( _mm_srli_epi16( v, i ), m128_one_64 ); } + +// Return vector with bit i of each element in v set/cleared +static inline __m128i mm_bitset_64( __m128i v, int i ) +{ return _mm_or_si128( _mm_slli_epi64( m128_one_64, i ), v ); } + +static inline __m128i mm_bitclr_64( __m128i v, int i ) +{ return _mm_andnot_si128( _mm_slli_epi64( m128_one_64, i ), v ); } + +static inline __m128i mm_bitset_32( __m128i v, int i ) +{ return _mm_or_si128( _mm_slli_epi32( m128_one_32, i ), v ); } + +static inline __m128i mm_bitclr_32( __m128i v, int i ) +{ return _mm_andnot_si128( _mm_slli_epi32( m128_one_32, i ), v ); } + +static inline __m128i mm_bitset_16( __m128i v, int i ) +{ return _mm_or_si128( _mm_slli_epi16( m128_one_16, i ), v ); } + +static inline __m128i mm_bitclr_16( __m128i v, int i ) +{ return _mm_andnot_si128( _mm_slli_epi16( m128_one_16, i ), v ); } + +// Return vector with bit i in each element toggled +static inline __m128i mm_bitflip_64( __m128i v, int i ) +{ return _mm_xor_si128( _mm_slli_epi64( m128_one_64, i ), v ); } + +static inline __m128i mm_bitflip_32( __m128i v, int i ) +{ return _mm_xor_si128( _mm_slli_epi32( m128_one_32, i ), v ); } + +static inline __m128i mm_bitflip_16( __m128i v, int i ) +{ return _mm_xor_si128( _mm_slli_epi16( m128_one_16, i ), v ); } + + +// converting bitmask to vector mask +// return vector with each element set to -1 if the corresponding +// bit in the bitmask is set and zero if the corresponding bit is clear. +// Can be used by blend +static inline __m128i mm_mask_to_vmask_64( uint8_t m ) +{ return _mm_set_epi64x( -( (m>>1) & 1 ), -( m & 1 ) ); } + +static inline __m128i mm_mask_to_vmask_32( uint8_t m ) +{ return _mm_set_epi32( -( (m>>3) & 1 ), -( (m>>2) & 1 ), + -( (m>>1) & 1 ), -( m & 1 ) ); +} + +static inline __m128i mm_mask_to_vmask_16( uint8_t m ) +{ return _mm_set_epi16( -( (m>>7) & 1 ), -( (m>>6) & 1 ), + -( (m>>5) & 1 ), -( m>>4 & 1 ), + -( (m>>3) & 1 ), -( (m>>2) & 1 ), + -( (m>>1) & 1 ), -( m & 1 ) ); +} + +// converting immediate index to vector index, used by permute, shuffle, shift +// Return vector with each element set from the corresponding n bits in imm8 +// index i. 
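+// Illustrative usage sketch (editorial, not from the upstream source): the
+// expanded mask from mm_mask_to_vmask_* is exactly what the blend
+// instructions expect, so a 4 bit scalar mask can select whole 32 bit lanes.
+// The wrapper name below is hypothetical, shown only to make the call
+// pattern concrete.
+static inline __m128i mm_select_32_sketch( __m128i a, __m128i b, uint8_t m )
+{  // lanes of b where the corresponding bit of m is set, lanes of a elsewhere
+   return _mm_blendv_epi8( a, b, mm_mask_to_vmask_32( m ) );
+}
+// The index_to_vindex / vindex_to_imm8 converters below apply the same
+// widening idea to shuffle and permute control values.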
+static inline __m128i mm_index_to_vindex_64( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm_set_epi64x( (i >> n) & mask, i & mask ); +} + +static inline __m128i mm_index_to_vindex_32( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm_set_epi32( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ) ; +} + +static inline __m128i mm_index_to_vindex_16( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm_set_epi16( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ), + ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ), + ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ) ; +} + +static inline uint8_t mm_vindex_to_imm8_64( __m128i v, uint8_t n ) +{ m128_v64 s = (m128_v64)v; + return ( s.u64[1] << n ) | ( s.u64[0] ); +} + +static inline uint8_t mm_vindex_to_imm8_32( __m128i v, uint8_t n ) +{ m128_v32 s = (m128_v32)v; + return ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n ) + | ( s.u32[1] << n ) | ( s.u32[0] ); +} + +static inline uint8_t mm_vindex_to_imm8_16( __m128i v, uint8_t n ) +{ m128_v16 s = (m128_v16)v; + return ( s.u16[7] << 7*n ) | ( s.u16[6] << 6*n ) + | ( s.u16[5] << 5*n ) | ( s.u16[4] << 4*n ) + | ( s.u16[3] << 3*n ) | ( s.u16[2] << 2*n ) + | ( s.u16[1] << n ) | ( s.u16[0] ); +} + + // // Bit rotations // XOP is an obsolete AMD feature that has native rotation. -// _mm_roti_epi64( w, c) +// _mm_roti_epi64( v, c) // Never implemented by Intel and since removed from Zen by AMD. // Rotate bits in vector elements -#define mm_rotr_64( w, c ) _mm_or_si128( _mm_srli_epi64( w, c ), \ - _mm_slli_epi64( w, 64-(c) ) ) -#define mm_rotl_64( w, c ) _mm_or_si128( _mm_slli_epi64( w, c ), \ - _mm_srli_epi64( w, 64-(c) ) ) -#define mm_rotr_32( w, c ) _mm_or_si128( _mm_srli_epi32( w, c ), \ - _mm_slli_epi32( w, 32-(c) ) ) -#define mm_rotl_32( w, c ) _mm_or_si128( _mm_slli_epi32( w, c ), \ - _mm_srli_epi32( w, 32-(c) ) ) -#define mm_rotr_16( w, c ) _mm_or_si128( _mm_srli_epi16( w, c ), \ - _mm_slli_epi16( w, 16-(c) ) ) -#define mm_rotl_16( w, c ) _mm_or_si128( _mm_slli_epi16( w, c ), \ - _mm_srli_epi16( w, 16-(c) ) ) + +static inline __m128i mm_rotr_64( __m128i v, int c ) +{ return _mm_or_si128( _mm_srli_epi64( v, c ), _mm_slli_epi64( v, 64-(c) ) ); } + +static inline __m128i mm_rotl_64( __m128i v, int c ) +{ return _mm_or_si128( _mm_slli_epi64( v, c ), _mm_srli_epi64( v, 64-(c) ) ); } + +static inline __m128i mm_rotr_32( __m128i v, int c ) +{ return _mm_or_si128( _mm_srli_epi32( v, c ), _mm_slli_epi32( v, 32-(c) ) ); } + +static inline __m128i mm_rotl_32( __m128i v, int c ) +{ return _mm_or_si128( _mm_slli_epi32( v, c ), _mm_srli_epi32( v, 32-(c) ) ); } + +static inline __m128i mm_rotr_16( __m128i v, int c ) +{ return _mm_or_si128( _mm_srli_epi16( v, c ), _mm_slli_epi16( v, 16-(c) ) ); } + +static inline __m128i mm_rotl_16( __m128i v, int c ) +{ return _mm_or_si128( _mm_slli_epi16( v, c ), _mm_srli_epi16( v, 16-(c) ) ); } + +// Rotate bits in each element by amount in corresponding element of +// index vector +/* Needs AVX2 +static inline __m128i mm_rotrv_64( __m128i v, __m128i c ) +{ + return _mm_or_si128( + _mm_srlv_epi64( v, c ), + _mm_sllv_epi64( v, _mm_sub_epi64( _mm_set1_epi64x(64), c ) ) ); +} + +static inline __m128i mm_rotlv_64( __m128i v, __m128i c ) +{ + return _mm_or_si128( + _mm_sllv_epi64( v, c ), + _mm_srlv_epi64( v, _mm_sub_epi64( _mm_set1_epi64x(64), c ) ) ); +} + +static inline __m128i mm_rotrv_32( __m128i v, __m128i c ) +{ + return _mm_or_si128( + _mm_srlv_epi32( v, c ), + 
_mm_sllv_epi32( v, _mm_sub_epi32( _mm_set1_epi32(32), c ) ) ); +} + +static inline __m128i mm_rotlv_32( __m128i v, __m128i c ) +{ + return _mm_or_si128( + _mm_sllv_epi32( v, c ), + _mm_srlv_epi32( v, _mm_sub_epi32( _mm_set1_epi32(32), c ) ) ); +} +*/ // // Rotate elements in vector @@ -181,126 +427,107 @@ inline void memset_64( uint64_t *dst, uint64_t a, int n ) // Optimized shuffle // Swap hi/lo 64 bits in 128 bit vector -#define mm_swap_64( w ) _mm_shuffle_epi32( w, 0x4e ) +#define mm_swap_64( v ) _mm_shuffle_epi32( v, 0x4e ) -// rotate 128 bit vector by 32 bits -#define mm_rotr_1x32( w ) _mm_shuffle_epi32( w, 0x39 ) -#define mm_rotl_1x32( w ) _mm_shuffle_epi32( w, 0x93 ) +// Rotate 128 bit vector by 32 bits +#define mm_rotr_1x32( v ) _mm_shuffle_epi32( v, 0x39 ) +#define mm_rotl_1x32( v ) _mm_shuffle_epi32( v, 0x93 ) // Swap hi/lo 32 bits in each 64 bit element -#define mm_swap64_32( x ) _mm_shuffle_epi32( x, 0xb1 ) +#define mm_swap64_32( v ) _mm_shuffle_epi32( v, 0xb1 ) // Less efficient but more versatile. Use only for odd number rotations. // Use shuffle above when possible. // Rotate vector by n bytes. -#define mm_rotr128_x8( w, n ) \ - _mm_or_si128( _mm_srli_si128( w, n ), _mm_slli_si128( w, 16-(n) ) ) -#define mm_rotl128_x8( w, n ) \ - _mm_or_si128( _mm_slli_si128( w, n ), _mm_srli_si128( w, 16-(n) ) ) +static inline __m128i mm_brotr_128( __m128i v, int c ) +{ + return _mm_or_si128( _mm_bsrli_si128( v, c ), _mm_bslli_si128( v, 16-(c) ) );} + +static inline __m128i mm_brotl_128( __m128i v, int c ) +{ + return _mm_or_si128( _mm_bslli_si128( v, c ), _mm_bsrli_si128( v, 16-(c) ) ); +} // Rotate vector by c elements, use only for odd number rotations -#define mm_rotr128_x32( w, c ) mm_rotr128_x8( w, (c)>>2 ) -#define mm_rotl128_x32( w, c ) mm_rotl128_x8( w, (c)>>2 ) -#define mm_rotr128_x16( w, c ) mm_rotr128_x8( w, (c)>>1 ) -#define mm_rotl128_x16( w, c ) mm_rotl128_x8( w, (c)>>1 ) +#define mm_rotr128_x32( v, c ) mm_brotr_128( v, (c)>>2 ) +#define mm_rotl128_x32( v, c ) mm_brotl_128( v, (c)>>2 ) +#define mm_rotr128_x16( v, c ) mm_brotr_128( v, (c)>>1 ) +#define mm_rotl128_x16( v, c ) mm_brotl_128( v, (c)>>1 ) // -// Rotate elements across two 128 bit vectors as one 256 bit vector {hi,lo} +// Rotate elements across two 128 bit vectors as one 256 bit vector // Swap 128 bit source vectors in place, aka rotate 256 bits by 128 bits. 
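+// Editorial note with a worked example: when the rotation count is a
+// multiple of the element size the shuffle forms above are cheaper than the
+// shift/or pairs, e.g. rotating every 64 bit lane by exactly 32 bits:
+//
+//    mm_rotr_64( v, 32 )  ==  mm_swap64_32( v )
+//
+// the left side costs two shifts plus an or, the right side one pshufd.
+// mm_swap_128 below performs the in-place swap of two full 128 bit vectors
+// using three xors.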
// void mm128_swap128( __m128i, __m128i ) -#define mm_swap_128(hi, lo) \ +#define mm_swap_128(v1, v2) \ { \ - hi = _mm_xor_si128(hi, lo); \ - lo = _mm_xor_si128(hi, lo); \ - hi = _mm_xor_si128(hi, lo); \ + v1 = _mm_xor_si128(v1, v2); \ + v2 = _mm_xor_si128(v1, v2); \ + v1 = _mm_xor_si128(v1, v2); \ } // Rotate two 128 bit vectors in place as one 256 vector by 1 element -#define mm_rotl256_1x64( hi, lo ) \ +#define mm_rotl256_1x64( v1, v2 ) \ do { \ __m128i t; \ - hi = mm_swap_64( hi ); \ - lo = mm_swap_64( lo ); \ - t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ - lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ - hi = t; \ + v1 = mm_swap_64( v1 ); \ + v2 = mm_swap_64( v2 ); \ + t = _mm_blendv_epi8( v1, v2, _mm_set_epi64x(0xffffffffffffffffull, 0ull)); \ + v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi64x(0ull, 0xffffffffffffffffull)); \ + v1 = t; \ } while(0) -#define mm_rotr256_1x64( hi, lo ) \ +#define mm_rotr256_1x64( v1, v2 ) \ do { \ __m128i t; \ - hi = mm_swap_64( hi ); \ - lo = mm_swap_64( lo ); \ - t = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0ull, 0xffffffffffffffffull )); \ - lo = _mm_blendv_epi8( hi, lo, _mm_set_epi64x( 0xffffffffffffffffull, 0ull )); \ - hi = t; \ + v1 = mm_swap_64( v1 ); \ + v2 = mm_swap_64( v2 ); \ + t = _mm_blendv_epi8( v1, v2, _mm_set_epi64x(0ull, 0xffffffffffffffffull)); \ + v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi64x(0xffffffffffffffffull, 0ull)); \ + v1 = t; \ } while(0) -#define mm_rotl256_1x32( hi, lo ) \ +#define mm_rotl256_1x32( v1, v2 ) \ do { \ __m128i t; \ - hi = mm_swap_64( hi ); \ - lo = mm_swap_64( lo ); \ - t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ - 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ - lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ - 0ul, 0ul, 0ul, 0xfffffffful )); \ - hi = t; \ + v1 = mm_swap_64( v1 ); \ + v2 = mm_swap_64( v2 ); \ + t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ + 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ + v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ + 0ul, 0ul, 0ul, 0xfffffffful )); \ + v1 = t; \ } while(0) -#define mm_rotr256_1x32( hi, lo ) \ +#define mm_rotr256_1x32( v1, v2 ) \ do { \ __m128i t; \ - hi = mm_swap_64( hi ); \ - lo = mm_swap_64( lo ); \ - t = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ - 0ul, 0ul, 0ul, 0xfffffffful )); \ - lo = _mm_blendv_epi8( hi, lo, _mm_set_epi32( \ - 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ - hi = t; \ + v1 = mm_swap_64( v1 ); \ + v2 = mm_swap_64( v2 ); \ + t = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ + 0ul, 0ul, 0ul, 0xfffffffful )); \ + v2 = _mm_blendv_epi8( v1, v2, _mm_set_epi32( \ + 0xfffffffful, 0xfffffffful, 0xfffffffful, 0ul )); \ + v1 = t; \ } while(0) -// Return hi 128 bits with elements shifted one lane with vacated lane filled -// with data rotated from lo. -// Partially rotate elements in two 128 bit vectors as one 256 bit vector -// and return the rotated high 128 bits. -// Similar to mm_rotr256_1x32 but only a partial rotation as lo is not -// completed. It's faster than a full rotation. 
- -inline __m128i mm_rotr256hi_1x32( __m128i hi, __m128i lo, int n ) -{ - return _mm_or_si128( _mm_srli_si128( hi, n<<2 ), - _mm_slli_si128( lo, 16 - (n<<2) ) ); -} - -inline __m128i mm_rotl256hi_1x32( __m128i hi, __m128i lo, int n ) -{ - return _mm_or_si128( _mm_slli_si128( hi, n<<2 ), - _mm_srli_si128( lo, 16 - (n<<2) ) ); -} - // // Swap bytes in vector elements - -inline __m128i mm_byteswap_64( __m128i x ) -{ - return _mm_shuffle_epi8( x, _mm_set_epi8( +static inline __m128i mm_bswap_64( __m128i v ) +{ return _mm_shuffle_epi8( v, _mm_set_epi8( 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) ); } -inline __m128i mm_byteswap_32( __m128i x ) -{ - return _mm_shuffle_epi8( x, _mm_set_epi8( +static inline __m128i mm_bswap_32( __m128i v ) +{ return _mm_shuffle_epi8( v, _mm_set_epi8( 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) ); } -inline __m128i mm_byteswap_16( __m128i x ) -{ - return _mm_shuffle_epi8( x, _mm_set_epi8( +static inline __m128i mm_bswap_16( __m128i v ) +{ return _mm_shuffle_epi8( v, _mm_set_epi8( 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, 0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01 ) ); } @@ -312,101 +539,111 @@ inline __m128i mm_byteswap_16( __m128i x ) // // 256 bit utilities and Shortcuts +// Vector overlays used by compile time vector constants. +// Vector operands of these types require union member .v be +// appended to the symbol name. + +// can this be used with aes +union m256_v128 { + uint64_t v64[4]; + __m128i v128[2]; + __m256i m256i; +}; + +typedef union m256_v128 m256_v128; + +union m256_v64 { + uint64_t u64[4]; + __m256i m256i; +}; +typedef union m256_v64 m256_v64; + +union m256_v32 { + uint32_t u32[8]; + __m256i m256i; +}; +typedef union m256_v32 m256_v32; + +union m256_v16 { + uint16_t u16[16]; + __m256i m256i; +}; +typedef union m256_v16 m256_v16; + +union m256_v8 { + uint8_t u8[32]; + __m256i m256i; +}; +typedef union m256_v8 m256_v8; + +// The following macro constants and fucntions may only be used +// for compile time intialization of constant and variable vectors +// and should only be used for arrays. Use _mm256_set at run time for +// simple constant vectors. + +#define mm256_setc_64( x3, x2, x1, x0 ) {{ x3, x2, x1, x0 }} +#define mm256_setc1_64( x ) {{ [0 ... 3] = x }} + +#define mm256_setc_32( x7, x6, x5, x4, x3, x2, x1, x0 ) \ + {{ x7, x6, x5, x4, x3, x2, x1, x0 }} +#define mm256_setc1_32( x ) {{ [0 ... 7] = x }} + +#define mm256_setc_16( x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 ) \ + {{ x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 }} +#define mm256_setc1_16( x ) {{ [0 ... 15] = x }} + +#define mm256_setc_8( x31, x30, x29, x28, x27, x26, x25, x24, \ + x23, x22, x21, x20, x19, x18, x17, x16, \ + x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 ) \ + {{ x31, x30, x29, x28, x27, x26, x25, x24, \ + x23, x22, x21, x20, x19, x18, x17, x16, \ + x15, x14, x13, x12, x11, x10, x09, x08, \ + x07, x06, x05, x04, x03, x02, x01, x00 }} +#define mm256_setc1_8( x ) {{ [0 ... 31] = x }} + +// Predefined compile time constant vectors. +// Use Pseudo constants at run time for all simple constant vectors. 
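+// Illustrative sketch (editorial, not from the upstream source): the union
+// overlays plus the setc macros permit static initialization, which the
+// _mm256_set* intrinsics cannot provide.  As declared above the vector
+// member is named m256i, so a hypothetical constant would be used as:
+//
+//    static const m256_v64 k_sketch = mm256_setc1_64( 0x0123456789abcdefULL );
+//    ...
+//    y = _mm256_xor_si256( x, k_sketch.m256i );
+//
+// The c256_* constants that follow are predefined instances of this pattern.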
+#define c256_zero mm256_setc1_64( 0ULL ) +#define c256_neg1 mm256_setc1_64( 0xFFFFFFFFFFFFFFFFULL ) +#define c256_one_256 mm256_setc_64( 0ULL, 0ULL, 0ULL, 1ULL ) +#define c256_one_128 mm256_setc_64( 0ULL, 1ULL, 0ULL, 1ULL ) +#define c256_one_64 mm256_setc1_64( 1ULL ) +#define c256_one_32 mm256_setc1_32( 1UL ) +#define c256_one_16 mm256_setc1_16( 1U ) +#define c256_one_8 mm256_setc1_8( 1U ) + // -// Pseudo constants, there are no real vector constants. -// These can't be used for compile time initialization +// Pseudo constants. +// These can't be used for compile time initialization but are preferable +// for simple constant vectors at run time. // Constant zero -#define mm256_zero _mm256_setzero_si256() +#define m256_zero _mm256_setzero_si256() // Constant 1 -#define mm256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL ) -#define mm256_one_64 _mm256_set1_epi64x( 1ULL ) -#define mm256_one_32 _mm256_set1_epi32( 1UL ) -#define mm256_one_16 _mm256_set1_epi16( 1U ) +#define m256_one_256 _mm256_set_epi64x( 0ULL, 0ULL, 0ULL, 1ULL ) +#define m256_one_128 _mm256_set_epi64x( 0ULL, 1ULL, 0ULL, 1ULL ) +#define m256_one_64 _mm256_set1_epi64x( 1ULL ) +#define m256_one_32 _mm256_set1_epi32( 1UL ) +#define m256_one_16 _mm256_set1_epi16( 1U ) +#define m256_one_8 _mm256_set1_epi16( 1U ) // Constant minus 1 -#define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) +#define m256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL ) // // Basic operations without SIMD equivalent // Bitwise not ( ~x ) -#define mm256_not( x ) _mm256_xor_si256( (x), mm256_neg1 ) \ +#define mm256_not( x ) _mm256_xor_si256( (x), m256_neg1 ) \ // Unary negation ( -a ) -#define mm256_negate_64( a ) _mm256_sub_epi64( mm256_zero, a ) -#define mm256_negate_32( a ) _mm256_sub_epi32( mm256_zero, a ) -#define mm256_negate_16( a ) _mm256_sub_epi16( mm256_zero, a ) - -// -// Bit operations - -// return bit n in position, all othr bits cleared -#define mm256_bitextract_64 ( x, n ) \ - _mm256_and_si128( _mm256_slli_epi64( mm256_one_64, n ), x ) -#define mm256_bitextract_32 ( x, n ) \ - _mm256_and_si128( _mm256_slli_epi32( mm256_one_32, n ), x ) -#define mm256_bitextract_16 ( x, n ) \ - _mm256_and_si128( _mm256_slli_epi16( mm256_one_16, n ), x ) - -// Return bit n as bool (bit 0) -#define mm256_bittest_64( x, n ) \ - _mm256_and_si256( mm256_one_64, _mm256_srli_epi64( x, n ) ) -#define mm256_bittest_32( x, n ) \ - _mm256_and_si256( mm256_one_32, _mm256_srli_epi32( x, n ) ) -#define mm256_bittest_16( x, n ) \ - _mm256_and_si256( mm256_one_16, _mm256_srli_epi16( x, n ) ) - -// Return x with bit n set/cleared in all elements -#define mm256_bitset_64( x, n ) \ - _mm256_or_si256( _mm256_slli_epi64( mm256_one_64, n ), x ) -#define mm256_bitclr_64( x, n ) \ - _mm256_andnot_si256( _mm256_slli_epi64( mm256_one_64, n ), x ) -#define mm256_bitset_32( x, n ) \ - _mm256_or_si256( _mm256_slli_epi32( mm256_one_32, n ), x ) -#define mm256_bitclr_32( x, n ) \ - _mm256_andnot_si256( _mm256_slli_epi32( mm256_one_32, n ), x ) -#define mm256_bitset_16( x, n ) \ - _mm256_or_si256( _mm256_slli_epi16( mm256_one_16, n ), x ) -#define mm256_bitclr_16( x, n ) \ - _mm256_andnot_si256( _mm256_slli_epi16( mm256_one_16, n ), x ) - -// Return x with bit n toggled -#define mm256_bitflip_64( x, n ) \ - _mm256_xor_si128( _mm256_slli_epi64( mm256_one_64, n ), x ) -#define mm256_bitflip_32( x, n ) \ - _mm256_xor_si128( _mm256_slli_epi32( mm256_one_32, n ), x ) -#define mm256_bitflip_16( x, n ) \ - _mm256_xor_si128( _mm256_slli_epi16( mm256_one_16, n ), x ) - - -// -// Memory 
functions -// n = number of 256 bit (32 byte) vectors - -inline void memset_zero_256( __m256i *dst, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = mm256_zero; -} - -inline void memset_256( __m256i *dst, const __m256i a, int n ) -{ - for ( int i = 0; i < n; i++ ) dst[i] = a; -} - -inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) -{ - for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; -} - -// Compare data in memory, return true if different -inline bool memcmp_256( __m256i src1, __m256i src2, int n ) -{ - for ( int i = 0; i < n; i++ ) - if ( src1[i] != src2[i] ) return true; - return false; -} +#define mm256_negate_64( a ) _mm256_sub_epi64( m256_zero, a ) +#define mm256_negate_32( a ) _mm256_sub_epi32( m256_zero, a ) +#define mm256_negate_16( a ) _mm256_sub_epi16( m256_zero, a ) // // Pointer casting @@ -423,25 +660,268 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n ) // returns p[i] #define casti_m256i(p,i) (((__m256i*)(p))[(i)]) +// +// Memory functions +// n = number of 256 bit (32 byte) vectors + +static inline void memset_zero_256( __m256i *dst, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = m256_zero; } + +static inline void memset_256( __m256i *dst, const __m256i a, int n ) +{ for ( int i = 0; i < n; i++ ) dst[i] = a; } + +static inline void memcpy_256( __m256i *dst, const __m256i *src, int n ) +{ for ( int i = 0; i < n; i ++ ) dst[i] = src[i]; } + +// Compare data in memory, return true if different +static inline bool memcmp_256( __m256i src1, __m256i src2, int n ) +{ + for ( int i = 0; i < n; i++ ) + if ( src1[i] != src2[i] ) return true; + return false; +} + +// +// Mask conversion + +// converting bitmask to vector mask +// return vector with each element set to -1 if the corresponding +// bit in the bitmask is set and zero if the corresponding bit is clear. +// Can be used by blend +static inline __m256i mm256_mask_to_vmask_64( uint8_t m ) +{ return _mm256_set_epi64x( -( (m>>3) & 1 ), -( (m>>2) & 1 ), + -( (m>>1) & 1 ), -( m & 1 ) ); } + +static inline __m256i mm256_mask_to_vmask_32( uint8_t m ) +{ return _mm256_set_epi32( -( (m>>7) & 1 ), -( (m>>6) & 1 ), + -( (m>>5) & 1 ), -( (m>>4) & 1 ), + -( (m>>3) & 1 ), -( (m>>2) & 1 ), + -( (m>>1) & 1 ), -( m & 1 ) ); +} + +static inline __m256i mm256_mask_to_vmask_16( uint8_t m ) +{ return _mm256_set_epi16( -( (m>>15) & 1 ), -( (m>>14) & 1 ), + -( (m>>13) & 1 ), -( (m>>12) & 1 ), + -( (m>>11) & 1 ), -( (m>>10) & 1 ), + -( (m>> 9) & 1 ), -( (m>> 8) & 1 ), + -( (m>> 7) & 1 ), -( (m>> 6) & 1 ), + -( (m>> 5) & 1 ), -( (m>> 4) & 1 ), + -( (m>> 3) & 1 ), -( (m>> 2) & 1 ), + -( (m>> 1) & 1 ), -( m & 1 ) ); +} + +// converting immediate index to vector index, used by permute, shuffle, shift +// Return vector with each element set from the corresponding n bits in imm8 +// index i. 
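+// Illustrative usage sketch (editorial, not from the upstream source):
+// casti_m256i indexes any suitably aligned byte buffer as an array of
+// __m256i, so a 128 byte buffer can be cleared as shown.  The function name
+// is hypothetical.
+static inline void clear_128_bytes_sketch( void *buf )
+{
+   // equivalent per element: casti_m256i( buf, 0..3 ) = m256_zero;
+   memset_zero_256( (__m256i*)buf, 4 );        // four 32 byte vectors
+}
+// The mm256_index_to_vindex_* converters below mirror the 128 bit versions.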
+static inline __m256i mm256_index_to_vindex_64( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm256_set_epi64x( ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ); +} + +static inline __m256i mm256_index_to_vindex_32( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm256_set_epi32( ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ), + ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ), + ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ); +} + +static inline __m256i mm256_index_to_vindex_16( uint8_t i, uint8_t n ) +{ uint8_t mask = ( 2 << n ) - 1; + return _mm256_set_epi16( ( (i >> 15*n) & mask ), ( (i >> 14*n) & mask ), + ( (i >> 13*n) & mask ), ( (i >> 12*n) & mask ), + ( (i >> 11*n) & mask ), ( (i >> 10*n) & mask ), + ( (i >> 9*n) & mask ), ( (i >> 8*n) & mask ), + ( (i >> 7*n) & mask ), ( (i >> 6*n) & mask ), + ( (i >> 5*n) & mask ), ( (i >> 4*n) & mask ), + ( (i >> 3*n) & mask ), ( (i >> 2*n) & mask ), + ( (i >> n) & mask ), ( i & mask ) ); +} + +static inline uint8_t m256_vindex_to_imm8_64( __m256i v, uint8_t n ) +{ m256_v64 s = (m256_v64)v; + return ( s.u64[3] << 3*n ) | ( s.u64[2] << 2*n ) + | ( s.u64[1] << n ) | ( s.u64[0] ); +} + +static inline uint8_t mm256_vindex_to_imm8_32( __m256i v, uint8_t n ) +{ m256_v32 s = (m256_v32)v; + return ( s.u32[7] << 7*n ) | ( s.u32[6] << 6*n ) + | ( s.u32[5] << 5*n ) | ( s.u32[4] << 4*n ) + | ( s.u32[3] << 3*n ) | ( s.u32[2] << 2*n ) + | ( s.u32[1] << n ) | ( s.u32[0] ); +} + +static inline uint8_t mm256_vindex_to_imm8_16( __m256i v, uint8_t n ) +{ m256_v16 s = (m256_v16)v; + return ( s.u16[15] << 15*n ) | ( s.u16[14] << 14*n ) + | ( s.u16[13] << 13*n ) | ( s.u16[12] << 12*n ) + | ( s.u16[11] << 11*n ) | ( s.u16[10] << 10*n ) + | ( s.u16[ 9] << 9*n ) | ( s.u16[ 8] << 8*n ) + | ( s.u16[ 7] << 7*n ) | ( s.u16[ 6] << 6*n ) + | ( s.u16[ 5] << 5*n ) | ( s.u16[ 4] << 4*n ) + | ( s.u16[ 3] << 3*n ) | ( s.u16[ 2] << 2*n ) + | ( s.u16[ 1] << n ) | ( s.u16[ 0] ); +} + + +// +// Bit operations + +// Return a vector with bits [i..i+n] extracted and right justified from each +// element of v. +static inline __m256i mm256_bfextract_64( __m256i v, int i, int n ) +{ return _mm256_srli_epi64( _mm256_slli_epi64( v, 64 - i - n ), 64 - n ); } + +static inline __m256i mm256_bfextract_32( __m256i v, int i, int n ) +{ return _mm256_srli_epi32( _mm256_slli_epi32( v, 32 - i - n ), 32 - n ); } + +static inline __m256i mm256_bfextract_16( __m256i v, int i, int n ) +{ return _mm256_srli_epi16( _mm256_slli_epi16( v, 16 - i - n ), 16 - n ); } + +// Return v1 with bits [i..i+n] of each element replaced with the corresponding +// bits from a from v2. 
+static inline __m256i mm256_bfinsert_64( __m256i v, __m256i a, int i, int n ) +{ + return _mm256_or_si256( + _mm256_and_si256( v, + _mm256_srli_epi64( + _mm256_slli_epi64( m256_neg1, 64-n ), 64-i ) ), + _mm256_slli_epi64( a, i) ); +} + +static inline __m256i mm256_bfinsert_32( __m256i v, __m256i a, int i, int n ) +{ + return _mm256_or_si256( + _mm256_and_si256( v, + _mm256_srli_epi32( + _mm256_slli_epi32( m256_neg1, 32-n ), 32-i ) ), + _mm256_slli_epi32( a, i) ); +} + +static inline __m256i mm256_bfinsert_16( __m256i v, __m256i a, int i, int n ) +{ + return _mm256_or_si256( + _mm256_and_si256( v, + _mm256_srli_epi16( + _mm256_slli_epi16( m256_neg1, 16-n ), 16-i ) ), + _mm256_slli_epi16( a, i) ); +} + + +// return bit n in position, all other bits cleared +#define mm256_bitextract_64 ( x, n ) \ + _mm256_and_si256( _mm256_slli_epi64( m256_one_64, n ), x ) +#define mm256_bitextract_32 ( x, n ) \ + _mm256_and_si256( _mm256_slli_epi32( m256_one_32, n ), x ) +#define mm256_bitextract_16 ( x, n ) \ + _mm256_and_si256( _mm256_slli_epi16( m256_one_16, n ), x ) + +// Return bit n as bool (bit 0) +#define mm256_bittest_64( x, n ) \ + _mm256_and_si256( m256_one_64, _mm256_srli_epi64( x, n ) ) +#define mm256_bittest_32( x, n ) \ + _mm256_and_si256( m256_one_32, _mm256_srli_epi32( x, n ) ) +#define mm256_bittest_16( x, n ) \ + _mm256_and_si256( m256_one_16, _mm256_srli_epi16( x, n ) ) + +// Return x with bit n set/cleared in all elements +#define mm256_bitset_64( x, n ) \ + _mm256_or_si256( _mm256_slli_epi64( m256_one_64, n ), x ) +#define mm256_bitclr_64( x, n ) \ + _mm256_andnot_si256( _mm256_slli_epi64( m256_one_64, n ), x ) +#define mm256_bitset_32( x, n ) \ + _mm256_or_si256( _mm256_slli_epi32( m256_one_32, n ), x ) +#define mm256_bitclr_32( x, n ) \ + _mm256_andnot_si256( _mm256_slli_epi32( m256_one_32, n ), x ) +#define mm256_bitset_16( x, n ) \ + _mm256_or_si256( _mm256_slli_epi16( m256_one_16, n ), x ) +#define mm256_bitclr_16( x, n ) \ + _mm256_andnot_si256( _mm256_slli_epi16( m256_one_16, n ), x ) + +// Return x with bit n toggled +#define mm256_bitflip_64( x, n ) \ + _mm256_xor_si256( _mm256_slli_epi64( m256_one_64, n ), x ) +#define mm256_bitflip_32( x, n ) \ + _mm256_xor_si256( _mm256_slli_epi32( m256_one_32, n ), x ) +#define mm256_bitflip_16( x, n ) \ + _mm256_xor_si256( _mm256_slli_epi16( m256_one_16, n ), x ) + // // Bit rotations // -// Rotate bits in vector elements -// w = packed data, c = number of bits to rotate +// Rotate each element of v by c bits +static inline __m256i mm256_rotr_64( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_srli_epi64( v, c ), + _mm256_slli_epi64( v, 64-(c) ) ); +} -#define mm256_rotr_64( w, c ) \ - _mm256_or_si256( _mm256_srli_epi64(w, c), _mm256_slli_epi64(w, 64-(c)) ) -#define mm256_rotl_64( w, c ) \ - _mm256_or_si256( _mm256_slli_epi64(w, c), _mm256_srli_epi64(w, 64-(c)) ) -#define mm256_rotr_32( w, c ) \ - _mm256_or_si256( _mm256_srli_epi32(w, c), _mm256_slli_epi32(w, 32-(c)) ) -#define mm256_rotl_32( w, c ) \ - _mm256_or_si256( _mm256_slli_epi32(w, c), _mm256_srli_epi32(w, 32-(c)) ) -#define mm256_rotr_16( w, c ) \ - _mm256_or_si256( _mm256_srli_epi16(w, c), _mm256_slli_epi16(w, 32-(c)) ) -#define mm256_rotl_16( w, c ) \ - _mm256_or_si256( _mm256_slli_epi16(w, c), _mm256_srli_epi16(w, 32-(c)) ) +static inline __m256i mm256_rotl_64( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_slli_epi64( v, c ), + _mm256_srli_epi64( v, 64-(c) ) ); +} + +static inline __m256i mm256_rotr_32( __m256i v, int c ) +{ + return _mm256_or_si256( 
_mm256_srli_epi32( v, c ), + _mm256_slli_epi32( v, 32-(c) ) ); +} + +static inline __m256i mm256_rotl_32( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_slli_epi32( v, c ), + _mm256_srli_epi32( v, 32-(c) ) ); +} + +static inline __m256i mm256_rotr_16( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_srli_epi16(v, c), + _mm256_slli_epi16(v, 32-(c)) ); +} + +static inline __m256i mm256_rotl_16( __m256i v, int c ) +{ + return _mm256_or_si256( _mm256_slli_epi16(v, c), + _mm256_srli_epi16(v, 32-(c)) ); +} + +// Rotate bits in each element of v by amount in corresponding element of +// index vector c +static inline __m256i mm256_rotrv_64( __m256i v, __m256i c ) +{ + return _mm256_or_si256( + _mm256_srlv_epi64( v, c ), + _mm256_sllv_epi64( v, + _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) ); +} + +static inline __m256i mm256_rotlv_64( __m256i v, __m256i c ) +{ + return _mm256_or_si256( + _mm256_sllv_epi64( v, c ), + _mm256_srlv_epi64( v, + _mm256_sub_epi64( _mm256_set1_epi64x(64), c ) ) ); +} + +static inline __m256i mm256_rotrv_32( __m256i v, __m256i c ) +{ + return _mm256_or_si256( + _mm256_srlv_epi32( v, c ), + _mm256_sllv_epi32( v, + _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) ); +} + +static inline __m256i mm256_rotlv_32( __m256i v, __m256i c ) +{ + return _mm256_or_si256( + _mm256_sllv_epi32( v, c ), + _mm256_srlv_epi32( v, + _mm256_sub_epi32( _mm256_set1_epi32(32), c ) ) ); +} // // Rotate elements in vector @@ -449,126 +929,139 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n ) // shift, a little more work is needed. // Optimized 64 bit permutations -// Swap 128 bit elements in 256 bit vector -#define mm256_swap_128( w ) _mm256_permute4x64_epi64( w, 0x4e ) +// Swap 128 bit elements in v +#define mm256_swap_128( v ) _mm256_permute4x64_epi64( v, 0x4e ) -// Rotate 256 bit vector by one 64 bit element -#define mm256_rotl256_1x64( w ) _mm256_permute4x64_epi64( w, 0x93 ) -#define mm256_rotr256_1x64( w ) _mm256_permute4x64_epi64( w, 0x39 ) +// Rotate v by one 64 bit element +#define mm256_rotl256_1x64( v ) _mm256_permute4x64_epi64( v, 0x93 ) +#define mm256_rotr256_1x64( v ) _mm256_permute4x64_epi64( v, 0x39 ) -// Swap 64 bits in each 128 bit element of 256 bit vector -#define mm256_swap128_64( x ) _mm256_shuffle_epi32( x, 0x4e ) +// Swap 64 bit elements in each 128 bit lane of v +#define mm256_swap128_64( v ) _mm256_shuffle_epi32( v, 0x4e ) -// Rotate 128 bit elements in 256 bit vector by 32 bits -#define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 ) -#define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 ) +// Rotate each 128 bit lane in v by one 32 bit element +#define mm256_rotr128_1x32( v ) _mm256_shuffle_epi32( v, 0x39 ) +#define mm256_rotl128_1x32( v ) _mm256_shuffle_epi32( v, 0x93 ) -// Swap 32 bits in each 64 bit element of 256 bit vector -#define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 ) +// Swap 32 bit elements in each 64 bit lane of v +#define mm256_swap64_32( v ) _mm256_shuffle_epi32( v, 0xb1 ) // Less efficient but more versatile. Use only for rotations that are not // integrals of 64 bits. Use permutations above when possible. -// Rotate 256 bit vector by c bytes. -#define mm256_rotr256_x8( w, c ) \ - _mm256_or_si256( _mm256_srli_si256( w, c ), \ - mm256_swap_128( _mm256i_slli_si256( w, 32-(c) ) ) ) -#define mm256_rotl256_x8( w, c ) \ - _mm256_or_si256( _mm256_slli_si256( w, c ), \ - mm256_swap_128( _mm256i_srli_si256( w, 32-(c) ) ) ) +// Rotate 256 bit vector v by c bytes. 
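+// Illustrative usage sketch (editorial, not from the upstream source):
+// mm256_rotrv_64 rotates each 64 bit lane by its own count taken from the
+// index vector, e.g. four different per-lane rotations in one call:
+//
+//    r = mm256_rotrv_64( v, _mm256_set_epi64x( 14, 18, 39, 41 ) );
+//    // lane 0 rotated right by 41, lane 1 by 39, lane 2 by 18, lane 3 by 14
+//
+// mm256_brotr_256 below covers whole-vector rotation by a byte count that
+// is not a multiple of the element size.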
+static inline __m256i mm256_brotr_256( __m256i v, int c ) +{ return _mm256_or_si256( _mm256_bsrli_epi128( v, c ), + mm256_swap_128( _mm256_bslli_epi128( v, 16-(c) ) ) ); +} -// Rotate 256 bit vector by c elements, use only for odd value rotations -#define mm256_rotr256_x32( w, c ) mm256_rotr256_x8( w, (c)>>2 ) -#define mm256_rotl256_x32( w, c ) mm256_rotl256_x8( w, (c)>>2 ) -#define mm256_rotr256_x16( w, c ) mm256_rotr256_x8( w, (c)>>1 ) -#define mm256_rotl256_x16( w, c ) mm256_rotl256_x8( w, (c)>>1 ) +static inline __m256i mm256_brotl_256( __m256i v, int c ) +{ return _mm256_or_si256( _mm256_bslli_epi128( v, c ), + mm256_swap_128( _mm256_bsrli_epi128( v, 16-(c) ) ) ); +} + +// Rotate each 128 bit lane in v by c bytes +static inline __m256i mm256_brotr_128( __m256i v, int c ) +{ return _mm256_or_si256( _mm256_bsrli_epi128( v, c ), + _mm256_bslli_epi128( v, 16 - (c) ) ); +} + +static inline __m256i mm256_brotl_128( __m256i v, int c ) +{ return _mm256_or_si256( _mm256_bslli_epi128( v, c ), + _mm256_bsrli_epi128( v, 16 - (c) ) ); +} + +// Rotate 256 bit vector v by c elements, use only for odd value rotations +#define mm256_rotr256_x32( v, c ) mm256_rotr256_x8( v, (c)>>2 ) +#define mm256_rotl256_x32( v, c ) mm256_rotl256_x8( v, (c)>>2 ) +#define mm256_rotr256_x16( v, c ) mm256_rotr256_x8( v, (c)>>1 ) +#define mm256_rotl256_x16( v, c ) mm256_rotl256_x8( v, (c)>>1 ) // // Rotate two 256 bit vectors as one 512 bit vector // Fast but limited to 128 bit granularity -#define mm256_swap512_256(a, b) _mm256_permute2x128_si256( a, b, 0x4e ) -#define mm256_rotr512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x39 ) -#define mm256_rotl512_1x128(a, b) _mm256_permute2x128_si256( a, b, 0x93 ) +#define mm256_swap512_256(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x4e ) +#define mm256_rotr512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x39 ) +#define mm256_rotl512_1x128(v1, v2) _mm256_permute2x128_si256( v1, v2, 0x93 ) // Much slower, for 64 and 32 bit granularity -#define mm256_rotr512_1x64(a, b) \ +#define mm256_rotr512_1x64(v1, v2) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_srli_si256(a,8), _mm256_slli_si256(b,24) ); \ - b = _mm256_or_si256( _mm256_srli_si256(b,8), _mm256_slli_si256(a,24) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_srli_si256(v1,8), _mm256_slli_si256(v2,24) ); \ + v2 = _mm256_or_si256( _mm256_srli_si256(v2,8), _mm256_slli_si256(v1,24) ); \ + v1 = t; \ while (0); -#define mm256_rotl512_1x64(a, b) \ +#define mm256_rotl512_1x64(v1, v2) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_slli_si256(a,8), _mm256_srli_si256(b,24) ); \ - b = _mm256_or_si256( _mm256_slli_si256(b,8), _mm256_srli_si256(a,24) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_slli_si256(v1,8), _mm256_srli_si256(v2,24) ); \ + v2 = _mm256_or_si256( _mm256_slli_si256(v2,8), _mm256_srli_si256(v1,24) ); \ + v1 = t; \ while (0); -#define mm256_rotr512_1x32(a, b) \ +#define mm256_rotr512_1x32(v1, v2) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_srli_si256(a,4), _mm256_slli_si256(b,28) ); \ - b = _mm256_or_si256( _mm256_srli_si256(b,4), _mm256_slli_si256(a,28) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_srli_si256(v1,4), _mm256_slli_si256(v2,28) ); \ + v2 = _mm256_or_si256( _mm256_srli_si256(v2,4), _mm256_slli_si256(v1,28) ); \ + v1 = t; \ while (0); -#define mm256_rotl512_1x32(a, b) \ +#define mm256_rotl512_1x32(v1, v2) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_slli_si256(a,4), _mm256_srli_si256(b,28) ); \ - b = _mm256_or_si256( _mm256_slli_si256(b,4), _mm256_srli_si256(a,28) ); \ - a = t; \ 
+ t = _mm256_or_si256( _mm256_slli_si256(v1,4), _mm256_srli_si256(v2,28) ); \ + v2 = _mm256_or_si256( _mm256_slli_si256(v2,4), _mm256_srli_si256(v1,28) ); \ + v1 = t; \ while (0); // Byte granularity but even a bit slower -#define mm256_rotr512_x8( a, b, n ) \ +#define mm256_rotr512_x8( v1, v2, c ) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_srli_epi64( a, n ), \ - _mm256_slli_epi64( b, ( 32 - (n) ) ) ); \ - b = _mm256_or_si256( _mm256_srli_epi64( b, n ), \ - _mm256_slli_epi64( a, ( 32 - (n) ) ) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_srli_epi64( v1, c ), \ + _mm256_slli_epi64( v2, ( 32 - (c) ) ) ); \ + v2 = _mm256_or_si256( _mm256_srli_epi64( v2, c ), \ + _mm256_slli_epi64( v1, ( 32 - (c) ) ) ); \ + v1 = t; \ while (0); -#define mm256_rotl512_x8( a, b, n ) \ +#define mm256_rotl512_x8( v1, v2, c ) \ do { \ __m256i t; \ - t = _mm256_or_si256( _mm256_slli_epi64( a, n ), \ - _mm256_srli_epi64( b, ( 32 - (n) ) ) ); \ - b = _mm256_or_si256( _mm256_slli_epi64( b, n ), \ - _mm256_srli_epi64( a, ( 32 - (n) ) ) ); \ - a = t; \ + t = _mm256_or_si256( _mm256_slli_epi64( v1, c ), \ + _mm256_srli_epi64( v2, ( 32 - (c) ) ) ); \ + v2 = _mm256_or_si256( _mm256_slli_epi64( v2, c ), \ + _mm256_srli_epi64( v1, ( 32 - (c) ) ) ); \ + v2 = t; \ while (0); // // Swap bytes in vector elements - -inline __m256i mm256_byteswap_64( __m256i x ) +static inline __m256i mm256_bswap_64( __m256i v ) { - return _mm256_shuffle_epi8( x, _mm256_set_epi8( + return _mm256_shuffle_epi8( v, _mm256_set_epi8( 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 ) ); } -inline __m256i mm256_byteswap_32( __m256i x ) +static inline __m256i mm256_bswap_32( __m256i v ) { - return _mm256_shuffle_epi8( x, _mm256_set_epi8( + return _mm256_shuffle_epi8( v, _mm256_set_epi8( 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03 ) ); } -inline __m256i mm256_byteswap_16( __m256i x ) +static inline __m256i mm256_bswap_16( __m256i v ) { - return _mm256_shuffle_epi8( x, _mm256_set_epi8( + return _mm256_shuffle_epi8( v, _mm256_set_epi8( 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, 0x06, 0x07, 0x04, 0x05, 0x02, 0x03, 0x00, 0x01, 0x0e, 0x0f, 0x0c, 0x0d, 0x0a, 0x0b, 0x08, 0x09, @@ -580,7 +1073,7 @@ inline __m256i mm256_byteswap_16( __m256i x ) // usefulness tbd // __m128i hi, __m128i lo, returns __m256i #define mm256_pack_2x128( hi, lo ) \ - _mm256_inserti128_si256( _mm256_castsi128_si256( lo ), hi, 1 ) \ + _mm256_inserti128_si256( _mm256_castsi128_si256( hi ), lo, 0 ) \ // __m128i hi, __m128i lo, __m256i src #define mm256_unpack_2x128( hi, lo, src ) \ @@ -606,8 +1099,8 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x ) __m128i hi, lo; mm256_unpack_2x128( hi, lo, x ); - lo = _mm_aesenc_si128( lo, mm_zero ); - hi = _mm_aesenc_si128( hi, mm_zero ); + lo = _mm_aesenc_si128( lo, m128_zero ); + hi = _mm_aesenc_si128( hi, m128_zero ); return mm256_pack_2x128( hi, lo ); } @@ -642,8 +1135,8 @@ inline __m256i mm256_aesenc_nokey_2x128( __m256i x ) // interleave 4 arrays of 32 bit elements for 128 bit processing // bit_len must be 256, 512 or 640 bits. 
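+// Editorial sketch of the typical call sequence, not from the upstream
+// source (names hdr0..hdr3, vdata, vhash and h0..h3 are hypothetical): four
+// 80 byte block headers are interleaved, hashed four ways, then the results
+// are deinterleaved so each lane can be tested on its own:
+//
+//    mm_interleave_4x32( vdata, hdr0, hdr1, hdr2, hdr3, 640 );
+//    // ... 4 way hash of vdata into vhash ...
+//    mm_deinterleave_4x32( h0, h1, h2, h3, vhash, 256 );
+//
+// mm_interleave_4x32 itself follows.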
-inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, - const void *src2, const void *src3, int bit_len ) +static inline void mm_interleave_4x32( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, int bit_len ) { uint32_t *s0 = (uint32_t*)src0; uint32_t *s1 = (uint32_t*)src1; @@ -697,8 +1190,8 @@ inline void mm_interleave_4x32( void *dst, const void *src0, const void *src1, } // bit_len must be multiple of 32 -inline void mm_interleave_4x32x( void *dst, void *src0, void *src1, - void *src2, void *src3, int bit_len ) +static inline void mm_interleave_4x32x( void *dst, void *src0, void *src1, + void *src2, void *src3, int bit_len ) { uint32_t *d = (uint32_t*)dst; uint32_t *s0 = (uint32_t*)src0; @@ -715,8 +1208,8 @@ inline void mm_interleave_4x32x( void *dst, void *src0, void *src1, } } -inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) +static inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, int bit_len ) { uint32_t *s = (uint32_t*)src; __m128i* d0 = (__m128i*)dst0; @@ -774,8 +1267,8 @@ inline void mm_deinterleave_4x32( void *dst0, void *dst1, void *dst2, // deinterleave 4 arrays into individual buffers for scalarm processing // bit_len must be multiple of 32 -inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, - void *dst3, const void *src, int bit_len ) +static inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, + void *dst3, const void *src, int bit_len ) { uint32_t *s = (uint32_t*)src; uint32_t *d0 = (uint32_t*)dst0; @@ -796,7 +1289,7 @@ inline void mm_deinterleave_4x32x( void *dst0, void *dst1, void *dst2, // Interleave 4 source buffers containing 64 bit data into the destination // buffer. Only bit_len 256, 512, 640 & 1024 are supported. -inline void mm256_interleave_4x64( void *dst, const void *src0, +static inline void mm256_interleave_4x64( void *dst, const void *src0, const void *src1, const void *src2, const void *src3, int bit_len ) { __m256i* d = (__m256i*)dst; @@ -836,7 +1329,7 @@ inline void mm256_interleave_4x64( void *dst, const void *src0, // Slower version // bit_len must be multiple of 64 -inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1, +static inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1, void *src2, void *src3, int bit_len ) { uint64_t *d = (uint64_t*)dst; @@ -857,7 +1350,7 @@ inline void mm256_interleave_4x64x( void *dst, void *src0, void *src1, // Deinterleave 4 buffers of 64 bit data from the source buffer. // bit_len must be 256, 512, 640 or 1024 bits. // Requires overrun padding for 640 bit len. 
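+// Editorial note: bit_len 640 is the 80 byte block header case, and the
+// buffers in this hedged sketch (hypothetical names) are simply
+// over-allocated to a full multiple of 256 bits to provide the overrun
+// padding noted above:
+//
+//    uint64_t hdr0[16], hdr1[16], hdr2[16], hdr3[16];        // 128 bytes each
+//    uint64_t vdata[4*16] __attribute__((aligned(32)));
+//    mm256_interleave_4x64( vdata, hdr0, hdr1, hdr2, hdr3, 640 );
+//
+// mm256_deinterleave_4x64 below reverses the lane layout.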
-inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, +static inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, void *dst3, const void *src, int bit_len ) { __m256i* d0 = (__m256i*)dst0; @@ -904,8 +1397,8 @@ inline void mm256_deinterleave_4x64( void *dst0, void *dst1, void *dst2, // Slower version // bit_len must be multiple 0f 64 -inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, void *dst2, - void *dst3, void *src, int bit_len ) +static inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, + void *dst2, void *dst3, void *src, int bit_len ) { uint64_t *s = (uint64_t*)src; uint64_t *d0 = (uint64_t*)dst0; @@ -924,9 +1417,9 @@ inline void mm256_deinterleave_4x64x( void *dst0, void *dst1, void *dst2, // Interleave 8 source buffers containing 32 bit data into the destination // vector -inline void mm256_interleave_8x32( void *dst, const void *src0, - const void *src1, const void *src2, const void *src3, const void *src4, - const void *src5, const void *src6, const void *src7, int bit_len ) +static inline void mm256_interleave_8x32( void *dst, const void *src0, + const void *src1, const void *src2, const void *src3, const void *src4, + const void *src5, const void *src6, const void *src7, int bit_len ) { uint32_t *s0 = (uint32_t*)src0; uint32_t *s1 = (uint32_t*)src1; @@ -989,9 +1482,9 @@ inline void mm256_interleave_8x32( void *dst, const void *src0, // probably obsolete with double pack 2x32->64, 4x64->256. // Slower but it works with 32 bit data // bit_len must be multiple of 32 -inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, - uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4, - uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len ) +static inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, + uint32_t *src1, uint32_t *src2, uint32_t *src3, uint32_t *src4, + uint32_t *src5, uint32_t *src6, uint32_t *src7, int bit_len ) { uint32_t *d = dst;; for ( int i = 0; i < bit_len>>5; i++, d += 8 ) @@ -1008,7 +1501,7 @@ inline void mm256_interleave_8x32x( uint32_t *dst, uint32_t *src0, } // Deinterleave 8 buffers of 32 bit data from the source buffer. 
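+// Editorial usage sketch (hypothetical names): after an 8 way 32 bit hash,
+// the interleaved 256 bit digests are split back out so each lane's result
+// can be checked independently:
+//
+//    mm256_deinterleave_8x32( h0, h1, h2, h3, h4, h5, h6, h7, vhash, 256 );
+//
+// The deinterleave itself follows.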
-inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, +static inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, void *dst3, void *dst4, void *dst5, void *dst6, void *dst7, const void *src, int bit_len ) { @@ -1091,7 +1584,7 @@ inline void mm256_deinterleave_8x32( void *dst0, void *dst1, void *dst2, // Deinterleave 8 arrays into indivdual buffers for scalar processing // bit_len must be multiple of 32 -inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1, +static inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1, uint32_t *dst2,uint32_t *dst3, uint32_t *dst4, uint32_t *dst5, uint32_t *dst6,uint32_t *dst7,uint32_t *src, int bit_len ) { @@ -1110,7 +1603,7 @@ inline void mm256_deinterleave_8x32x( uint32_t *dst0, uint32_t *dst1, } // Can't do it in place -inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len ) +static inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len ) { __m256i* d = (__m256i*)dst; uint32_t *s = (uint32_t*)src; @@ -1148,8 +1641,8 @@ inline void mm256_reinterleave_4x64( void *dst, void *src, int bit_len ) // convert 4x32 byte (128 bit) vectors to 4x64 (256 bit) vectors for AVX2 // bit_len must be multiple of 64 // broken -inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src, - int bit_len ) +static inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src, + int bit_len ) { uint32_t *d = (uint32_t*)dst; uint32_t *s = (uint32_t*)src; @@ -1168,7 +1661,7 @@ inline void mm256_reinterleave_4x64x( uint64_t *dst, uint32_t *src, // convert 4x64 byte (256 bit) vectors to 4x32 (128 bit) vectors for AVX // bit_len must be multiple of 64 -inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len ) +static inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len ) { __m256i *d = (__m256i*)dst; uint32_t *s = (uint32_t*)src; @@ -1202,8 +1695,70 @@ inline void mm256_reinterleave_4x32( void *dst, void *src, int bit_len ) // bit_len == 1024 } +static inline void mm256_interleave_2x128( void *dst, void *src0, void *src1, + int bit_len ) +{ + __m256i *d = (__m256i*)dst; + uint64_t *s0 = (uint64_t*)src0; + uint64_t *s1 = (uint64_t*)src1; + + d[0] = _mm256_set_epi64x( s1[ 1], s1[ 0], s0[ 1], s0[ 0] ); + d[1] = _mm256_set_epi64x( s1[ 3], s1[ 2], s0[ 3], s0[ 2] ); + + if ( bit_len <= 256 ) return; + + d[2] = _mm256_set_epi64x( s1[ 5], s1[ 4], s0[ 5], s0[ 4] ); + d[3] = _mm256_set_epi64x( s1[ 7], s1[ 6], s0[ 7], s0[ 6] ); + + if ( bit_len <= 512 ) return; + + d[4] = _mm256_set_epi64x( s1[ 9], s1[ 8], s0[ 9], s0[ 8] ); + + if ( bit_len <= 640 ) return; + + d[5] = _mm256_set_epi64x( s1[11], s1[10], s0[11], s0[10] ); + + d[6] = _mm256_set_epi64x( s1[13], s1[12], s0[13], s0[12] ); + d[7] = _mm256_set_epi64x( s1[15], s1[14], s0[15], s0[14] ); + + // bit_len == 1024 +} + +static inline void mm256_deinterleave_2x128( void *dst0, void *dst1, void *src, + int bit_len ) +{ + uint64_t *s = (uint64_t*)src; + __m256i *d0 = (__m256i*)dst0; + __m256i *d1 = (__m256i*)dst1; + + d0[0] = _mm256_set_epi64x( s[ 5], s[4], s[ 1], s[ 0] ); + d1[0] = _mm256_set_epi64x( s[ 7], s[6], s[ 3], s[ 2] ); + + if ( bit_len <= 256 ) return; + + d0[1] = _mm256_set_epi64x( s[13], s[12], s[ 9], s[ 8] ); + d1[1] = _mm256_set_epi64x( s[15], s[14], s[11], s[10] ); + + if ( bit_len <= 512 ) return; + + if ( bit_len <= 640 ) + { + d0[2] = _mm256_set_epi64x( d0[2][3], d0[2][2], s[17], s[16] ); + d1[2] = _mm256_set_epi64x( d1[2][3], d1[2][2], s[19], s[18] ); + return; + } + + d0[2] = 
_mm256_set_epi64x( s[21], s[20], s[17], s[16] ); + d1[2] = _mm256_set_epi64x( s[23], s[22], s[19], s[18] ); + + d0[3] = _mm256_set_epi64x( s[29], s[28], s[25], s[24] ); + d1[3] = _mm256_set_epi64x( s[31], s[30], s[27], s[26] ); + + // bit_len == 1024 +} + // not used -inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len ) +static inline void mm_reinterleave_4x32( void *dst, void *src, int bit_len ) { uint32_t *d = (uint32_t*)dst; uint32_t *s = (uint32_t*)src; diff --git a/build-allarch.sh b/build-allarch.sh index eb1c16e..84d31a9 100755 --- a/build-allarch.sh +++ b/build-allarch.sh @@ -3,16 +3,6 @@ make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure --with-curl -make -j 4 -strip -s cpuminer.exe -mv cpuminer.exe cpuminer-4way.exe -strip -s cpuminer -mv cpuminer cpuminer-4way - -make clean -rm -f config.status -./autogen.sh || echo done CFLAGS="-O3 -march=core-avx2 -Wall" ./configure --with-curl make -j 4 strip -s cpuminer.exe diff --git a/build.sh b/build.sh index bf713ea..d5f111e 100755 --- a/build.sh +++ b/build.sh @@ -18,8 +18,8 @@ rm -f config.status # Debian 7.7 / Ubuntu 14.04 (gcc 4.7+) #extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores" -#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr -CFLAGS="-O3 -march=native -Wall" ./configure --with-curl +CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr +#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl make -j 4 diff --git a/configure b/configure index 95ac974..03c03be 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.0.1. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.8.0.1' -PACKAGE_STRING='cpuminer-opt 3.8.0.1' +PACKAGE_VERSION='3.8.1' +PACKAGE_STRING='cpuminer-opt 3.8.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.8.0.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.8.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1392,7 +1392,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.8.0.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.8.1:";; esac cat <<\_ACEOF @@ -1497,7 +1497,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.8.0.1 +cpuminer-opt configure 3.8.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. 
-It was created by cpuminer-opt $as_me 3.8.0.1, which was +It was created by cpuminer-opt $as_me 3.8.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2981,7 +2981,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.8.0.1' + VERSION='3.8.1' cat >>confdefs.h <<_ACEOF @@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.8.0.1, which was +This file was extended by cpuminer-opt $as_me 3.8.1, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6743,7 +6743,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.8.0.1 +cpuminer-opt config.status 3.8.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index b8981e6..de28f8d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.8.0.1]) +AC_INIT([cpuminer-opt], [3.8.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 01c825e..3665b25 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -3238,10 +3238,10 @@ int main(int argc, char *argv[]) } } -//#ifdef HAVE_SYSLOG_H -// if (use_syslog) -// openlog("cpuminer", LOG_PID, LOG_USER); -//#endif +#ifdef HAVE_SYSLOG_H + if (use_syslog) + openlog("cpuminer", LOG_PID, LOG_USER); +#endif work_restart = (struct work_restart*) calloc(opt_n_threads, sizeof(*work_restart)); if (!work_restart) diff --git a/miner.h b/miner.h index 625772f..64d005b 100644 --- a/miner.h +++ b/miner.h @@ -80,10 +80,10 @@ void *alloca (size_t); # endif //#endif -//#ifdef HAVE_SYSLOG_H -//#include -//#define LOG_BLUE 0x10 /* unique value */ -//#else +#ifdef HAVE_SYSLOG_H +#include +#define LOG_BLUE 0x10 /* unique value */ +#else enum { LOG_ERR, LOG_WARNING, @@ -93,7 +93,7 @@ enum { /* custom notices */ LOG_BLUE = 0x10, }; -//#endif +#endif static inline bool is_windows(void) {